From 36750a576371758d6777e8788b0a71fb4d47549e Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Wed, 22 Mar 2023 08:05:48 -0700
Subject: [PATCH 01/71] Get rid of XDL parameters in WMMA kernel string. (#646)

* remove XDL parameters from WMMA kernel string

* get rid of two more parameters
---
 ...vice_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp
index a1ee1902b..9d4b68c0b 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp
@@ -840,17 +840,8 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
             << KPerBlock << ", "
             << getConvForwardSpecializationString(ConvForwardSpecialization) << ", "
             << K1 << ", "
-            << MPerXDL << ", "
-            << NPerXDL << ", "
-            << MXdlPerWave << ", "
-            << NXdlPerWave << ", "
             << ABlockTransferSrcScalarPerVector << ", "
-            << ABlockTransferDstScalarPerVector_K1 << ", "
-            << BBlockTransferSrcScalarPerVector << ", "
-            << BBlockTransferDstScalarPerVector_K1 << ", "
-            << CShuffleMXdlPerWavePerShuffle << ", "
-            << CShuffleNXdlPerWavePerShuffle << ", "
-            << CBlockTransferScalarPerVector_NWaveNPerXdl
+            << BBlockTransferSrcScalarPerVector
             << ">";
         // clang-format on
 
-- 
GitLab


From fe96e8fbf2503e14574e429dde61546051f3e4dc Mon Sep 17 00:00:00 2001
From: Po Yen Chen <PoYen.Chen@amd.com>
Date: Thu, 23 Mar 2023 01:49:11 +0800
Subject: [PATCH 02/71] Reduce group & batch of the tested convolutions (#648)

---
 .../grouped_convnd_bwd_weight.cpp             | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/test/grouped_convnd_bwd_weight/grouped_convnd_bwd_weight.cpp b/test/grouped_convnd_bwd_weight/grouped_convnd_bwd_weight.cpp
index 6545b6e56..75f934cc0 100644
--- a/test/grouped_convnd_bwd_weight/grouped_convnd_bwd_weight.cpp
+++ b/test/grouped_convnd_bwd_weight/grouped_convnd_bwd_weight.cpp
@@ -43,7 +43,7 @@ class TestGroupedConvndBwdWeight : public ::testing::Test
                 DataType,
                 DataType,
                 DataType>(true,  // do_verification
-                          1,     // init_method integer value
+                          1,     // init_method: integer value
                           false, // do_log
                           false, // time_kernel
                           param,
@@ -60,9 +60,9 @@ TYPED_TEST_SUITE(TestGroupedConvndBwdWeight, KernelTypes);
 TYPED_TEST(TestGroupedConvndBwdWeight, Test1D)
 {
     this->conv_params.clear();
-    this->conv_params.push_back({1, 4, 128, 128, 256, {1}, {14}, {2}, {1}, {0}, {0}});
-    this->conv_params.push_back({1, 4, 64, 128, 256, {3}, {28}, {1}, {1}, {1}, {1}});
-    this->conv_params.push_back({1, 4, 128, 128, 256, {1}, {3}, {1}, {1}, {0}, {0}});
+    this->conv_params.push_back({1, 2, 128, 128, 256, {1}, {14}, {2}, {1}, {0}, {0}});
+    this->conv_params.push_back({1, 2, 32, 128, 256, {3}, {28}, {1}, {1}, {1}, {1}});
+    this->conv_params.push_back({1, 2, 128, 128, 256, {1}, {3}, {1}, {1}, {0}, {0}});
     this->template Run<1>();
 }
 
@@ -70,11 +70,11 @@ TYPED_TEST(TestGroupedConvndBwdWeight, Test2D)
 {
     this->conv_params.clear();
     this->conv_params.push_back(
-        {2, 4, 128, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}});
+        {2, 2, 64, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}});
     this->conv_params.push_back(
-        {2, 4, 8, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
+        {2, 2, 4, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
     this->conv_params.push_back(
-        {2, 4, 128, 128, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}});
+        {2, 2, 128, 128, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}});
     this->template Run<2>();
 }
 
@@ -82,10 +82,10 @@ TYPED_TEST(TestGroupedConvndBwdWeight, Test3D)
 {
     this->conv_params.clear();
     this->conv_params.push_back(
-        {3, 4, 128, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
+        {3, 2, 16, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
     this->conv_params.push_back(
-        {3, 4, 8, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
+        {3, 2, 2, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
     this->conv_params.push_back(
-        {3, 4, 128, 128, 256, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
+        {3, 2, 32, 128, 256, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
     this->template Run<3>();
 }
-- 
GitLab


From e5376be4acc6fb8554c5ff5430b8f2750bc939c9 Mon Sep 17 00:00:00 2001
From: Haocong WANG <haocwang@amd.com>
Date: Fri, 24 Mar 2023 00:22:10 +0800
Subject: [PATCH 03/71] [Navi3x] Fix Gridwise_multiple_d operation (#649)

* Add CMake Option "USE_OPT_NAVI3X"

* fix bug: account for the C shuffle block when computing the LDS size (max of A+B and C)
---
 ...n_grouped_conv_fwd_bias_relu_add_wmma_example.inc |  4 ++--
 .../grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp  | 12 ++++++++++--
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc
index 8161b1088..a6888649c 100644
--- a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc
+++ b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc
@@ -74,8 +74,8 @@ using DeviceConvFwdInstance =
         8,           // BBlockTransferSrcScalarPerVector
         8,           // BBlockTransferDstScalarPerVector_BK1
         true,        // BBlockLdsExtraN
-        1,
-        1,
+        4,
+        2,
         S<1, 32, 1, 8>,
         8>;
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp
index 1e8f8ff9f..38edace19 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp
@@ -431,6 +431,9 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle
         constexpr auto b_block_desc_k0perblock_nperblock_k1 =
             GetBBlockDescriptor_K0PerBlock_NPerBlock_K1();
 
+        constexpr auto cshuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat =
+            GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat();
+
         constexpr auto max_lds_align = K1;
 
         constexpr auto a_block_space_size_aligned = math::integer_least_multiple(
@@ -439,8 +442,13 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle
         constexpr auto b_block_space_size_aligned = math::integer_least_multiple(
             b_block_desc_k0perblock_nperblock_k1.GetElementSpaceSize(), max_lds_align);
 
-        return (a_block_space_size_aligned * sizeof(ADataType) +
-                b_block_space_size_aligned * sizeof(BDataType));
+        constexpr auto c_block_space_size_aligned = math::integer_least_multiple(
+            cshuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat.GetElementSpaceSize(),
+            max_lds_align);
+
+        return math::max((a_block_space_size_aligned * sizeof(ADataType) +
+                          b_block_space_size_aligned * sizeof(BDataType)),
+                         c_block_space_size_aligned * sizeof(CShuffleDataType));
     }
 
     // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
-- 
GitLab


From f80776d937b9009654a7cb6e1d3f9584cd7b6e41 Mon Sep 17 00:00:00 2001
From: Sam Wu <sjwu@ualberta.ca>
Date: Thu, 23 Mar 2023 21:58:59 -0600
Subject: [PATCH 04/71] standardize docs (#655)

---
 .gitignore                                    |  11 +-
 .readthedocs.yaml                             |  18 ++
 README.md                                     |  14 +-
 doc/markdown/dockerhub.md                     |  93 ------
 doc/markdown/tutorial_hello_world.md          | 191 -------------
 docs/{ => .doxygen}/Doxyfile                  |  10 +-
 docs/.sphinx/_toc.yml.in                      |   1 +
 docs/.sphinx/requirements.in                  |   1 +
 docs/.sphinx/requirements.txt                 | 269 ++++++++++++++++++
 docs/{source => }/API_Reference_Guide.rst     |   2 +-
 docs/{source => }/Contributors_Guide.rst      |   0
 .../Supported_Primitives_Guide.rst            |   2 +-
 docs/conf.py                                  |  24 ++
 {doc/image => docs/data}/ck_component.png     | Bin
 {doc/image => docs/data}/ck_layer.png         | Bin
 docs/{source => }/dockerhub.rst               |   0
 docs/index.rst                                |  52 ++++
 docs/{source => }/refs.bib                    |   0
 docs/run_doc.sh                               |  15 -
 docs/run_doxygen.sh                           |  10 -
 docs/source/Disclaimer.rst                    |  13 -
 docs/source/Linux_Install_Guide.rst           |  15 -
 docs/source/Makefile                          |  20 --
 docs/source/conf.py                           | 219 --------------
 docs/source/index.rst                         |  16 --
 docs/source/rocm_logo.png                     | Bin 355437 -> 0 bytes
 docs/{source => }/tutorial_hello_world.rst    |   0
 27 files changed, 392 insertions(+), 604 deletions(-)
 create mode 100644 .readthedocs.yaml
 delete mode 100644 doc/markdown/dockerhub.md
 delete mode 100644 doc/markdown/tutorial_hello_world.md
 rename docs/{ => .doxygen}/Doxyfile (99%)
 create mode 100644 docs/.sphinx/_toc.yml.in
 create mode 100644 docs/.sphinx/requirements.in
 create mode 100644 docs/.sphinx/requirements.txt
 rename docs/{source => }/API_Reference_Guide.rst (98%)
 rename docs/{source => }/Contributors_Guide.rst (100%)
 rename docs/{source => }/Supported_Primitives_Guide.rst (99%)
 create mode 100644 docs/conf.py
 rename {doc/image => docs/data}/ck_component.png (100%)
 rename {doc/image => docs/data}/ck_layer.png (100%)
 rename docs/{source => }/dockerhub.rst (100%)
 create mode 100644 docs/index.rst
 rename docs/{source => }/refs.bib (100%)
 delete mode 100755 docs/run_doc.sh
 delete mode 100755 docs/run_doxygen.sh
 delete mode 100644 docs/source/Disclaimer.rst
 delete mode 100644 docs/source/Linux_Install_Guide.rst
 delete mode 100644 docs/source/Makefile
 delete mode 100644 docs/source/conf.py
 delete mode 100644 docs/source/index.rst
 delete mode 100644 docs/source/rocm_logo.png
 rename docs/{source => }/tutorial_hello_world.rst (100%)

diff --git a/.gitignore b/.gitignore
index 5667695bb..362fb9e2e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -48,6 +48,11 @@ build*
 .gdb_history
 install.dir*
 
-# directories containing generated documentation
-docs/source/_build/
-docs/docBin/
+# documentation artifacts
+build/
+_build/
+_images/
+_static/
+_templates/
+_toc.yml
+docBin/
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
new file mode 100644
index 000000000..b73953683
--- /dev/null
+++ b/.readthedocs.yaml
@@ -0,0 +1,18 @@
+# Read the Docs configuration file
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+version: 2
+
+build:
+   os: ubuntu-22.04
+   tools:
+      python: "3.8"
+
+sphinx:
+   configuration: docs/conf.py
+
+formats: [htmlzip]
+
+python:
+   install:
+   - requirements: docs/.sphinx/requirements.txt
diff --git a/README.md b/README.md
index 151da974a..04199f11b 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@ CK utilizes two concepts to achieve performance portability and code maintainabi
 * A tile-based programming model
 * Algorithm complexity reduction for complex ML operators, using innovative technique we call "Tensor Coordinate Transformation".
 
-![ALT](/doc/image/ck_component.png "CK Components")
+![ALT](/docs/data/ck_component.png "CK Components")
 
 ## Code Structure
 Current CK library are structured into 4 layers:
@@ -16,7 +16,17 @@ Current CK library are structured into 4 layers:
 * "Instantiated Kernel and Invoker" layer
 * "Client API" layer
 
-![ALT](/doc/image/ck_layer.png "CK Layers")
+![ALT](/docs/data/ck_layer.png "CK Layers")
+
+## Documentation
+
+Run the steps below to build documentation locally.
+
+```
+cd docs
+pip3 install -r .sphinx/requirements.txt
+python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
+```
 
 ## Contributors
 The list of developers and contributors is here: [Contributors](/CONTRIBUTORS.md)
diff --git a/doc/markdown/dockerhub.md b/doc/markdown/dockerhub.md
deleted file mode 100644
index 91b6cb229..000000000
--- a/doc/markdown/dockerhub.md
+++ /dev/null
@@ -1,93 +0,0 @@
-## CK docker hub
-
-[Docker hub](https://hub.docker.com/r/rocm/composable_kernel)
-
-## Why do I need this?
-
-To make our lives easier and bring Composable Kernel dependencies together, we recommend using docker images.
-
-## So what is Composable Kernel?
-
-Composable Kernel (CK) library aims to provide a programming model for writing performance critical kernels for machine learning workloads across multiple architectures including GPUs, CPUs, etc, through general purpose kernel languages, like HIP C++.
-
-To get the CK library
-
-```
-git clone https://github.com/ROCmSoftwarePlatform/composable_kernel.git
-```
-
-run a docker container 
-
-```
-docker run                                                            \
--it                                                                   \
---privileged                                                          \
---group-add sudo                                                      \
--w /root/workspace                                                    \
--v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace                         \
-rocm/composable_kernel:ck_ub20.04_rocm5.3_release                     \
-/bin/bash
-```
-
-and build the CK
-
-```
-mkdir build && cd build
-
-# Need to specify target ID, example below is for gfx908 and gfx90a
-cmake                                                                                             \
--D CMAKE_PREFIX_PATH=/opt/rocm                                                                    \
--D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                                                         \
--D CMAKE_CXX_FLAGS="-O3"                                                                          \
--D CMAKE_BUILD_TYPE=Release                                                                       \
--D GPU_TARGETS="gfx908;gfx90a"                                                                    \
-..
-```
-
-and 
-
-```
-make -j examples tests
-```
-
-To run all the test cases including tests and examples run
-
-```
-make test
-```
-
-We can also run specific examples or tests like
-
-```
-./bin/example_gemm_xdl_fp16
-./bin/test_gemm_fp16
-```
-
-For more details visit [CK github repo](https://github.com/ROCmSoftwarePlatform/composable_kernel), [CK examples](https://github.com/ROCmSoftwarePlatform/composable_kernel/tree/develop/example), [even more CK examples](https://github.com/ROCmSoftwarePlatform/composable_kernel/tree/develop/client_example).
-
-## And what is inside?
-
-The docker images have everything you need for running CK including:
-
-* [ROCm](https://www.amd.com/en/graphics/servers-solutions-rocm)
-* [CMake](https://cmake.org/)
-* [Compiler](https://github.com/RadeonOpenCompute/llvm-project)
-
-## Which image is right for me?
-
-Let's take a look at the image naming, for example "ck_ub20.04_rocm5.4_release". The image specs are:
-
-* "ck" - made for running Composable Kernel
-* "ub20.04" - based on Ubuntu 20.04
-* "rocm5.4" - ROCm platform version 5.4
-* "release" - compiler version is release
-
-So just pick the right image for your project dependencies and you're all set.
-
-## DIY starts here
-
-If you need to customize a docker image or just can't stop tinkering, feel free to adjust the [Dockerfile](https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/develop/Dockerfile) for your needs.
-
-## License
-
-CK is released under the MIT [license](https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/develop/LICENSE).
diff --git a/doc/markdown/tutorial_hello_world.md b/doc/markdown/tutorial_hello_world.md
deleted file mode 100644
index 297df10b5..000000000
--- a/doc/markdown/tutorial_hello_world.md
+++ /dev/null
@@ -1,191 +0,0 @@
-## CK Hello world
-
-## Motivation
-
-This tutorial is aimed at engineers dealing with artificial intelligence and machine learning who would like to optimize their pipelines and squeeze every performance drop by adding Composable Kernel (CK) library to their projects. We would like to make the CK library approachable so the tutorial is not based on the latest release and doesn't have all the bleeding edge features, but it will be reproducible now and forever.
-
-During this tutorial we will have an introduction to the CK library, we will build it and run some examples and tests, so to say we will run a "Hello world" example. In future tutorials we will go in depth and breadth and get familiar with other tools and ways to integrate CK into your project.
-
-## Description
-
-Modern AI technology solves more and more problems in all imaginable fields, but crafting fast and efficient workflows is still challenging. CK is one of the tools to make AI heavy lifting as fast and efficient as possible. CK is a collection of optimized AI operator kernels and tools to create new ones. The library has components required for majority of modern neural networks architectures including matrix multiplication, convolution, contraction, reduction, attention modules, variety of activation functions, fused operators and many more.
-
-So how do we (almost) reach the speed of light? CK acceleration abilities are based on:
-
-* Layered structure.
-* Tile-based computation model.
-* Tensor coordinate transformation.
-* Hardware acceleration use.
-* Support of low precision data types including fp16, bf16, int8 and int4.
-
-If you are excited and need more technical details and benchmarking results - read this awesome blog [post](https://community.amd.com/t5/instinct-accelerators/amd-composable-kernel-library-efficient-fused-kernels-for-ai/ba-p/553224). 
-
-For more details visit our [github repo](https://github.com/ROCmSoftwarePlatform/composable_kernel).
-
-## Hardware targets
-
-CK library fully supports "gfx908" and "gfx90a" GPU architectures and only some operators are supported for "gfx1030". Let's check the hardware you have at hand and decide on the target GPU architecture 
-
-GPU Target	AMD GPU
-gfx908 	Radeon Instinct MI100
-gfx90a 	Radeon Instinct MI210, MI250, MI250X
-gfx1030 	Radeon PRO V620, W6800, W6800X, W6800X Duo, W6900X, RX 6800, RX 6800 XT, RX 6900 XT, RX 6900 XTX, RX 6950 XT
-
-There are also [cloud options](https://aws.amazon.com/ec2/instance-types/g4/) you can find if you don't have an AMD GPU at hand.
-
-## Build the library
-
-First let's clone the library and rebase to the tested version:
-
-```
-git clone https://github.com/ROCmSoftwarePlatform/composable_kernel.git
-cd composable_kernel/
-git checkout tutorial_hello_world
-```
-
-To make our lives easier we prepared [docker images](https://hub.docker.com/r/rocm/composable_kernel) with all the necessary dependencies. Pick the right image and create a container. In this tutorial we use "rocm/composable_kernel:ck_ub20.04_rocm5.3_release" image, it is based on Ubuntu 20.04, ROCm v5.3, compiler release version.
-
-If your current folder is ${HOME}, start the docker container with
-
-```
-docker run  \
--it  \
---privileged  \
---group-add sudo  \
--w /root/workspace  \
--v ${HOME}:/root/workspace  \
-rocm/composable_kernel:ck_ub20.04_rocm5.3_release  \
-/bin/bash
-```
-
-If your current folder is different from ${HOME}, adjust the line `-v ${HOME}:/root/workspace` to fit your folder structure.
-
-Inside the docker container current folder is "~/workspace", library path is "~/workspace/composable_kernel", navigate to the library
-
-```
-cd composable_kernel/
-```
-
-Create and go to the "build" directory
-
-```
-mkdir build && cd build
-```
-
-In the previous section we talked about target GPU architecture. Once you decide which one is right for you, run cmake using the right GPU_TARGETS flag
-
-```
-cmake  \
--D CMAKE_PREFIX_PATH=/opt/rocm  \
--D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc  \
--D CMAKE_CXX_FLAGS="-O3"  \
--D CMAKE_BUILD_TYPE=Release  \
--D BUILD_DEV=OFF  \
--D GPU_TARGETS="gfx908;gfx90a;gfx1030" ..
-```
-
-If everything went well the cmake run will end up with:
-
-```
--- Configuring done
--- Generating done
--- Build files have been written to: "/root/workspace/composable_kernel/build"
-```
-
-Finally, we can build examples and tests
-
-```
-make -j examples tests
-```
-
-If everything is smooth, you'll see
-
-```
-Scanning dependencies of target tests
-[100%] Built target tests
-```
-
-## Run examples and tests
-
-Examples are listed as test cases as well, so we can run all examples and tests with
-
-```
-ctest
-```
-
-You can check the list of all tests by running
-
-```
-ctest -N
-```
-
-We can also run them separately, here is a separate example execution. 
-
-```
-./bin/example_gemm_xdl_fp16 1 1 1
-```
-
-The arguments "1 1 1" mean that we want to run this example in the mode: verify results with CPU, initialize matrices with integers and benchmark the kernel execution. You can play around with these parameters and see how output and execution results change.
-
-If everything goes well and you have a device based on gfx908 or gfx90a architecture you should see something like
-
-```
-a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
-b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096}
-c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
-launch_and_time_kernel: grid_dim {480, 1, 1}, block_dim {256, 1, 1}
-Warm up 1 time
-Start running 10 times...
-Perf: 1.10017 ms, 117.117 TFlops, 87.6854 GB/s, DeviceGemmXdl<256, 256, 128, 4, 8, 32, 32, 4, 2> NumPrefetch: 1, LoopScheduler: Default, PipelineVersion: v1
-```
-
-Meanwhile, running it on a gfx1030 device should result in
-
-```
-a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
-b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096}
-c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
-DeviceGemmXdl<256, 256, 128, 4, 8, 32, 32, 4, 2> NumPrefetch: 1, LoopScheduler: Default, PipelineVersion: v1 does not support this problem
-```
-
-But don't panic, some of the operators are supported on gfx1030 architecture, so you can run a separate example like
-
-```
-./bin/example_gemm_dl_fp16 1 1 1
-```
-
-and it should result in something nice similar to
-
-```
-a_m_k: dim 2, lengths {3840, 4096}, strides {1, 4096}
-b_k_n: dim 2, lengths {4096, 4096}, strides {4096, 1}
-c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
-arg.a_grid_desc_k0_m0_m1_k1_{2048, 3840, 2}
-arg.b_grid_desc_k0_n0_n1_k1_{2048, 4096, 2}
-arg.c_grid_desc_m_n_{ 3840, 4096}
-launch_and_time_kernel: grid_dim {960, 1, 1}, block_dim {256, 1, 1}
-Warm up 1 time
-Start running 10 times...
-Perf: 3.65695 ms, 35.234 TFlops, 26.3797 GB/s, DeviceGemmDl<256, 128, 128, 16, 2, 4, 4, 1>
-```
-
-Or we can run a separate test
-
-```
-ctest -R test_gemm_fp16
-```
-
-If everything goes well you should see something like
-
-```
-Start 121: test_gemm_fp16
-1/1 Test #121: test_gemm_fp16 ...................   Passed   51.81 sec
-
-100% tests passed, 0 tests failed out of 1
-```
-
-## Summary
-
-In this tutorial we took the first look at the Composable Kernel library, built it on your system and ran some examples and tests. Stay tuned, in the next tutorial we will run kernels with different configs to find out the best one for your hardware and task.
-
-P.S.: Don't forget to switch out the cloud instance if you have launched one, you can find better ways to spend your money for sure!
diff --git a/docs/Doxyfile b/docs/.doxygen/Doxyfile
similarity index 99%
rename from docs/Doxyfile
rename to docs/.doxygen/Doxyfile
index ca354598b..1084f94c8 100644
--- a/docs/Doxyfile
+++ b/docs/.doxygen/Doxyfile
@@ -51,7 +51,7 @@ PROJECT_BRIEF          = "prototype interfaces compatible with ROCm platform and
 # pixels and the maximum width should not exceed 200 pixels. Doxygen will copy
 # the logo to the output directory.
 
-PROJECT_LOGO           = ./rocm.jpg
+PROJECT_LOGO           = 
 
 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path
 # into which the generated documentation will be written. If a relative path is
@@ -775,10 +775,10 @@ WARN_LOGFILE           =
 # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
 # Note: If this tag is empty the current directory is searched.
 
-INPUT                  = ../include/ck/tensor_operation/gpu/grid \
-                         ../include/ck/tensor_operation/gpu/block \
-                         ../include/ck/tensor_operation/gpu/thread \
-                         ../library/include/ck/library/utility
+INPUT                  = ../../include/ck/tensor_operation/gpu/grid \
+                         ../../include/ck/tensor_operation/gpu/block \
+                         ../../include/ck/tensor_operation/gpu/thread \
+                         ../../library/include/ck/library/utility
 
 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
diff --git a/docs/.sphinx/_toc.yml.in b/docs/.sphinx/_toc.yml.in
new file mode 100644
index 000000000..ff2124887
--- /dev/null
+++ b/docs/.sphinx/_toc.yml.in
@@ -0,0 +1 @@
+root: index
diff --git a/docs/.sphinx/requirements.in b/docs/.sphinx/requirements.in
new file mode 100644
index 000000000..2dfc7b076
--- /dev/null
+++ b/docs/.sphinx/requirements.in
@@ -0,0 +1 @@
+git+https://github.com/RadeonOpenCompute/rocm-docs-core.git
diff --git a/docs/.sphinx/requirements.txt b/docs/.sphinx/requirements.txt
new file mode 100644
index 000000000..e2b793590
--- /dev/null
+++ b/docs/.sphinx/requirements.txt
@@ -0,0 +1,269 @@
+#
+# This file is autogenerated by pip-compile with Python 3.10
+# by the following command:
+#
+#    pip-compile requirements.in
+#
+accessible-pygments==0.0.4
+    # via pydata-sphinx-theme
+alabaster==0.7.13
+    # via sphinx
+asttokens==2.2.1
+    # via stack-data
+attrs==22.2.0
+    # via
+    #   jsonschema
+    #   jupyter-cache
+babel==2.12.1
+    # via
+    #   pydata-sphinx-theme
+    #   sphinx
+backcall==0.2.0
+    # via ipython
+beautifulsoup4==4.12.0
+    # via pydata-sphinx-theme
+breathe==4.34.0
+    # via rocm-docs-core
+certifi==2022.12.7
+    # via requests
+cffi==1.15.1
+    # via pynacl
+charset-normalizer==3.1.0
+    # via requests
+click==8.1.3
+    # via
+    #   jupyter-cache
+    #   sphinx-external-toc
+comm==0.1.3
+    # via ipykernel
+debugpy==1.6.6
+    # via ipykernel
+decorator==5.1.1
+    # via ipython
+deprecated==1.2.13
+    # via pygithub
+docutils==0.16
+    # via
+    #   breathe
+    #   myst-parser
+    #   pydata-sphinx-theme
+    #   rocm-docs-core
+    #   sphinx
+executing==1.2.0
+    # via stack-data
+fastjsonschema==2.16.3
+    # via nbformat
+gitdb==4.0.10
+    # via gitpython
+gitpython==3.1.31
+    # via rocm-docs-core
+greenlet==2.0.2
+    # via sqlalchemy
+idna==3.4
+    # via requests
+imagesize==1.4.1
+    # via sphinx
+importlib-metadata==6.1.0
+    # via
+    #   jupyter-cache
+    #   myst-nb
+importlib-resources==5.10.4
+    # via rocm-docs-core
+ipykernel==6.22.0
+    # via myst-nb
+ipython==8.11.0
+    # via
+    #   ipykernel
+    #   myst-nb
+jedi==0.18.2
+    # via ipython
+jinja2==3.1.2
+    # via
+    #   myst-parser
+    #   sphinx
+jsonschema==4.17.3
+    # via nbformat
+jupyter-cache==0.5.0
+    # via myst-nb
+jupyter-client==8.1.0
+    # via
+    #   ipykernel
+    #   nbclient
+jupyter-core==5.3.0
+    # via
+    #   ipykernel
+    #   jupyter-client
+    #   nbformat
+linkify-it-py==1.0.3
+    # via myst-parser
+markdown-it-py==2.2.0
+    # via
+    #   mdit-py-plugins
+    #   myst-parser
+markupsafe==2.1.2
+    # via jinja2
+matplotlib-inline==0.1.6
+    # via
+    #   ipykernel
+    #   ipython
+mdit-py-plugins==0.3.5
+    # via myst-parser
+mdurl==0.1.2
+    # via markdown-it-py
+myst-nb==0.17.1
+    # via rocm-docs-core
+myst-parser[linkify]==0.18.1
+    # via
+    #   myst-nb
+    #   rocm-docs-core
+nbclient==0.5.13
+    # via
+    #   jupyter-cache
+    #   myst-nb
+nbformat==5.8.0
+    # via
+    #   jupyter-cache
+    #   myst-nb
+    #   nbclient
+nest-asyncio==1.5.6
+    # via
+    #   ipykernel
+    #   nbclient
+packaging==23.0
+    # via
+    #   ipykernel
+    #   pydata-sphinx-theme
+    #   sphinx
+parso==0.8.3
+    # via jedi
+pexpect==4.8.0
+    # via ipython
+pickleshare==0.7.5
+    # via ipython
+platformdirs==3.1.1
+    # via jupyter-core
+prompt-toolkit==3.0.38
+    # via ipython
+psutil==5.9.4
+    # via ipykernel
+ptyprocess==0.7.0
+    # via pexpect
+pure-eval==0.2.2
+    # via stack-data
+pycparser==2.21
+    # via cffi
+pydata-sphinx-theme==0.13.1
+    # via sphinx-book-theme
+pygithub==1.57
+    # via rocm-docs-core
+pygments==2.14.0
+    # via
+    #   accessible-pygments
+    #   ipython
+    #   pydata-sphinx-theme
+    #   sphinx
+pyjwt==2.6.0
+    # via pygithub
+pynacl==1.5.0
+    # via pygithub
+pyrsistent==0.19.3
+    # via jsonschema
+python-dateutil==2.8.2
+    # via jupyter-client
+pyyaml==6.0
+    # via
+    #   jupyter-cache
+    #   myst-nb
+    #   myst-parser
+    #   sphinx-external-toc
+pyzmq==25.0.2
+    # via
+    #   ipykernel
+    #   jupyter-client
+requests==2.28.2
+    # via
+    #   pygithub
+    #   sphinx
+rocm-docs-core @ git+https://github.com/RadeonOpenCompute/rocm-docs-core.git
+    # via -r requirements.in
+six==1.16.0
+    # via
+    #   asttokens
+    #   python-dateutil
+smmap==5.0.0
+    # via gitdb
+snowballstemmer==2.2.0
+    # via sphinx
+soupsieve==2.4
+    # via beautifulsoup4
+sphinx==4.3.1
+    # via
+    #   breathe
+    #   myst-nb
+    #   myst-parser
+    #   pydata-sphinx-theme
+    #   rocm-docs-core
+    #   sphinx-book-theme
+    #   sphinx-copybutton
+    #   sphinx-design
+    #   sphinx-external-toc
+    #   sphinx-notfound-page
+sphinx-book-theme==1.0.0rc2
+    # via rocm-docs-core
+sphinx-copybutton==0.5.1
+    # via rocm-docs-core
+sphinx-design==0.3.0
+    # via rocm-docs-core
+sphinx-external-toc==0.3.1
+    # via rocm-docs-core
+sphinx-notfound-page==0.8.3
+    # via rocm-docs-core
+sphinxcontrib-applehelp==1.0.4
+    # via sphinx
+sphinxcontrib-devhelp==1.0.2
+    # via sphinx
+sphinxcontrib-htmlhelp==2.0.1
+    # via sphinx
+sphinxcontrib-jsmath==1.0.1
+    # via sphinx
+sphinxcontrib-qthelp==1.0.3
+    # via sphinx
+sphinxcontrib-serializinghtml==1.1.5
+    # via sphinx
+sqlalchemy==1.4.47
+    # via jupyter-cache
+stack-data==0.6.2
+    # via ipython
+tabulate==0.9.0
+    # via jupyter-cache
+tornado==6.2
+    # via
+    #   ipykernel
+    #   jupyter-client
+traitlets==5.9.0
+    # via
+    #   comm
+    #   ipykernel
+    #   ipython
+    #   jupyter-client
+    #   jupyter-core
+    #   matplotlib-inline
+    #   nbclient
+    #   nbformat
+typing-extensions==4.5.0
+    # via
+    #   myst-nb
+    #   myst-parser
+uc-micro-py==1.0.1
+    # via linkify-it-py
+urllib3==1.26.15
+    # via requests
+wcwidth==0.2.6
+    # via prompt-toolkit
+wrapt==1.15.0
+    # via deprecated
+zipp==3.15.0
+    # via importlib-metadata
+
+# The following packages are considered to be unsafe in a requirements file:
+# setuptools
diff --git a/docs/source/API_Reference_Guide.rst b/docs/API_Reference_Guide.rst
similarity index 98%
rename from docs/source/API_Reference_Guide.rst
rename to docs/API_Reference_Guide.rst
index 3665049dd..b59c6e302 100644
--- a/docs/source/API_Reference_Guide.rst
+++ b/docs/API_Reference_Guide.rst
@@ -49,4 +49,4 @@ used in the CK GPU implementation of Flashattention.
 
 .. doxygenstruct:: ck::ThreadwiseTensorSliceTransfer_StaticToStatic
 
-.. bibliography::
\ No newline at end of file
+.. bibliography::
diff --git a/docs/source/Contributors_Guide.rst b/docs/Contributors_Guide.rst
similarity index 100%
rename from docs/source/Contributors_Guide.rst
rename to docs/Contributors_Guide.rst
diff --git a/docs/source/Supported_Primitives_Guide.rst b/docs/Supported_Primitives_Guide.rst
similarity index 99%
rename from docs/source/Supported_Primitives_Guide.rst
rename to docs/Supported_Primitives_Guide.rst
index 066e024bc..4c3adf67d 100644
--- a/docs/source/Supported_Primitives_Guide.rst
+++ b/docs/Supported_Primitives_Guide.rst
@@ -72,4 +72,4 @@ Else if :math:`j>1`,
       \tilde{Y}_{ij} &= \diag(z^{new}_{i})^{-1} \exp(\tilde{m}_{ij} - m^{new}_i ) \tilde{P}_{ij} \\
       z_i            &= z^{new}_i \\
       m_i            &= m^{new}_i \\
-   \end{align}
\ No newline at end of file
+   \end{align}
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 000000000..9b43b7155
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,24 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# This file only contains a selection of the most common options. For a full
+# list see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+from rocm_docs import ROCmDocs
+
+docs_core = ROCmDocs("Composable Kernel Documentation")
+docs_core.run_doxygen()
+docs_core.setup()
+
+mathjax3_config = {
+'tex': {
+    'macros': {
+        'diag': '\\operatorname{diag}',
+        }
+    }
+}
+
+bibtex_bibfiles = ['refs.bib']
+
+for sphinx_var in ROCmDocs.SPHINX_VARS:
+    globals()[sphinx_var] = getattr(docs_core, sphinx_var)
diff --git a/doc/image/ck_component.png b/docs/data/ck_component.png
similarity index 100%
rename from doc/image/ck_component.png
rename to docs/data/ck_component.png
diff --git a/doc/image/ck_layer.png b/docs/data/ck_layer.png
similarity index 100%
rename from doc/image/ck_layer.png
rename to docs/data/ck_layer.png
diff --git a/docs/source/dockerhub.rst b/docs/dockerhub.rst
similarity index 100%
rename from docs/source/dockerhub.rst
rename to docs/dockerhub.rst
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 000000000..f4e66c1b5
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,52 @@
+============================
+Composable Kernel User Guide
+============================
+
+------------
+Introduction
+------------
+
+This document contains instructions for installing, using, and contributing to Composable Kernel (CK).
+
+-----------
+Methodology
+-----------
+
+The Composable Kernel (CK) library aims to provide a programming model for writing performance-critical kernels for machine learning workloads across multiple architectures, including GPUs, CPUs, etc., through general-purpose kernel languages such as HIP C++.
+
+CK utilizes two concepts to achieve performance portability and code maintainability:
+
+* A tile-based programming model
+* Algorithm complexity reduction for complex ML operators, using an innovative technique we call "Tensor Coordinate Transformation".
+
+.. image:: data/ck_component.png
+   :alt: CK Components
+
+--------------
+Code Structure
+--------------
+
+The CK library is currently structured into 4 layers:
+
+* "Templated Tile Operators" layer
+* "Templated Kernel and Invoker" layer
+* "Instantiated Kernel and Invoker" layer
+* "Client API" layer
+
+.. image:: data/ck_layer.png
+   :alt: CK Layers
+   
+Documentation Roadmap
+^^^^^^^^^^^^^^^^^^^^^
+The following is a list of CK documents in the suggested reading order:
+
+.. toctree::
+   :maxdepth: 5
+   :caption: Contents:
+   :numbered:
+
+   tutorial_hello_world
+   dockerhub
+   Supported_Primitives_Guide
+   API_Reference_Guide
+   Contributors_Guide
diff --git a/docs/source/refs.bib b/docs/refs.bib
similarity index 100%
rename from docs/source/refs.bib
rename to docs/refs.bib
diff --git a/docs/run_doc.sh b/docs/run_doc.sh
deleted file mode 100755
index 58b0936c6..000000000
--- a/docs/run_doc.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/bin/bash
-
-set -eu
-
-# Make this directory the PWD
-cd "$(dirname "${BASH_SOURCE[0]}")"
-
-# Build doxygen info
-bash run_doxygen.sh
-
-# Build sphinx docs
-cd source
-make clean
-make -e SPHINXOPTS="-t html" html
-make latexpdf
diff --git a/docs/run_doxygen.sh b/docs/run_doxygen.sh
deleted file mode 100755
index f66c038c1..000000000
--- a/docs/run_doxygen.sh
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/bin/bash
-
-set -eu
-
-# Make this directory the PWD
-cd "$(dirname "${BASH_SOURCE[0]}")"
-
-# Build the doxygen info
-rm -rf docBin
-doxygen Doxyfile
diff --git a/docs/source/Disclaimer.rst b/docs/source/Disclaimer.rst
deleted file mode 100644
index 5dcff748c..000000000
--- a/docs/source/Disclaimer.rst
+++ /dev/null
@@ -1,13 +0,0 @@
-************
-Disclaimer
-************
--------------------------------
-AMD's standard legal Disclaimer
--------------------------------
-
-The information presented in this document is for informational purposes only and may contain technical inaccuracies, omissions, and typographical errors. The information contained herein is subject to change and may be rendered inaccurate for many reasons, including but not limited to product and roadmap changes, component and motherboard version changes, new model and/or product releases, product differences between differing manufacturers, software changes, BIOS flashes, firmware upgrades, or the like. Any computer system has risks of security vulnerabilities that cannot be completely prevented or mitigated. AMD assumes no obligation to update or otherwise correct or revise this information. However, AMD reserves the right to revise this information and to make changes from time to time to the content hereof without obligation of AMD to notify any person of such revisions or changes. THIS INFORMATION IS PROVIDED 'AS IS." AMD MAKES NO REPRESENTATIONS OR WARRANTIES WITH RESPECT TO THE CONTENTS HEREOF AND ASSUMES NO RESPONSIBILITY FOR ANY INACCURACIES, ERRORS, OR OMISSIONS THAT MAY APPEAR IN THIS INFORMATION. AMD SPECIFICALLY DISCLAIMS ANY IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR ANY PARTICULAR PURPOSE. IN NO EVENT WILL AMD BE LIABLE TO ANY PERSON FOR ANY RELIANCE, DIRECT, INDIRECT, SPECIAL, OR OTHER CONSEQUENTIAL DAMAGES ARISING FROM THE USE OF ANY INFORMATION CONTAINED HEREIN, EVEN IF AMD IS EXPRESSLY ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. AMD, the AMD Arrow logo, Radeon, Ryzen, Epyc, and combinations thereof are trademarks of Advanced Micro Devices, Inc. Other product names used in this publication are for identification purposes only and may be trademarks of their respective companies. Google(R)  is a registered trademark of Google LLC. PCIe(R) is a registered trademark of PCI-SIG Corporation. Linux(R) is the registered trademark of Linus Torvalds in the U.S. and other countries. 
Ubuntu(R) and the Ubuntu logo are registered trademarks of Canonical Ltd. Other product names used in this publication are for identification purposes only and may be trademarks of their respective companies. (C)2023 Advanced Micro Devices, Inc. All rights reserved.
-
-----------------------
-Third Party Disclaimer
-----------------------
-Third-party content is licensed to you directly by the third party that owns the content and is not licensed to you by AMD. ALL LINKED THIRD-PARTY CONTENT IS PROVIDED "AS IS" WITHOUT A WARRANTY OF ANY KIND. USE OF SUCH THIRD-PARTY CONTENT IS DONE AT YOUR SOLE DISCRETION AND UNDER NO CIRCUMSTANCES WILL AMD BE LIABLE TO YOU FOR ANY THIRD-PARTY CONTENT. YOU ASSUME ALL RISK AND ARE SOLELY RESPONSIBLE FOR ANY DAMAGES THAT MAY ARISE FROM YOUR USE OF THIRD-PARTY CONTENT.
diff --git a/docs/source/Linux_Install_Guide.rst b/docs/source/Linux_Install_Guide.rst
deleted file mode 100644
index 0e16bb6a9..000000000
--- a/docs/source/Linux_Install_Guide.rst
+++ /dev/null
@@ -1,15 +0,0 @@
-=====================
-Getting Started Guide
-=====================
-
-------------
-Introduction
-------------
-
-This document contains instructions for installing, using, and contributing to Composable Kernel (CK).
-
-Documentation Roadmap
-^^^^^^^^^^^^^^^^^^^^^
-The following is a list of CK documents in the suggested reading order:
-
-[TODO]
\ No newline at end of file
diff --git a/docs/source/Makefile b/docs/source/Makefile
deleted file mode 100644
index bde66ebc2..000000000
--- a/docs/source/Makefile
+++ /dev/null
@@ -1,20 +0,0 @@
-# Minimal makefile for Sphinx documentation
-#
-
-# You can set these variables from the command line.
-SPHINXOPTS    =
-SPHINXBUILD   = sphinx-build
-SPHINXPROJ    = CK
-SOURCEDIR     = .
-BUILDDIR      = _build
-
-# Put it first so that "make" without argument is like "make help".
-help:
-	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
-
-.PHONY: help Makefile
-
-# Catch-all target: route all unknown targets to Sphinx using the new
-# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
-%: Makefile
-	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/source/conf.py b/docs/source/conf.py
deleted file mode 100644
index 65ac18703..000000000
--- a/docs/source/conf.py
+++ /dev/null
@@ -1,219 +0,0 @@
-"""Copyright (C) 2018-2023 Advanced Micro Devices, Inc. All rights reserved.
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
-   ies of the Software, and to permit persons to whom the Software is furnished
-   to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
-   PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
-   FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-   COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
-   IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
-   CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-"""
-
-# -*- coding: utf-8 -*-
-#
-# Composable Kernel (CK) docuumentation build configuration file, based on
-# rocBLAS documentation build configuration file, created by
-# sphinx-quickstart on Mon Jan  8 16:34:42 2018.
-#
-# This file is execfile()d with the current directory set to its
-# containing dir.
-#
-# Note that not all possible configuration values are present in this
-# autogenerated file.
-#
-# All configuration values have a default; values that are commented out
-# serve to show the default.
-
-# If extensions (or modules to document with autodoc) are in another directory,
-# add these directories to sys.path here. If the directory is relative to the
-# documentation root, use os.path.abspath to make it absolute, like shown here.
-#
-# import os
-# import sys
-# sys.path.insert(0, os.path.abspath('.'))
-
-import os
-import sys
-import subprocess
-
-read_the_docs_build = os.environ.get('READTHEDOCS', None) == 'True'
-
-if read_the_docs_build:
-    subprocess.call('../run_doxygen.sh')
-
-# -- General configuration ------------------------------------------------
-
-# If your documentation needs a minimal Sphinx version, state it here.
-#
-# needs_sphinx = '1.0'
-
-# Add any Sphinx extension module names here, as strings. They can be
-# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
-# ones.
-extensions = ['sphinx.ext.mathjax', 'breathe', 'sphinxcontrib.bibtex']
-
-breathe_projects = { "CK": "../docBin/xml" }
-breathe_default_project = "CK"
-
-bibtex_bibfiles = ['refs.bib']
-
-# Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
-
-# The suffix(es) of source filenames.
-# You can specify multiple suffix as a list of string:
-#
-# source_suffix = ['.rst', '.md']
-source_suffix = '.rst'
-
-# The master toctree document.
-master_doc = 'index'
-
-# General information about the project.
-project = u'Composable Kernel (CK)'
-copyright = u'2018-2023, Advanced Micro Devices'
-author = u'Advanced Micro Devices'
-
-# The version info for the project you're documenting, acts as replacement for
-# |version| and |release|, also used in various other places throughout the
-# built documents.
-#
-# The short X.Y version.
-#version = u'0.8'
-# The full version, including alpha/beta/rc tags.
-#release = u'0.8'
-
-# The language for content autogenerated by Sphinx. Refer to documentation
-# for a list of supported languages.
-#
-# This is also used if you do content translation via gettext catalogs.
-# Usually you set "language" from the command line for these cases.
-language = 'en'
-
-# List of patterns, relative to source directory, that match files and
-# directories to ignore when looking for source files.
-# This patterns also effect to html_static_path and html_extra_path
-exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
-
-# The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx'
-
-# If true, `todo` and `todoList` produce output, else they produce nothing.
-todo_include_todos = False
-
-
-# -- Options for HTML output ----------------------------------------------
-
-# The theme to use for HTML and HTML Help pages.  See the documentation for
-# a list of builtin themes.
-#
-# html_theme = 'alabaster'
-
-#if read_the_docs_build:
-#    html_theme = 'default'
-#else:
-import sphinx_rtd_theme
-html_theme = "sphinx_rtd_theme"
-html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
-html_logo = "rocm_logo.png"
-
-# Theme options are theme-specific and customize the look and feel of a theme
-# further.  For a list of options available for each theme, see the
-# documentation.
-html_theme_options = {
-    'logo_only': True,
-    'display_version': True
-}
-
-# Add any paths that contain custom static files (such as style sheets) here,
-# relative to this directory. They are copied after the builtin static files,
-# so a file named "default.css" will overwrite the builtin "default.css".
-#html_static_path = ['_static']
-
-# Custom sidebar templates, must be a dictionary that maps document names
-# to template names.
-#
-# This is required for the alabaster theme
-# refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
-# html_sidebars = {
-#     '**': [
-#         'relations.html',  # needs 'show_related': True theme option to display
-#         'searchbox.html',
-#     ]
-# }
-
-mathjax3_config = {
-'tex': {
-    'macros': {
-        'diag': '\\operatorname{diag}',
-        }
-    }
-}
-
-# -- Options for HTMLHelp output ------------------------------------------
-
-# Output file base name for HTML help builder.
-htmlhelp_basename = 'CKdoc'
-
-
-# -- Options for LaTeX output ---------------------------------------------
-
-latex_elements = {
-    # The paper size ('letterpaper' or 'a4paper').
-    #
-    'papersize': 'letterpaper',
-
-    # The font size ('10pt', '11pt' or '12pt').
-    #
-    'pointsize': '10pt',
-
-    # Additional stuff for the LaTeX preamble.
-    #
-    'preamble': r'''
-\setcounter{tocdepth}{5}
-\newcommand{\diag}{\operatorname{diag}}
-''',
-
-    # Latex figure (float) alignment
-    #
-    # 'figure_align': 'htbp',
-}
-
-# Grouping the document tree into LaTeX files. List of tuples
-# (source start file, target name, title,
-#  author, documentclass [howto, manual, or own class]).
-latex_documents = [
-    (master_doc, 'CK.tex', u'Composabl Kernel (CK) Documentation',
-     u'Advanced Micro Devices', 'manual'),
-]
-
-
-# -- Options for manual page output ---------------------------------------
-
-# One entry per manual page. List of tuples
-# (source start file, name, description, authors, manual section).
-man_pages = [
-    (master_doc, 'ck', u'Composable Kernel (CK) Documentation',
-     [author], 1)
-]
-
-
-# -- Options for Texinfo output -------------------------------------------
-
-# Grouping the document tree into Texinfo files. List of tuples
-# (source start file, target name, title, author,
-#  dir menu entry, description, category)
-texinfo_documents = [
-    (master_doc, 'CK', u'Composable Kernel (CK) Documentation',
-     author, 'CK', 'Composable Kernel for AMD ROCm',
-     'Miscellaneous'),
-]
diff --git a/docs/source/index.rst b/docs/source/index.rst
deleted file mode 100644
index 68adf58af..000000000
--- a/docs/source/index.rst
+++ /dev/null
@@ -1,16 +0,0 @@
-============================
-Composable Kernel User Guide
-============================
-
-.. toctree::
-   :maxdepth: 5
-   :caption: Contents:
-   :numbered:
-
-   Linux_Install_Guide
-   tutorial_hello_world
-   dockerhub
-   Supported_Primitives_Guide
-   API_Reference_Guide
-   Contributors_Guide
-   Disclaimer
\ No newline at end of file
diff --git a/docs/source/rocm_logo.png b/docs/source/rocm_logo.png
deleted file mode 100644
index ee09dd09c71e3de1081f0f4c8b67967f79d768fe..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 355437
zcmV)XK&`)tP)<h;3K|Lk000e1NJLTq00BJ!00G<x1^@s6q=S9B00001b5ch_0Itp)
z=>Px#1ZP1_K>z@;j|==^1poj532;bRa{vGi!vFvd!vV){sAK>D|D{PpK~#8NZ2fh3
z9a+An3+D7WmCR{IF*7r>#mvlLvBk{H%#0RWvP_bhnbJ<Ktg0+L-G}<loxaa}^F`QM
zr|#VT<M%{t)ZW@F-t~SFD^|QyU0H~(_FA+xRiVAM9CMwmxHQ~@N0YtS>8QeLa~Y<Z
zt8jjG6*sQj#Pyw<*jzb_o0o6n($*#1*}0CVckbibl`9w@AH(R-5DKz$&{R{8vf@(M
zJDq@qwKaUby^)lZfQp7nobYl*R7?U&iz-oBQibBYB1D9TV|s2HtJ@ps9vMLAzyMl%
z`rs852A_~fg!B0Yd4*^#uSR20DQe2g`ToVoOUpn})+yeugulNZB4Q(8X=(@?6=l@>
z24dbj6c;>#al<nR=j@%a=46lC5uv#3>WND}L3mVMgIDu&*cuta_&^_eS=Ej82o4H_
zt$`uz<&_Yop^YF3DTGT)B1}mde$qz~qpXa0c}2t>KZI5@16&LZ#aSPJJj%+!qL(k$
zgG2DJwgzV#>(G{;i>9(lbhmY4cxV9QV?&r39mW*bhx>Za+17%lrUumT`DOfG1%-vk
z%*;eeaxx;rBM|5x03RP8xVX5$*wh4y{BBaxGB|Sd7>*n{f)giBproi6Cmfw%Wn~2m
zOG{YV*ud1<23jVjkkQh?VP$pfS5m`4H7y*{(8W<bBS@QCLdn(vs*cW3b8&~3rw?@f
z17Q#n29wAbn8n7!A|VM@$tPixmJa*OY&ho@z^$MN-lgU6udG6FZ5=}D>JeJsfbhm9
zL^ZV_y15l`9i2$%>PCEL7h>8w5Y^U(NLEyPyI3)u9f<4hL=wM$3jgP6L;c7c9YWUl
z2(l)|kUcqp+?iPvEi9q@^g3!+w$Z${gQDRj_~+EYJ~j`oDP{0VuR?rPKl;wx#@5q6
zVCn9+7`b>KgBNaN{rL|#|HB`!`1B<PuH8V(`EzJIw}s~OTWC4AiQya9u<-N=<{mx7
z;*+NszkL@Yx9(v0&OHp@e~94+k1+D+3C5p1!{pQFm}bpCdx?eTte0P7@$1(({puSm
zz5W)<uin(k>+i%`{pNeDy{(OJe-P`;cW-LryC1R6TH}4<bNIRqzRxD>EZ^(wYp#F&
z3hOVv#?rGFn19T_{pcCSA3VXxy~h~5_Xs_A@1yh19kkuNiI(fv(Q@q?TCQG0^OdV&
z3G4rOyXo>3H0@qS)6Om$FYTb={CTuox`c);{!cc}qwCytw65-=ZuT5vDtloanFE`s
zT-e6s!!fBCZs`^9$gYB0W;tBaiuk|J!-=>|*hVD7EFcnk?*7nr_Jq3q2`HLdLDtY1
z@+=K=OXykIK+nPo_HLdCk4=On+nl+DC2VZ0VPRn|uA5ofLrzs6UmQ}x7su4GS56Pd
zbu6H2>jG^TZ?;o+95ZmjK}}a2)b)XcT?AAEGN2z@3e}K2?6Z!<Z<O8ep3Dh+D(8;R
zWZm!?m!Bwj;v+>be5B;Z3c#ldL3pRLtq#4t&8V&`#i`tMG!T3}%{7>4uR%|BKIX@}
zxuJGqX?+Ek_|H2%zks{fZ(?KZ3`R!AaGw9Xix)3om5<NP&!Doh92ptu$jQlpr<XSz
zot+U95rNRi2n2<PARsIVVbKvNudG3Gd<u#)PN9sD%*)Ea^vne634r>xc2rh3psKPS
z?ruIvIhlb9!f9w|7)|9hsLd}y?#Xl{M#UhSuuM-+M?i28OwCQ<dcqMN78dYU)j)qx
zBmoqH4JU7`S~}pYlQVV#e6a21hC6{Fc%71g*97Rlj89^_v<$0tb(n8&Lr+~TvXYbF
zK<K$Bsv*tP8Y!CENK{oqtd<<&wH1+}ql;KsIYh}GMy0tnwxdFEB{~|n5|c3P?1HNq
zsW?}Vjh<7fs5+H_hVm+Ows&J-Yy?9SBN!!QN4T-~b$6kS-?^c&0o64%DB<VGFDOJt
zdL|O%6A>O7CPL4forZ&>BXkY)Ag3q~X&G4p?<kHPJBEOO0OaT9!Oq6^{{}r{ODkv?
z8AF<%{jiEU4iJ0?)wS7hba71I7&2y7P_}b~nv)CET|J=V?Faq9U>Jsm!z?-u=5YzI
zN=kvv$u!vV^EhVb!i61=2mfb2>|_G?n}iT{LWMOnBC?slYi&aee?I|uiQPR&V8wQJ
z{vCEv9RwgNx}zPjT>|(zkl5RWWWw)ce;-Z`4&Wp^&g{u)WQ<NCXKDe(b89G{+d}E&
z86?&Z!!@}CZYczBdI|iqD-c`JhT5q$%-?*5mHXdd;M@(gtzE$4!>_UR-Om`keGm1U
zTc|m+f!eK2G@Lt&Cc>}d(glp&y^V!u&j_XOFh$S}-MoXo>$fmSs0`kFfRTrfG4}W=
zCLTW%p*Q#R#armT`oBR>ECGDqd?(flfwuaM0JZ;T3HPsm_Y<M_qqx2*fb!*QtPpa`
z&tLq#mYzMwq5wVq%{hX5`teJQ-+zXoyN}U(_W_}I7wxxh6M8q$eC;}#MaW$v-~{Lq
zge>9qulF_WULo)Zy^A|&WCzf8@giEc&ZCJP%Fy{csGZzGXl^S^{L-Q677yd#Y}hB2
z!!5g>RSU<Hr7(?1hpsO>#6Y$W|7chQM8m`<47x5p(6D!bvZXC#jZ7h-tp^1{Pv6!a
zTBhbOWCt7^9ShHZAea$?LK_u;XJKgpGXhaV+Xx4bE90X*GI)Ql9KMiH$1x2PXxY2L
zjB;Y=5eQ`)4@et%;FysQ<ecN69bN#lv<B!URO6UyD&E)f!@KhC_*mW(pUHdSa|IuK
zs_cu8mHqIsN+3Q_3C5?2p?IgYz6y=?<>=^XMNwrjnrV7XZH+V!njZ~tWTGE4OEb7~
z>pFJWNX9$caQ{3_kDyywoI__<3t=^it&P(-yRn4UCc?6-4Xw=$2n_Itr-ui^Xe%L+
z;V?8ap~V!Supl2{p&@X0ut#%#9%kEHFhAUj;ek#Rlonx>4L<)=5t=F+aWXaur!q3p
z$oH$Kft{k+mFE<qGOrjpTtAtTg7~CF*g87E&dwIDwzlx%KiNk`12s<W=(2IbxRpKn
zxS^lsW0!(_vFL1%^90F*#1w3K_+Z7|8!Lq1a$F*|%PKIOmxn@HsWU&Xy}U9K&8(5h
z9xFms5pGiZ5U;0-lZ0xXjvjIiHBe-zf-y%Mthl*i%i9;1V`K1hS1WGPko$|WP@S74
z!n(I}03(xA7^KMz4-a908+lh(Ct3v1H#MS?&?_z}MQ&aJGSV^;7ZZoD;7|nk@&D-Q
z2^)TX4Q)-x$jkm6deYL;NJvN!t<a8-3(&I^z(>0_Vh^fDa7(Id;-In`_7i*p^bTpU
z7d0?}thqH*9Zo>Q*$tZRUeNOmfFb{$NkkMZViRDQm<*egG}!U8*t2tTIz_w6FM>Ov
z=UY*Qz}i}bG&CTTz!NR6rBwvpua=k8(~Cq_Y*!Z{N6-;|V(~jibqcok27K`}z67xd
z%3i*vuOI276Ewh4n%^7>XI4=(xrWq^N%-Y75_p7NRtdtH6cVbNki-9L?c`~UT)2k$
zTTjrxeH9&NE@JZ5L(D#WhSqbJP_eRxsxzCY+}uR<*)0)#g7x)XzlOz^FR}T<Pnf(<
zv%7sCz1MD{|JFT%?*WDg!ZF&O0KF+1-<$v;!S+PxeM9I8!29n1R?D=vUu)&{|GriQ
z;Jt;OV2DeE-l70gf^P09{|;-O>jGdGU%bNUufM~>i*GUY=p}~VLa&QpZl{^H-nd2Z
z-6Z%3zbn@XIKlF0w!aqdYvz4TyH^Rl%cAKuUOY$epGEuG3+OttgQ2sxkybNIz@$OP
zB@spe0`Mx~Thxhw@^1JPwctcr8BD@XLc=YX0CI=Aojdeg{9r^N>7MX}s<i{;O)Ma#
zYXB)7eJB%tx|Y^ZC;TiOPaujyU~|Hmre_BWD+`!1wOCqP!PL?QvWi;xd>?`Lr8Iu?
zg%m!f@f}w;fdSvwF*FJeVX-h}YSXaygS@pr6x|b{9a#*^>~>gWcS1G10{d-}@R3Fk
zK2-F@=gI;2f@b%bY9KySBka^dxD3M=Dv@|+gblQdKp7nDLoFj#YfleqnwwD5Sc}%q
z7PR%WV`c3$Ze88Q`I$*9R2JdxSRZyKhOjcygR<OY#>+U|ySRq4Gs9S)97H9JF_bpt
zW@`gqXIDhi9IUw!>gwvEqM#UQv9a)Ru}8RvJ=QwvF;bL?^`TbuHJ2kQI)u?K12co;
ze6M+oa${+%EJa0CF|zYAP*z%kYDUEbnn)~7@8ro;czAljmOY6(qqGa{#@EUk&Z?@2
zr{&}s8luPE9-RjI=+V<bhmIQRG?mfmV1p4ichtxzVAj|YS3LdkFg6(vvh#4cyaM&<
z$p~^k0UrV+FDwFWjGN6TlaT4<j1qpgdOK&-8kwWqP#aygCfM+B#jcMp&U^XL_7d?w
z#s~3wybrVWWoRzSM{i9FCVIv(H8z8Bn&t5DFnas?(8ie8)Y^i&#zxv~H42JLa4IJc
zX*Ai`s91#1yaIgv;o<HMbM}hL8mf?#lZJ$(ln6a##_T*oI+XF+&W??VmTOJRvtSvR
zTR=rmAIDS(JVli^(9_hxA#Hs~7@9)P!Uk%NF3@!GfHtG8fqxJibp*`#f3=EFf^~AL
z2t0?(9MSTGXzR+~&!e~$ewEb-q8W$sbN?gs1mFpg_8*}aN0`OD1zrrz_6_s|+v^Zx
zZmd|HZ;F77qa7yp4-tSvv^{o!Q%lGnTS8nzKYUKrAgZhtxn2FJpPoV8)HIi~=-)ht
z)dx>;@tfabg<$ACe;M5uE~AaWD?7b{VpjRaCQ8;eP<3`2wF2}mY@_Sy6^z|~fEhw<
z;QDQh+<Az;YqY%^cL~1x7-EzheaMLW=qV;?d^5DYS%PnlRyI#iEf9Q*gx0Thn$Qz!
ziNO2E`t^SCG1l^H!cHjhF}{}8IQQZuW}Y+3(mW@hJQ0_|-x0vN^39KY{*Q#=YmD4|
z!g%`-9e3`d?bcni+z@Q<4fL95cLMZU3GTnwui)FE?OhUVZwK`kw^4U?lcslpW_%rO
z^B3WpTnPi01e#kWOoQ{_oZf(dvR(vMbQ69}jJ9R4icE!$dl2MI>=|LLp=|C5ZF_fU
z+B!qYl(CkbsQ^5AJtL@7K9uzgplxaf4;r3Na0pD9M$E|q3oCOPo+)FkC3FqVa70oS
zAATu`cR!cHdtb`p)5FS;)G&pyvo~DAX^7l5U89m<8x#japE#)dra?2J1V*XNFgw))
zz2r8?_!QwQlX!fh9)izRL-3_qD8A4L!{-{|_*^rR%P8I#gLf82MlsOQivHei401yp
z8lOT(?;vXG8fYCQ;(n%!{O3I0-NE(o5iDn=U?DjQ<LQaG(Ak7D9d%f3sls-99ePq?
z(Ha|$j1%^VVAM=twDmAHMP^J45@>pkHkQbym3i4)!_ia|c~Rad^0vqGxk2oXwIeIi
z2l25X7#!-ueg1$~&TL_GVFst?M){c<(a66sJ~D!wjBF$)B*V`y0D(av(A3m|9eb1$
z)MhiTI}voQ2D)&enI-7!U_2}slWr~;GB-hkiabgbr4fGk0D|}KMb5!v*rX9Y3X8&q
z{RvzO3CHEaA~a{G60D&}i;qKfej)m~Vc%dpdUgE@{&?d$=29}z#Gb9m$`qI4BXOVK
z@lj+9?#0IAIZgJ5?q)n6>&8TFG3pC*G1StH>HaB<4NYKtY?9w`7(IQxBJ>(sno&bz
zDyynS9{+assa&MeuwwXr!2!YW_wj?9t1C>*OrfZz3<+5&96fd%M~@zZhK2?zDk{YI
zD?rcO+#I&{cI-`UMd&Fqh96N<!CrYK?4#`+($d9Y9Ro-jnL(aWMBUL9TCSeZ@$!XX
z02^gEEsut2O$)Xqcpdn^77Xu1PQDmz-3m)Y==oRG{2h8>1fBprA=>_GdA~w0skiTc
z4?QvB@;+9K06+1z072dsUf(ys8bsRg1Tu!FaB6r43C(?ouI@nf)H23)cCh{82^MeP
z!1&HZtlYbg3xvbvAAiO<0%Bl?X0x%4%B3~rP0t~J{xk}g*N}6185P^xs3!CpE}lm#
z-=}@&GNbMlG@reQ!JGF9y}Pu%d+6i(;2qlEgC`gx_yp()wkO2g+2>zlmIgLQ@XZr^
zzt(~PKLKt6%vi$uKbH9TE3UIn)A)p1WE`G<^%^rTzsA&y7nmUUMhU$M+TJXUa)HlX
z`R=zg!{5^UzQrgbZa3lFPAhGrkqXdjW+eYR_z1XvgkGCaR|vjc{!jQl8wkA`0>AG3
zHkui4`?jy6d|(MSff+DzONOCmDvSb8!8MZ|TFn5$YkLt~-U6STGT4MCK+DMsDi)5A
z)-{KWo;g&^?4fFI3k6DmtiCa1bq%4YZv;(K3&^Oc!<6kZA|VkDlne{5o3it^w6=hi
zjTH<DMtM~oe0fM7|Lx<Wc;}<zcz=%qK0m4k87*@dJ9)!7BnG|-X$U-d3Vz9_;1Zt;
z-JnxY4=sU4Obv8WTA`iP4*8%e9I(m2XFAaYTm-(-h{P9~QTReDnia!}<9+dXXJVuu
zy&a8cYiU4ddk3orJskt+ZSBEOM;}_N8?ZGqkE?7r=j-dS7{wSE;)l7gPz=y&7NerE
z$%whZIJlXZfHRTNxK1;;z-sdgL=-nvFH-}Ax;fDNOc3Jh3=0DlIGd{>)88KB8PWJ*
zaRkp!k6@yy6brM%SUt0ZbK6^3TU|msO|ZVD9HaBY7@inJF-<HXGzLjDk+k$oc+jql
zj7?x;W`Xp$1O&T#Ai~=VaX~&v@pDI}vmM%eJkd@VjImdlv@l1Hp#hp{gej6TNZPX>
zi|(E{A0CEng85-tDQ+@uc2}1mJ~0Ljgxws!|Ki*NZtPycZ`nBi$G6|&>EaUVJUmfi
zsDn`_J6vNde3hDkXL-4JLNndYOT~0vGHNnnQC*UYp7t&b6L=#NQ|KEWLI2<&x@pyI
z9qni&^s4#4EumrN<QE__>lBg*^@y;@H_)T$IXXH*-@pKhDoT))k;1X#5;%0|FwD%%
zP|w#22RVdjYbErej0{$6T)OOGWi>RhUrqsE$thyrThlwpcq?sU2?fS}0eaeQUeNXS
z6QO4o5e-YmeA|>amgkuHCf1&y<+(80dK8z5`}2Z@eR%|RSQCNQ)Qkw0V0y7^LxSlg
z6L!Kry?6plu(w~KC)At$dT)RiB^=~vZ5I!8M6|Yw2RkC$8IgN?kiht#!p<Rmcns+S
z?Cc1@`lT}%y?h<#UcbQZ4`1WLH_x&5=pHtnJi^ZR-{b1fzsChe)j2|{pN1n?UM4%4
zQ~V8c7M75^MC;qwKsh_|s_jk2yo+eLxQnXQGpJhGL_K5i=-sF2+`UccG2Svtc3;1P
zQJUV!gNHOQ+8#}Bnz3_6fSv$5g6tn_L4cb8IRSca>(|GG=g+-lET-iNB>-)n&z<4@
zlZ4*b^A{LptR3Os86jZDh5dPa&N895OpBc-^u}qjUAG?)c=yqI^B(Js?KKf@&8(Jd
zw+K5z@G5T;fP&=-!28wo1n5;U-c~T?*KM3b-Qroq)8;Jw(_rA12z`%a==-O`Cb5L3
z*M^YFHUt({!8;=l_Tka6_78!Hn=kZec(QtCIIf`&MMDcH7??p)Qx9?i^z;m&##k%I
z_U9202>-Bfn6o1`XZka@wuGsL39RjGpiMJ8D4~Q8_sHO%K9j~jeJ+Fd_OiW7XhKHQ
z0=m|2Fm?8aonIu}qff#=B?sQAB`^suhLT?)j(g=oF02xoNgYs+X~i*@0(_<yFG6mg
zPCWMMCE_bCztT;_Ui}p8(>;lIW+r;kQdca3ucxC8jkQhaBlJePhR|8pir%J9%=Hdq
zhcRQFjeRpY0V_fN7^D@AIeTK-)f?kZt{CD*xfL3QHEuxb?!MRzioh(RQd6)$Vy#W!
zt|^ba5Ko*6@u2;w!plMf`Qh%^ZY;q!1lqTo^H?8j!<p3?%*;(-W_kjbE}TVcTP<PN
zgRYS-v~;$jtFZ&6S;ff7%td$vp%58?tlV6fG0KM1{7xk&qrQ}ORbGfwF(Fv0t3sU%
zBdL-q8WmO1qos{G#}k<I_D3&2cdxSxF6CrlDK!Dl20HOA|L(1cF*H?`p{%+BLnGrj
ze_<C7xDh>Iqkr`90e=4aC7x|D9t8!X+{6fzeE&;+LHOoW0bW*C;$mJVrW3=_mlA=x
zjCeFv7Nft5od`Ev;b6?@=m-Y-`q4umv@+T@(3YxdYf)TQhHN3)66Wy)Pk5+s<Ten#
z-oCJBMAy;Pg}jmiB%~#An9){}5mz{JTg_N3Se^htA=+9A4U0={b_UYyY4^#><12YZ
z?4{`)AoLFW1N7ARKhSpfhMtc<41+>p8WAHx@7H~KLbMeuPk^2fZ9PlM3B4)=)z*s#
zFob9;nBK3@`^SBHzaF`b{`DXRO8{D=06MV<zBYo7(2Hy#^jca(@P)I}iR^4cbaw}m
z2K$jZ%$Um*Qp7mhdubQzFJEAru)FyE3!LY2^TlIqzj}eIzxx?C{`g1i{{9cxAV>yj
zKP8LHNE;nT8atZ|cHEisi^yGBM$yJ9$~HGpeU`uD`HLuD-aum4AX57$P_c9tJy-9e
zgWwyu^B8>u%;3F;;z5nkhmSD+nAZ2~C8l4z#_ZPw8KLyQt9e@7uO-|Tp8IPpy!u`&
z@qX46p*PMbJ3{CU3(zB2hZwnsSrg9$xbl6z{(->#24jR?H(}g%`yrYc!-aYSzB>dS
zqpna_ZxMKJ_VEe(^L~Y1<HgIUySRhuOZ?9K?iH(BD4AMCOl3Emq6=W+o60s332pC0
zSjXohq=LU`J%7VW{@%httoQ`R%}BU}v2*kcfQqR#j;rg6B_Wm$<Ow|`LQhsh3wo51
z(3n`b`})F^ov{g}!^+kg=0d!+vV^jxKE6DxfOi>b|LF@k{PR}|`1mk8XL&<NshdMV
z*A~h~j?l99gqcT}7;mjYQlaXa3Tc-t9P`M5bWjCU<J+JS-w7$ta(roYlAuc@;F7T4
zfS@xtiM?W_5rXM>XSAandC5WOYAQo#b1hmLn?>8}?d(HkX%%|A2hiW%i_?9h*zF&|
z9X9eI4=+snhGID^9s@33s5LQT)U(8;w38zEPJ4Od49UJ07cIu&bQ5i)8E9ZVCl&YV
z%Fz+zMay$Wb!IG9TB>nld;k|l`>;JTip9wh%rHh@+1$d_%}p$f4PpDt3Wml8F*rJk
z+R6r0u-8e5jz?;0DheygkX4umQ*Q8~fq}HDVoVPYpr#}rWhWDGiB>Ta!_R5qfB_v-
z46!$A)6~U`iwB0>JTM#)il@^fn5!?v<@s^krd?iLTSEt<+5#=%+T~k#^57YM{O$(=
z?>-(qxsRW}{TkQj7tjzGf@<pds&626gCg+@JB;7c;4T#B6MB&tPL4)<MiN^2`Np_$
zFVR#M`-d<`;EmIS1_*`@Zs-m5b*QSULQ!!Eva(Mh<zyOS8Iwb4j{@|(J-tMW)6~?2
zoPr!Aq$I@sdGhiKh=_<letteIX|blJrm!Xa1k*F+^EC_%A)(A@D<g+5X?uH=Rk2?~
z8wYjtA#G~)#`I3OLC3=f`m}PR;4qj)#{L8Jvf!AND?-otZM5|&t%PqyHG&1`)zynf
zZG~tXN$3efRbmOSxHj4zffgqKkN?L=!Z4CB`@6{zj6y}V3YNzr@FMwI!5G7un-JE@
zYU6u$wIaTsF?p~TsY87znw`SnwJSLL!?)P__64?HKf}h$Cs=39J@?%=xbpko<H|3;
z;KFZzhZ%NK^&4kJOBAAV;?M|EXng6j^T=9UM9%VQloEQS>+7i5I*01@vj}hKKyWqT
z+rUot@;$U&x`p1G57A4wbP*`Sgx=5t;fU@dM$Z?Rdj5)$^cxXkGyhNm#Qt&p*W1&4
zu28=|CKl~a0Nw;+@+hG<MA!}Ty@v?Z0Rpd&(R$$FQ^wt|2|t4I@hc1q(7X5aE%fe*
z&}$Kmk5Tv99W)boO;>KBiAF3`GwbismK}KQg-a+ucMhdn+bCMvK;Fa>QrgB4UeW=#
zlrm_zMM2Lm5%!6Bh^^~FM&}?-Hnk#-NhBmIlQHuo0+UnW92x}!cAAQYW{}X(#W7Va
zNT_Ksmg+)|C8I7_UJ!zr25p=iVZ_vDZe<|?&(hWsy2i#hCaZ>z_R8Wngxx>yQNafX
zHSmR`9u6y-;IM)*4#*f{zpN3Cs9HhJ$OTH4o{+Qf#bLt`95PSDA;(Ne_?ALBrVW~j
zy^!~>#{tVz1YIf)5^{%((s9@{69-K)vEPVgkcoHNa$=Dc;*7rfVl-A1p^J^Or<)ei
z(~H8Q5|mZdps#lb^Fvd(I5LCn%37=>r(-294gDSgXmWHxfxbS*1N^ZV9E1(p&qiPn
z&d0>yQbsD4(h|_*=Yl$KC+t)e;QPLIY~*L6voHg7g<0sUEWuny3kK_}G272*T~~vp
zfdO2bnZvcoY2064!S&^3nr9~=T!#3l1hh7^p_Ut7w{TQ2F&<ul{>aYGMr2qRsu)ja
zCq^+pHHx{OPE65IdiXh~oqREC=ZQHFUo;t;VU&^f2LkGM=Qi-)zkZ07=|SwAU&l93
zX$@=ZSYZ^p%ZC2s)&sn}`w+i-{R%&Q^8(*}_Y8NhU&8j-80teJP-AP4-Lwom%qzxk
zJ9==Xx(YR3Zs-dO#CS>^22W+6EjtsVH4Qj7HjVAUVVow2=Lx<sLa(Q}1&uVdN?Kk~
zQ4ynUHj-0LB9aXyI5-3W{%;Rr5bCO`s*shHfrO+4jvYG=1qDSE6civXE>47=07l^m
zuZg)COxTO6>gnOAk`ngFzJcDC!oEHBzK3)TMCd76JG_COmoE(bgJDdwG>_tY#U+XR
z^L~Y%Z~#Myw(dn`@Gh%@AKSp+p-0<`5DsFnvy5(PK`a3lCxDL?!}TbBKfwaU2u#3-
zzqL8R?4tP`q2566&GQ84g|-l+?JbDvZbNKeC*u3M2(C_Kjt(+@u3_xX4J<r;h?xhs
zF~Rle`**SS@+Hpw_!Bn2Wt=258qaJajb<I$!oShkjW_{(!=p%@nnvo(G&1IAQFLY<
zg=}BtYiCfmc>$TjGw>;CfKyf}%9pm$N$_=Cy@#%w_t8#!>bv&<{rB$^I*%~+<T=6j
zQUsr1e7^$iA8U&BE8K+jf2>KqW|E*2U?<er%U85K0eUY9y*JhO=oz~2GhW|)MEE@;
z{GN-|_ux4??mj`wt%qnL^ji4&glfKi7fn|gbFbV&<K-Ky|0ne7&+nk-+(nd}q2Zl5
zi^9|ED41DA&hTkubk887q6-!gnb7o%f_Zo<!pj?x(cXvD##SU&RU^8v5K&B4Vd+_L
zi-?AqyB9RAY#}cUfzsB8thOFxDHReb>M*pnMRf8>*t@!l2QQ3FO<==FYiwrBBw`6A
zO>KN}NKUl8clW8`gF`y_=$J0PkT$@V1m73ObnwX`ZG3!47oSU-V6UPz_NrRrb0u4r
zC%(`O!(Pi2M%+@U#B@M2p$|$yEs$_5#Zikq95&8kW#WkWDIB)Q!9mkgIB1-McP4U@
z(VZTP`L;TAGSaoxHDIuR@XgW1fqqmrG@z!T39XE}JtZ}$qtQ$!rDG~433U$6NYc>2
zL{tRMP+@nY<1lOEgqzU`xR-GX7t+#krr;E+LcCC(6okAOZ;X~@Vz4wDH3VM;jcBy5
z7bCsB=<jGpLum;r7*7{#YH_i)0ShNHai_Hlk47d?9h-nqCuj6>18b_O!O5f~^zpS}
zk>SWXl|=)rL@*Uv82o#2=R970`y9#PjGsP)Pe>fbPXu7x#UD3o>+qD%8H$O<RmP{s
z7ti3&zxxI+zkZBMJLhP1_i^pq1#His#t-)&;)lC;@$-Xw_{;N0`17krc=`A$))vOl
zMMxC}hN8g938Qf_=<xGHotGyvo$L{3W`sgJYxG5hV=OlteQYG7g!GwS8ZG~)n>}4v
z=69ZD$1)&5uc89w#YN&!p3JN)B&8%HEIb?mG*@4KCl_aD7#kTuMOhiLGBS{oloStQ
zDAVG)2!3JUvIsmzT4A_~DI<ijr3I99bl6+c@Cdyx2)!>9m9a+{zC!59m|H`^#sO*~
z^xo{xGYN}=1r6FNK3RmGeR}pkK(CmlS4!wri_i<HsYfWm7smDyUf(D}FGheKjV<PF
zMe;u3vCy}Jm&K#J&2IoGMqvUemZm4zp#Vq$d;%0hTlk!Inqe1#)!Rj2^&plVm@u9%
zYjgyq3v*~azm3MNGlbv<x_5Rl`}nCi-fia5Q}l2HFPvVWY4$LBw;+bli|OtqK>Lv}
zGK|FWQKU>y()bo|YGDz@%j>9IIg7G|O}OP&!Y;LtAn!x##oLU8HwnBu=wN*8ynTnj
zxsQQIjJgEU=;Id{e<~Q*D@;Bo#NO7_3t|7<o0<}U#}d{j2s)vJ^>M;-oa;i3u|{9e
z;NAjn;K@t$^Syf>JVn>N$LJ*Vgk{(LCvR@wBk*oMAn@*^@wx!L2XAikb2busjWoUg
z9rVO#d+riy&R#${JB1QP+rou)<P&<?ql?HKm_t(QC_GP9L&rB3CLu`(F04Unb2pOf
z_}i9OBQ(Dd!Av8*iEJZG2iolPl_(K%dPY#tH-eIZF%%hZ<urBR8y+D>TSIe3S++-E
z^nq~j!ramv1}3yTIVF6sM-uPtRl<iy^zgBS5k8SJz$XOW`v(cZ0~+{XpDsQ+V1$p4
znDc&Xd?aIq52Oh`VJx0<AigqAfTU*`RHHkf9@`7WfELKQ)j-yv9Fmp=IAV5+0L;Y^
zt9%@`WSQsVohyB<INeZ=x%MW^5F*Xhb!cM)Y-wpiLvuYEJ6lo5^`?eq%ybW8v#keb
z*xPsc`XfhI7fEW$XmN4G<+M~>i%Y<HpCFv|4#v&YQ#hBEgW0?swB@AXWLyx^QzMWP
z>4)mPbPRU2F+#SWnt&@VDHCILdqpK`<6<$JorTTxbZoPi*yTodA*T?NDe0&T48~<P
z*oo!_bT!mrc4`uXBZHVfJ&(xH5M-vLU~PGc@FMiS{~D8wM4cJg7>-ZHbVL$vR5#)2
z!YuyBlY9728r`>-cktrzUEF?j3ojX2FJHZkTi34Q_J#9UA(;O5;4bSH{(A2k{&?de
zes}94u5Zp_W~c+DX=(5^w}jd8!|+s7fG__K0h;RY)z(A;t*qvRGsYq!G0vzukyn73
z${H+lqrO06IX}>i)9r0E+%63BdpDJqqBJiTd8e|Go}MO-I}GCY@uNL^d3lKkF%0zd
zp+vKjm6C?!aS0qgd<6RX1{h`J7akXest9E!4EVIMh5>C)o^kkq94${qR)pT?Z=rXX
zjZ?<L=Kl|Rq5lu)iKdtRYrHLjYe6YION4k^!x&hHkm_25*3==K;EQ6%7|k{!7Qa&@
zZwvPe&j}WaosD?b27xEQEtZKQUKqs82rN`QBXNung=v9dY!{(?Jd_<_WKTC?)kpBX
z8Tuj&c1{+@@r@#FWCYpclc-wVz#ySCOIQpt>ea4o;S{YpzJr}2p%+1@M)3V3gaauA
zX#B_!lBXt+G&z9`#@*7@O%%<op>%Eo&RON~$ghP{b_L3o&M^*N6ZemG+-7{dag(v~
zE_(0Z!yx1H@WVIY8{^*<;P)#4$A$U_sEJ_X?a^mn|Gh>CH*qNzfydVl@qPXgc-@3t
z2Z7gq=ONlyt#=-X)pGj*>aX5G?d4mj*}aJdt~U{Wjn`;;1YZ-a^bPcG5PB@Z@^;=@
z9)VZ2c@7nf^To8h{MlvJ3jdFb$R1ln-q<oy+Q$%5+6@2PY9ur2=XQ@GxxNFTg%xm1
z&4L5lh%u9rmW>_c42&SHL%Y*6fV>`Ir=!nks|N#XJ47X=(DK}1!W1Gv&)U`+rnEd^
zsmivw@0bkU`$~#%Q^!Zg2{vgnd?;yvcaLb|pY|%_H+$9b{$722c!2jGvc`wU?eT$(
zJwBFq#vVmKe4!MCJ%)sydkK^x+MpEP0eQb>D0ntN#k~Ovj+KzKDZ(+U0vr>TLRlB%
zo%6FJnCxssdtDW#$Hvjs-i=1~&TXyK+RP*rR~4aSpc5V4ouZXpoS48$c{v9Be9%j)
zskbmj=Ls7uh6Z3aIT6<xjn0R~;#_hj7BdJv_U;0pgu%+yRh1|?m4{P=S#i!O46-pd
zHn*X^z6IUQ?dU11Kub&nRx(p?E+qz67!O~R7U6nECeHFTTU@>#?8P;Lc%#1`wKT&4
zVV~cXi)ia;Wh7m|)W9HGs%vob)>S-ucn8yc-MGK8ju(uqf4y@XFA1*~cdp?1wQIP3
z{sLZp|1IvndVz0#_yNzJKEcaJ_wfAcMXWYg<8PNY@t55-{7le29O=L>m)3E0VFXPj
zrx51l4P$i;XvrRfnTj-QmF3{7sscZCEu1p9#auun&c~%;J~kN(S$S9}t-uV8Zlj|C
z%Wbuotf|H{;V?{FYAh^55hHL;W;#xuOhz=#Q8>fI%hw04Zf>w;WYyKyfufu|WF(~^
z$!IHaOoFg;5=~Etw!%;q0ea>F{0Kc=_M|duYS=3+&A1>VLhmyLB{AL}VQ>78(9`x5
zhO30aG&~xXF$u6vNcsCdJqLDNj^feVLfW1%T%|&cxPgRtP-QiOt7;HN@I}zfBkLN)
z64t}{y#yc%j|Z^?=>69K<~AgDbs(9hozmTfWPXQ4M*cYAU<bh$&I%(CL)$xPd%d*1
z0mKT`-H+H#q52TjL3`t42|a_zo|r|$*^3yU`LtcQg5tSlqz{fGfzJ!6Z$cp98%l!=
zYimV#C+)Gn5AkCotTCibG3G99Ab)xVMKkLNsvvBuyWmFfr4G*0js#n}@#d%~q1S%r
z7CPCE`tLu&Affat_<n`os89mX-a<{NVXhB7d4a*l&;P3!kA)f%_TTX}gM9tKqZjBG
z<Es#BpU~(Yi_mK&<XUduN7Jo)Xdv7hZr(%H?hTY~Uq;d9MU<Vpj9S7^fL^0$dxGf^
zhP&6%DAdl?H~aFoFTMrdmU!T%WPTL|vr8yiSVQ6b8X<oc4QDQ+YH=IIqsvIB?ZwH4
zen#7V1Q*ocMA9i32FF6(%?C0jW>7FTh7^G(p`{~2PfA-Cl5GD1^jrgi;TswbV=HU%
zh^?uGnK(?v%-k3zW+spllE(oFe6UvzA05`g`$vuN-f?5RdrS}i?SLBoX`d>gr$Oi$
z()4I|hwSm*F(<q)a{`|zcwnDm0QRYbV!u%mj-4okw0}Kh0-GV{+XO|gCTMuKK*OU6
zO3pQqq4i1Fl|aJ26q5F3cxSx52IFltXsam3ARFEI;0XE{1$w*N(bi5-HC3Uxx0T@Q
zL4Q9ZS$_|$r4*B~5m<{3!%BD%He$jt=y3vro^CiD8HFn}f}M&6oT+ceXj2=y7-8C3
zwKWYG=ovy`P5~-QN(tC@6c&~sBP|=#!{Zp|Mln~EkBh~*I1?L+o%nFv&q~G3lq761
z?p@Ewz-!v){njR2W&~g2hOkTsuPrU$%H>Nqv$l#=_S{p$Be;I!GVVRRiLv2sTs^yi
zU!J|dFE3u;#lu^8_3S=gKYoITckbiqt5<mN@+BTUdybdSp5cdQk8o{a60?OF`0er-
z{(NZx&*!=^l^%_67shdOW(2u0VQ{jtfTFB4v=pUbq$~qVWqEjLX(CWV2kE+|nDz?6
zjpS3<ic7_M+9@n#=V3G}6Q`?7FjjPmMSCSQhAXSkn3s<N8eUe~N%8#Ti0DZ81qQ&=
z+e;jZV$H~^t*Hfh##AW@NgO|V9I~?V@b?cyC0{Qbz!0D(3;?$x@Jt9kEkofThBCh7
z^FNo8!RN9xJq2a#Q`g3E15?Ob*g?_S32Kh+(DCqxfqxiGLt_ZNBv{joZId$yy=*vS
z=ECVzK5dT?H@65Lg=O$AsXzeZdZ2Jt1}j7WAI&eEzpqeX)%Aa0h6qtuc#JTNVF#Dc
z-i0K_%@kVONm^VQV}5E6?X9c(zo17jhSUDSX~N;1J%}RgqD0UUfbD&V>F6i$xa9r9
zQW(TsG{20db35V?jJ(Ntqz#M_to?)@jYY6h8fGXhGFXVtU7d(zTaKqKCesu%rxuVi
zwL}PP;biYLqM8O7ZwF|eBWS&J2hEoVkPDa5e*FeIZd^yt?OPbQdmlpt(g@)+Mo^8i
z#t5zvp$NEPp%_=+?4u*h2)ceo;(kI>*jLy4_!)YH0~NggO}#+xqvz=5<K4785qPY&
zyARP!;58C<^*8Ult-C0_a22`h=aI2^2Dz)-sJL()byx17K_~%$LhaD@E?q-CKYK0z
zH&t(AZQ0r;N*7j8#?GO1eg&1wo3!O^G@ac=_s(s!5PUg()9}x!gjZTIJWdwEDm)o_
zKH*S2;ltQy4+#T9D6nId($U3HH4R8G&K^_Mgru4_G)*iJ6deca6D}~bA@I1Zi2Iod
zJ6k(jsIe^{JT8k*_etU7{Ys3kI(YYxE`Gy$m+Swp{e<3r4gB*yZM?VNn3iXQ_m4Qf
zfu0nhC+~r;<O6U(JqkxmPU4tdE+pN`AmdX51)l~ec{M=OuN~U{ozU=UgMw=vqzFDK
z`*O%RR^Xkr)=G@!Ct)~04O4ZM7-|x%s|^Eej4f@=XzA%dBQ39}wiY!_O&FzVo$hJF
zbZ#1!Q)00h7mdZBKwRPmU1)BK)~E=qveAx|)nKW61arM(80{Lu=)fqgq#Hv+G%?2I
zn(9i%+g6n17a=P(gJ2{atE$9Pz_yBuae)xH7$1WhsmZu`G6}a&p2S1K;Y?CI&g7oL
z{h@I@nOwqVWfLCFE#cbsCPv4Hv3>3WRyQ_ifPL6Hw~V{j&*Rax%XqkZ2Y>$N7d*dv
z4=*3y!w=7&<4-^S7Qg@QCp>4PTi@8iC7RziY|u@KahRoT{=77fU#_p<?m`z%7o_34
zxk-Gzwv0l4=M(nUu(Pp%y`C;C7$F^%l;B14^HWzxmW34tz5TJ1oQA8619L%P*vQDj
zU{n<2ZZZZ4wn2WM(cC-?(()R!vr)j?X^Dx5W4sTKjDRO&ts5=R*~tlJ>`^t<)S)0R
z|7K7!El-9JOJ{UXVuUa?HG2a*ma&;BEbZ-}rmrXN&-;RA_=U6_K9k|PyebZ<>qA1<
z42ousP_l4_y1gf~T?3%!6-L{Ofq6tSY~nIulb8j&lR0om%Y|b)BW`9M+!=9w3QOTv
zQVxGwc|dvP-)%3nit+Vrg;h5oqNWjHl>}c!JtC`{5Zlm!#Ad>-tq*CP1IQq#GW&<c
zO79ya^cY{o;VW;)>k)cFMbP#lX?0N@gdZ!it(*6E^M0YaxZWv3Qi!}Y>l<j=BJ?I^
zkTE!hgbtcsZ8HM+c>-vkL5$oXg7GmnhX|*?jgBC8a2Tn4e)_}$@)>Wl$5si;8DtKv
zB71n1U^s_?E3~b%moa?(4kqv2!`$OXnBa2i!6S?_c23jG#_v7F(CtT<WK`|H|ClkA
zKzs5Oy<!RL1RJ3y9E9k)PpA=i?T?<KjWN0XAB(STyZ1<}R#p=`W&w0{H*WuZS$*{;
ziqBm^*2*@LW>%3hyN=woOQ^hX3-!B<xU9PK*U-SKKQHXZ+eOXx4l1@TqHuK!xddJw
z?YL}q5f$t#S~f4BcKHnISI?sM^d`!<T?H0a!!9Nb`o7`N^$LcTn;(?z-66-0QpU_0
zk_P&a)+g}T=8tP=Lq^*G2jx_lE_C7K=?CACFcEmhmgcY`=mhAQnOnlh*c>wQ>iGPC
zEIv7)C_?Z3y^8qF9wod>yL<nT20o(gy?a;>|9r?0|9sdC?;W)!@a*tD<L#$3y|0vf
z@TE#P4(X;q(lis&mRXRt$%X=rPua5)s=f_S^lpM$U^g^^`=R331x0rn;E5WjI#uDF
zZ&yZex3?0@W$Bo!D#T1j8wNW%G1%SnX0S3rHO$_yXJi67MWraDiEYh{;e2x~MuPk?
z;_Ql5Zq(<wQO{*%VV<#gu&o7U+(0J!$Hh~x`dj-k%YX9Z<RqqM7?0OZi_3-iIm|Lr
zPYw)Yc6<W;gz98XEzU6#->Ry{W5(#4*%`Q*nTF>jCAgZFh8Awv?Jlm^;`=Y1%*Lc|
z7`|l#|Bi8Xj)vC7=rTAqfS!R$#@ki=&wqJL(4E71_HK`FJ;9%U`Yj#`5WKpJ*Z1$?
zuiyQE%V*Ew#p_oz!+Y4-SivT(_P1wN@SpcC<L5``@pyL;XK2K?$A@rkxF6YZQR3-x
zg#|e%JDG+cQ%g81s=!A}8$o(HNHH}=pRX@gBciY!5ra7|Uz|xu#8hM?rsCo;921N7
zurPG;zE;AnjIhg$jYC3oj5q`(kkM8+*H$<S!`aah#s&sZQ&xc-;d<owah3!WmDNy8
z_yh%oFxCnuZ(E25F@(k?jE52qVjSf2Kjm{jmy*Tj(u(*}P7R0E^w~R`L(#+$$`&qA
zxAlUyb0G9Q!(kwtNFJI3%jgVP#}RtTjJBzHa7xdEGoj~^QwX1e68Nzl_zSbF5PX8^
z1y|J2=m<PkSXI3UyU=n*+tONumewJX9aCH#<7`t8(pm?Q)ir|5o)KjAjUtOimeDs%
z^Xn({x(K~?5qfWm0A$P+_s0>4LWQ@!sn8b2-<CH~IF#)sh9E1LnL*9!I!fmjk;P8&
z4fI;zNAUU!;G=B@HZ~%txfvntd~X_NRBxX+l;z~;3{pqukv+MNll_YX=uH$(okR8M
zD>(PVzv0GT{y(_>=l{U@AOC<`|MG8m{(k}Y{^NgP>xW+m!WS67`y7)Gzs8UNKbl<M
zqbD@E$LM|{l(0^-qYZZ7drpgchE~F@nb2&$N0WP7O?MxlnI(X(@iwC@tB#<nxqeG5
zVZD~ntGRXym9)5=)$@oSKaJRt1*A-^A%FD}s<s(tFWf{Ef!9nh3ieySbph2h;8FrF
zhmkguCM<Le)r%`ApFNEd-maweW%KvQ85l=OOD~*~GGP)N4K0rVXuA49#laN{Rt_TY
zq>arWWuS*+TIx8$PF_+|mmRDcjwxtBP0tj*!4Yuw@Dj)2nK9A|gOJUI18nBj(9$u+
z!J`WJaE~mbts*|yqk#9nlE=GW%D*|FaYz#%A2Sf+?r)Bm;-8LM;N9aiJxP0fB<+aL
z<y=JQeXbIL1G*<6B|y(C12X36kSF+5XnbnEjZpM$fuesG)CfK8&_U=2^+3<N8HOG;
zc;~ytF8r|2jx%+cj9Xcl>}<k7dkcEmFlN{b&d|{MT4-o}<LF?tsASJR-_wgrl{7hi
zxfw@KYy?E&Y-%R%4-Vq$>LS(&wN5suuHHcm^pDXRM={Vhgqi7SanSMP)F@^aXL0W0
zdECE#6ZbD)!`<DhxVN)|2j{l2GuV$yRh770SRk6+b;9LwT@^Ny5;5fHinCF1*hx>t
z^{g~J%3zh3;P*FoaB+JDvkMcLm~O|_-Ddpt_fz;^|Gti2eq5ztHlnw;3s*00VvBIR
zxwVRgzD@$|BJNzdh?VoJxc>YWZa%t(7f)~FcTex)=O?#u@9qWMy1k9l6O8pk?da*M
zLq>KI3d(7n8>`q|SwmTTGW^&Of*3QyOpOr729c+$gElh@EVz1L(ajUf!6BF-%*KO*
zF-)*E1_q*r;42Yki3tiuOh6#Qf`Z`BrI&ETRsi0K6L7G#g+2kMB(K0oD=R|pFl|p$
zTNm{-7khh0S{@q}<E;Qap>Y`zfN~ld*e5NGPx#!=BxUfKlyFwY8|X=Dn?c^lp0;-a
zYBn@ICz_sn81%iPU>uYv4pp#<$%I|f8|a-#&xdPf0X$9>!8@<yzra^qh5$z0pt5R&
z(CkCYYelmYD!iPQN8p8*)FY~#rdQR9gxU_IH1;61rJuGpi1hAZ0&fKAz5fM0VTegM
z(?%HEUs$uSY(Z#q>)RsW-d1o^D}oqVh1eP*p8wl`WTumnxdno619{^!NbVj)SY4|K
zJzoOZucjV;b#?HsuV>5^4wf)ZvvW^iJ5L-OMau9bvS>MJBd1Yw<_2b;{|Rf~|2yWN
z|Ahv57gftwP`_~v<M+SA@SSfka`!u2`5SQa-+<F!|B<)9M&GT+7-m%Md-w?54<DfG
z(F1fodW0@oT-W_)=(_s?9k-vM<<1+66X5obCG4~N)#yaP5q4GAZi-dSb)f|4RbRS}
z(leKEYW@r|XV#IwL_lv}L;IC`=)3U{?H8}1gO=C6eUT=24)yDss9ah{!So`sN2ZW7
zGKu1;dE|1L-Xr`LL$vXB(Z+r9N?{Tb16`kBXuA1A?SuyuY#bqLZX-@dCTVDlW4hWn
zqA9c|HAraa;DD?$j>@aShR+QMjT9#pFemgZtc4kWgm!5SLlaBLD{0}=uO#r^7t(l-
zpnLx-1uhlv8`gU?y^oLRvAvq%6DdoYofZE1s1>1ShYzHj@QIu&zEJSMmrB0)Qk~G#
zPllvfx>ypXX^^(cfs#ul)P38a?Ar+i?+&Q>_rfG<1a`4QaE$DPb7&*pnJ$RJg|<_e
zC?`;AvI)F8^mevlu(Ja_+^7bc+R;(dg!aZx^s|9>bkh9V+HkXX0J{ZcxR87btI;V~
zOv}VhUoTb&w5j<?bPf%mpN2Rp8s0okFRfzhEa7o_0atHa!Go7iaPP@OynOr=KR){!
z->^|UzjYhWZV-Tskh^WIjIlYGkBh;Xj8u$7hN9cs3+rLAxJ&EUijBe!<Mqq@Q@9~O
zacvQk!#x-t>BHDq6P`aD#DD&64!{4Q54UbKqpB_)Wdv1MV+Bf6qLJn2fszn^OjH!(
z-kH-F>}|l@(g@D&tl{pxD|mGO23|gYgnQ5K;r7!zSYDgM$XE~KemzR-iqJLGgL@C|
z;V~O#D>wcyXBPz7SR>rp9C5}5$kWq9GaJV;fw$!ofK@+#%=`IafH3P~bZrg|MJYj;
z=It%M)|)Zb&Dj~wv{+#}E@6<dZ~()a0MjPG<Yi>U8CBju?-)#JcXjoRjJAvqY*2#f
zy*UMqjf()3R#nAbDJgt>^cX&skp3^|sWRSbnL^gUmf&-QiiI0A9sHo<8UlT<NErFY
z!#wOHtYflZpOg!ij3T&Y6~R5b7+yI99ybUdg3mXf;42{b3d;~!T!~OZFRYA^D`%Xo
zXhdXrBO*%c5g|aYlqOc*g4hazuet+C_1#FM^(8avrZDQB>>5N;XCGn(izCcJ8=6^+
zqK!f|BbX&D#dSUwBox;J2)ZDqf?zHKYikfe<2%LVP{$3tWNwKOcN~#To$#l53efYZ
zs)cV&4SZ{B#n%ZSjqIddGR?%&_|nFv2)-%g5hP<ze!}Wc|B3M@e?;Z_J){nwMbY97
zl&#!B$<l3PjPDSTmyt1g5fv+UvGU!&WBd322L|sEe8PkP_aCC`!F_bFjkF8P`;Wx2
zbse{#qx}}4CycSXCG4vs?A}(x4FZo5x{g(IT>#!~R1tEO1m3Sze)R^buG~Pw<y&aH
zbQ87buA%(QWt4ARM(y?uwC@VD#5_d*)q5Dcat8xD*U@`^2kiu3!}117r{^J@&X>(}
zBp7yLTQ_1F+Ywk=jrgW6gjY4fCO#FqexcCzqUE{K@+cZ|R(9f4WKxW$$Mg(vK%Hi%
zqm4r<YB-{-j$=ydkWtk_5R;9wyQkRs3Nxt)!&8JKw<f~$QM$%BEC9PeM8BMJ_W5zV
z`<V>Er%322;XT&7U#Z}|{Tl3~jqrtxHNKE{z{fHUcu&F}zd7!J_obZ0eR*Gs(bgY(
zwIgxRFcHU1Qz0QhPq01f0z$7AY69&1dZFx1)AQ+sdE_8Gl1C7nHh}2#ZoCs|uY<Gz
zOQeK3p|Ug;H4Wux?P?Rpz%{a0>>=R#y1LNJp0T~V3%x^q7#Z$ES4{<Kk`u91U4z-8
z5)2jOqqm|IBZR@&+yr_?hcUp&x^Q|G%PSk0Sy;sM;(|D>(nUh=;_a)r{@@OtKYxzD
z{QL*}^~c}gx6fbV^~3vkd6n^#QRONd@tK@lETyNR%F`9?K>@f~SdNFq)i|4;j@|q`
z+$b!?V_L$6sVPj3j_~i(OnV5-^)B4LIf%2{wP<RNB_Q%plAnp3xKM<c>LFHJ1G&bA
zsIaxdQceaQOwv*(2Jqm_GS1FT;r5N|c+AiC{M+yF?DaReaPcDM=cWnNehiKFV&lSD
z+<o#08*A&xPECV{lcPA1LWsQ;;w(&1V{eB}{?MmSxMSPP4_m$g*kTl&bah83!B*oR
zfJ_1}+`$2E<`%G_C0Y<xri7PJ!oE3UV`CT_8A4q}7@sE%X(>q@IeJtCpPRdfIC$LB
z(wde>;IWYjrYFo=p=V+O2_+?bB_Sc2o&dd1CFMov9a7Q5aZM9Q>smub-wsNqjJP&F
z&~gfbu6qQG{S#mjnhLwPQ#f&wc9&HO@0@a08GLff2)lCl<(I*q9iD%FDFO>C5L!}=
z2*NI^vJo+r&0<BAHi*EBDCYeoyuXb1m$e|Sq7CuYortb(Lo_3AY-1PVTY3=NO!%;a
z6@VAa7%Cif4Hk;<6Do*+4-(>OEdf`@;%&j&{Ay}>yAA<0)d(cegh>v{PA`i)>a%Dx
zQA{ZT)s66}sD)QWHM}dU;LA4a&({SAN0^1Nf$U&~BfCjMLx^EIX}fkGQ?GtO&H0Cn
zgV#{F@&M`6H*j*|I?|?YA#3&lBk^sdP27i1^=Y`5OrU7~7UsYH?^yluf1sZcwnv!n
z?%rLr-@SvjyZ6v~_aRzuKSt|K0`JCCG~M{Gadr*Y1=zi*+G}@EeT}fYO4zXk$O%<;
zg=WV@RKlvbd>yTPUB~qYXxY7o>a#acx^@*6XBcZQ&?+xKL;sbhgx(_zT)Kmv?JH>8
z*hU@WZ1MOMvigURM!QRB>p_SxXJt+iy!czjH+Qm=%Y#91B(!~lpyBQVRkmM6TSvx6
zD@dDI(DF=hSW5@{RMl{lY2%=ZaGIkcjw`9b!rloX5mB(Ru@mRZGq)h{EX>3<X+T4i
zQP#%i2W0TUmr{hDETN}F=n-~%RPpW}b-ce{htbv)pGn(dk20-J(G4HTxZquh6NH{C
zK2`9>SE_;dN-Y@s^`daVAQneWk|1eL;92ED&aM=S&J9rU?1ZNOAdRmNS_1S!yWpGL
zkCR2?D6XEsJFbRmu+@`?qqPoFQlgMkl#g;YqQ=%Xaq6SK{vLD?g6%zwcEi2sXY~wp
zqPDRTMfq7+85zRrzyMYT2T)U(hsLH_jLl47eq{+Q9i5n(oW=Imd2DWN<I0U2*b(OC
zyK@~6zj=YLfBFtDzkP-836`I}{szB)^$mV__7txOoQH(uoxVZruy;ORQHkE<1oRQI
zyH#~~&L3=-8^Yt!aXcNJ!Ncj(SRb3jEX{IebQT5KdFX7bK}SakN=jppl;DZN%tT~_
z1R~1Z5J`HPDALnKDGjQ@&;Z@`HaMG>jOEl6JYmCsd1ez&*--C3e2hnrUgFu4Z*b$v
zEv#{4TAZ20`s!&ub|2U7+(uh_JN*27VaNAyb+Ut>gEfMT^^s|0gnC-ow38dw*mzdF
ze6bo3jA<`#w7a<@%jpEd?d;)UW)3?;BbXT&!hoftuMcfnq88(^x`sNmG}WObOd2XB
z0Z9pAW|gCmkdze9*hoxDfgRsN*r#V>V=K-;YeH~p8W`Y+oE$zU@IF2&Ku<=5-j}i}
zgq|*rX&6IN%M#K$g6*B4?Ri4OJ^;FI;V|(_fMr-3920WkmPVt?sUqB}S;Dds0R>g8
zN(2{Gv1oX2ffrNVjJO&CjTKkjDnc*3qydoxo=_3RjR-H|eXIzYUkEF-f)Tcg(5Y=j
zcwHN#XbXWyzzJac6>e`UfYCIN6;RE1S|h+tFt-{JVt&=tEW)g^LOc&Rb7TyKvx`U{
znm|-5p~s}+U0wsvvPyWBSHQQ5--Xcgt*RzWX{P)Q!&;kZd|gOqG8udJJ!&u9MRe~H
z!aG-yvv?o*E6<R-`~n4QuaUR%8l{`RL+rq9_|&Y!tz-snMWd)*e}waY`v1}re?Zsm
zhqSu81l?^K-5oRvR(I<WZSDyn_ZamAYaIbsdzD7VwpGolyiA+BLYuox$n9Q7$u1!$
z6d_m4rBK4U0K--Sv-9R-v|N3J%I!M@^G)QRzJ;QtyR^}J1V7*F^i@>OUqJcn8I(?)
zM&bB04Q~)98R;V%S`fkL@19%2_?Jld@&8O)Hw=x2Za^rsy!@f+;sGW56SO>AND1@g
z36q2B8RC$JHcd|x`&BjAo;0vmMjo=f9Tpvj6RvLJAbD|uP?j)+%9I_mmVr4ANvY!f
zFBxsWl)-O4m&3cCE7I~*XnAV*%^pp>cR&vx95Kbm64uzG<c7VfUiebk8z0NM<0DxQ
ze6H+|eVSp|rxlJv2GKZV7=vS`N#cPFIom=gI95T?r5VZ|ozMyxhL-;TbQp8ZgWBMi
z)PwBuDKxa7#yf%hP|k+Ba5OVU05{0Q_>(wwsu-oEHK?cl7t-EJ>&ns6+lrRXW^{G7
zp|hhE1ET|IZEwcH_!!nECUI?Z6SKp^7#khJ;?e@vHr6pRGKz`eaV*cC#_emj@Pa1y
z;K^g$es~Yx|L!Ne{NXj8fBzbP`j@}pcR&7!AD+L!w+|lT^|c##b!Hn688@yrb>M1C
z2j<JmFkFy_1vc(W{NZ+{rg3Rz9@iJwu{*zlt@(KjGpY<V_aQanBr=mSk(ZT&gxGKd
zdpIMMKU}7*1IkQI&>|dYARxLKKihP4&`&cu>mP_YHxDdR&wt<s^Sj57ap(F?+}pW@
zCs!Zf&dyD2uWn*%brn~4F5>CK2iV|xdRiLn9qnP`Xv?JyoNdhEVW@{_f~(HS8I!(&
znDq3;psNSE7-L&pTu{L18fIn&H)9iaIQlSW8R;89M@JXxTG~)0j8zyPg!60_S#q*c
zkdi!(<HwGQ(32(5(las<#CU9LYbWm0`xSa-jJm3PpZ!v_yCcW&(UIf$<hV3Gk&qRk
zcUVOS$J7lWMd(Rs+dx*|5sId6P_t#kbq*!;;$a=02B-L3c&3#iAg=~N1+=L`M#7?6
z1Q)-lkm4EwkCC>tRy4fWnpP2dv4X)>5@dqul?oA8C<3mC(8?n`b1LDVTg6Dsjf{^4
zmNA}I5`u(ga7{D8)<{SRM_UOvmcLL0oG|RfpB<oICF3c}n<YTciyfj@g#a=EXl3vt
z&|=z!Q`RPt%=jiiFPJbDOwY6Qzn~|Wp;(N?k+i%BCYtEZ9yDFLj>e03aB^&ebskyM
zH<2}SkA`;-r%pdZ;rcg(-`B`qe1*KFZ;-d}68UpaaccY)@+PmN=jwOZ`osT;j%$xZ
z8*9ArW|&DWmo+!;F|OW6H4Uxm>Ky`(CP&bf?Fx{it+7f7yP}<IVijJxih_$*#O1G5
zLLgQWl+A)EUVVs)^Y@T_`Z^NFFC%7f7m+=e5Z1np`2I^s=-oy_&k7Q{W)a^tiiGX~
zamE|r2zemmzAqD#HMbMHlcx~Q|Gj-mCUpEmq3a(aTAq@lGZeVZ$XVJzQUIR5A&$}R
z4iRt%)HJbIxCDVB+mxa*EF7E=85;*PcDgp!OdT{lalkx*XJ}$2P6hw@VPW5$6iXKG
zexX3qQzq~P%hSNS`*rc&AtOT13?Ck|#2!Ug!qpdtGz0LJvJbvstlg^~gu{B#IHDhe
z<0f&8wh1)6G|1cMLD5ODylN=AHAB_A8#=)w&<^NlOz(hsa2q^g+mV<%i1PYbyc23<
z25$pnIFaC{>bmeg;e)8~BxGh3pn_JFU66~M;v7`em5Gp>Vr1><>LCEzP+ebxiIEY^
z(YQv~VCKgraAA8J8|!PBCxk`@hs2|%7YK{Bxzo6OVF#~Yzs94dkMYfSukjPX_w%2A
zkKg|CGw$5JiRTaQ<97t(_iQA8U@!P$X#+R=M{%`p2p4FR56-ON;e{<+*<8i?@)Aae
zN6}g<46JTIPg5hI*NK7VUX-MlAkZTip?;xAjEzCCn;Sw+Y>;lixXB-N%i9kNPA+Jt
zruPyyD}>Q5;WVgggnlz?+++-TwRHxU=7jTqX?3U1;?~wBTse0Mx31s7>u1mK;^u8k
zclYw|c*2}f)Y`#@8qcM<G2G2f5kiob^07uwA5^kIl-fHWhdp4HqXR#S5!?+8VWX=D
z6GF~_pF@YGuAv1b#s)<d74bS4lG2ipX30nkmUj$Cj~o`Er>Lle+`N2vdHaYlLYQGi
zJea{aZAJ4_*4DwE<C6I3&=Gua=qMwD6m3ryU&$(q&^u1(NozB%2+-5Bhk}U<RIR+B
zeIgj9esQo3Psa%ndi*~URKbOH2x0{n*CCWJ3oC2jdJRHK-W;8csAyzdZ9xpd7bTcm
zIiqM1VJSkgp3iIGI_)fvHkMlnU%qDm?a!YTK<EYV`GHkU2&`tkg`U4~(BZAo`Bf8k
z!u~h`bo_06DhRuB8Xlh`n4U*j8J86_xl-O=j?kuNqzsI3gC9aPBV1@h8-dvXpK`vo
zoY18i3RW0Y&xlL#iMH3;iMZ||Og#RcV3@(lp-p6t?IL^fCUR%*qj=>xP3kd0_ZY?N
zFIcaTzw{hAvkwWAiwLPX4Zo6cr1YG@?6be3{qj>ZUlGonxQm+WcTh#>m2+8sgC@ph
z8NpRb=#>z9MTDJDg#=zcVJB39uq5R22)+D^TwdT(7<j!y)4R;~yLcNVXBf9<FClX9
zEW&!vA!_IvVuo%as_!ad`gRf3y^V01O-S<uf}8pg!l)lakqNA4CnQ>48BC&6Xng$t
z(AW(^qM;uY0Sym7(ef1R97W5MW4k&|;2mQ7JglyPgR1Hx@D8YIVvno>q<CLwY%Dx{
zykW*|!HiO2WoZs;cEFZa_R!KZ$6;x8e7H}TSU^skT;Sa=1<ND!_Nd{#{n~i{kOAI5
zVnX0q-~&SMOL-?8)$)aueh3b0((qIXK&=oQH;93xaXjO85@an?p=fssDkqAd<XlF}
ztA>hK3p4_Hp&vd9t-xMFyaT4e?Qn{0ML=Q)(h5fLPMV21e3VpRLq#=LRD+eS8Jz9h
z5f&PYyu4yGHP)l4p&G4?b?6~fx;p#OPZ0Idwu*`hk)M-;@xft?_6%T*a2gvN!Sdn~
zR+m?>wzP~(XST4lAdI8CfZLa^;=6CY#g9M!fZzZ2C;a&BYkc?WC4T<lJ3P2|8&B@t
z#gC7l;kWl5;|Dg1AK4(DZZH;4&0>{Ndv|Fb>+|EdaPbVLRu)mxz!=`#h_Rsow6OuV
z)YhWCu><+p1&9dngSU$f;$!_0ALfT>M^6;l`(wz~8<+edF=clGRSL@JH8#bHyBE&5
zd0~O~joR3w#@HC~I_gN{57+Dyg!M8;>Yd9pyi0iS_z}J*xWB)46YCv42yt|RJ*~~k
zN;n0~6jlVD1Allg_IN?;>B6}TH8n+`3GX8a9gPiPZKw}R#%nWLrV#<9r>PB1buFkc
zF32k>LsmfnGV*U`Q<0I866^T!qd0J2KbI0>3@#`vg5wD%*x1>M`|_*^J!@Ng7?_wr
zo;~qr1m4GokK)6_$ME413EG|vzLHVKAtfywS0nUjdU6DwBIB*HnJd(+JfU+U2<HBA
zu%qcYMrXl2rBF080eYc?S2)c~sL;}S1PP|cSQ}2uiy*MVDhNVapMMd7!uac%Qwkp;
z4il6?1f>vVg%Z|%3Tog-)AJ(?{rG-<1fL&YFM!Ff>do*H;h=+f;DNvsOSnzgc@ta$
z@I1Nn6qo#4v^T-_gz~2Oc~?}41C~=6Z!<<_kk~!U$lMHH+MYkb>CadjP)h(-mLs@<
zz-wuzK{g_5^fXH5F2eRCt+sLk(M>Bz>$`-U(Q7z0b_H2uJII^8isGePC=_b(7UDYB
z;9EKh_fvgvPVYcO%?xT*?xXeMGc;U$h??EIsJL<)<<|%)wwWTPm_nhh-b4Z0cOD^j
zig3#&+=OKgVJB49xm~fcS*Nylk;BS8yNf*GER5}I$X>XB#NiEi*H6NyX#qi<n+WRK
zM(pr)#135*_dh0&UP5f|DuS9u;Z@xUPj*ONwXN_Y@WLB9;grUXGp_=X?5wQfQVG0B
zXnF@gN!X9);0$>i2g1dYpfM8n-yPA=6c1bommWB%qQ&H(gah(QusFda6%!2;wtahh
z8<?_FH)or-CNND*Z6U8LT<}K`@9&Wpr;hq>U&zz&6lr{{y@cLDTHX;uym!<D9~?Kw
zN0L_fLe>rv+TLuN;ZQOShqOTmqz%I%XBrCyi)1KSr9;&&2O0#PhHDwrJ*uGYT@Q8t
zR%irvLO*f{I-z~g7S;n=VG+~<=g1BOCJ*49L^U-8awBt7R)(3p0?k7g4t7om4~s%h
zMizQ%tI%0efYRi68dMruo7>Svzzq!!h*$9{CHPv|vrdkTV~UZYyS*FZG{2dd8JgZ8
zHcp?$S;Ft?)>+)Ua+RR_7B8MY!BYb6^@9ia`N>l}zi|_f1?WAvi*Fu3!K>T%@Z+6_
z_?NH0!;2fYvAep4@%A=M^>m=8xti}ijK!@rls7kEbZQLUw9XdBl-9a>loVGYKfef}
zfnM<Ov_@EjD<Z<YkQJ4Ln$T1XdB<Sb&I>JuW+;%6L#d1c2DrXSou9O{#k`w4Dz!D?
zzvnaL8R(+l!yAj~S)ye=zI%hv{u<vtdW>%g@$HT-BzStj*^(-6W(;Ej&CJ*c7Dh&}
zGBAK0ZO)NpPn)!1Sr{6@fVcGw^kJy014C_X=xJ#|n?0yHdrc)aCRuq!NHe}l%gN%n
zBn?lPJMZv89AmUSbm)*crIZt6vtW7-PL6M&Crlbj=vgu9>Kd6qh7I#m5qbpPp|{XG
zCWAeM-eE;TPt^cYn&yzzwI%eNpknR{4O=hhIS0bjHyT#KiExNYhig(UyfaG}M=KGS
zUxg5wTNr^SnjQfcT=u4fBY(aH<#5j_hI3|tIH1|}R53jB86^ovA;N}MwIYnsH-sCW
zV0Yd$Kd}f?Z@#BbnJ}z`C4i5n=fjloEBJ&c>s=`vh^P@^=TS<#W4SY~x|de4%4vJ$
zZ{b%ZK0ms%52?db$emt7=Ey9fTYC{uMY|;U0tit7c!3T4{0$9^d!2lI0I9w6Na<XJ
zTXs8q@&*uCG>)+HX+%~}Bf5TykIf@*d;_Hm=lH%GeBTv>)egZWqY;kD{2tlea3lC)
z8yC>H^#nC%A25F2M%m6STHQ?)U%i3+%Y@s`4sv%1QUP$ZyF5O2>hg7DT)d35b2~^q
zdkLB6cag#M%<Uayu>|Oy+T20L+IeK0K8Lu06~b%=wz-`!%VK=1okln<GpcV3@q^n)
z8$XZC$#X~^T;_KigllmlP88O|y{v^DO9$<`nVnJ<BKf=c<dwsSlB4Y#ELxtDlbZ-U
zS;kr^V>4Qw0j*01hiG^Ql~r+ok#?_~D)tG3k;w~>;9xkpyTX*6zNN4&5PrfzISX?e
z=;~SExQrG)+0Qsj;QeNgGX6=hyuC`aJVI}uI^H|1N8lOby<?^Xo+Un$vB6hzwvg0v
zhqhG&Oim=iz%db;HnC8%j)#U_Dzxl#pzTxyUAJ=Rc~wKlrxrQ^P0$Hxhh9Vv3}XhN
z8{P};;7;fUw!<)>4Hm&2aER!|JJAZV2vJo+fT1ooPAxb%SRpVh5HYclC^(sbx$08P
z<Yl11%?>fPW~gA?Ew3!YAOSQpHi+iNS`-%*ppL-l?;k{8-vA~ir!YJ=ilLDq#^Eu{
zjgR8W)+TP8KZjdaF5~*{B|N@)6HjPQ|Ks6v{KOyq{_YN5K7E95X?{0$ckz@F_oo-H
zaPj<k@u=y-)F@V2^S$j@9UH*X>;xt!M$z8Zih=GPbT9_BGVW9}B9(H3Ny|t@;>j3f
z7N#(wr=fxyM`w8xw%EAm((};k<%42I$r4F<%vw2O&cYtO+6Fii6pU&u4I~}fj}B`~
zECh#OyD%TGcQ*0EliT>7aK5v=j4eWVq%aRr?oM#DG=`apK1>Kb3)-A;P{NqeRVX7J
zUBXS55mpy^ItI|xHGqbW9<)W+sY8>cuBHwZ#sx(MHX>Ph2$y%1l$OFViQ}T}?cKkZ
zvGxezcbM>#Ktx0o($X_v&!}u|Ya>F>+J^9BX%no+3C>Rr9$~zt>Ai*Cr^jXg4!vV4
z`jF5tg|w~}6pbCAYT*J+J5NH-A12=6w7nQYFBPtdr{JAYMB6JP^n`;K)ncT5V{|mC
zf>O9<=fgfV8`eqbuu4paO|r0$C>ze%MR3nAgHK5fH!xx10pVap2V+$`g1GKU0~E@u
zq@KkkZQ8qn@T2MZ(Dc0d8w%w`fO!&j9_6eu0eIDLFCp+4$HXG|-1ytN3E-pUc?$da
z__~nhPDa!zn%OGyrdN>CH_pG!7+1lVUPt)V@w>3o6rycdV;kbzhmqd9fTFQ;h$TP*
z^7;{+KgeP%EbK*CNhe}!y2XLJX-qr$6Vq=7Kr=RmmhgQ~G1`|7!#TMTAvC|*)%&PA
za}O2UcTjrq7K$%jXMDYi{N2mQ-J!7&a)RAu2ykPio!>#q*^4;2eF^Coc5!lx>zfyl
z%1Yn7h^#HbZ}Sq;R<?0+ZVS=f%dpFAgK2Ud%ud$9K9}&S9zbaGDAGrkkUB7p)V>MC
zw)7*Qv<aSh1btBp4Yn7d)qU{IuR~02C%;cWY~#|Q<sA%lcOlk#L7vhhZ9(9fT0lay
zJUtv@5)lmVfTA+?%PC=xv?314t3uz}4*ro5FedCwE!p<XO<-<rCXOmFG`50*iUIZ>
zR>u2ZGTMI0Si4t61mFAnl<}TW2Q_JV`grd+El<J%pAvX`<n3`t*#UAo?l5(TCREw^
zh8M%$Hyfs|X)t!phLLL#jNB_=>{SCJzd9HNHWJ*eFplbmNnAh75{6(D(+|C{Zs-Pg
z(PFz{5YPpqfF8V)psI{uHC05|S|Y^b1ft`@a56g?rNueuD$U2;fp$FWX~bM|I11gY
z(N>m^mYQ-b_jhByy#?Jg!3Or4Ia%3gY-qxWV0prc|BLe&pPj_i<S1rF2C=iUBwl0U
z&V}>1d2SmoZ{EVoOPBGV_nzWk?>@$p%e%C|ukrNxV_dm*89V1L@b-5&w{sDl!rz*k
z!1n1GT$&ugLUSF4Yf8m4Lz*heG1=3LK1Q3;<YZK{@mI3pXJzN0qNW;ceQjv#Z55|x
zAL{PM=JYJCj*p`)&<|C#vKC!KZ2JUb+RPRcmUfu&@I<bHJPK7*FdrI%E5(Jl*Vloc
z@9yBIhga}oa|IiW`Zb|}IOXF84-0)*o9Mv8SP$k#2Cy<ThN+G|jCA#(Pv{8;C$uzm
zpslV0Ep=UJXy`*tLl<f^Y&HJhg*oz6R8*m)s0w*DCK+itNJ&ZKxP-*tp?BcW0USJd
zkn#2))M$hm>Dh>lix;njA)IS#VPz>o&&<*a>U#P(BqNKD4;;pa2mb+jG`+8+m2g-=
zlhD&+FG?fSvxJg~Jyb1EK;6cb@zxtgp237(B<v!R;GB>_=oKI!w+sRK<zo5dl)&>8
zqjhQ)Y?ISr9-j<jVb+c41enDMS5M1?eR>W~@HuXTo;Shd#~3W^vkR)G<yAE^$~MA_
zcIQdpiB(Df5_+Blk{4~yvyw}K%afMpLEyO)c0#!kcphbhU#U=hjP>R*nja0%qmq%A
zukos8M6Pc`T+b-7CQc)FY6Tg?v^@eVh@GNvK5l4pGfl5f9Q+*K(8eg+hrHnx%-;P8
zxjk#hY+XW9%_O3V`w(8(PN20QzPcHi?VTtb8bs;j7@F7DFmT}t+E*_ky=e?ysdcc9
zC`NEGQ_zJMXub3t^_L!^Y@3EhOUvE5ggn|?9wC==?lOzkcb;|ODzf-K>5Q^zv^v4`
zQV6@`jdMs^-$u&%Iizl!N80*%q^@owm8Kcjw+^?W0Ya|^I#Go%KUvP`--gioF2vJn
zh12|sMyF6j+dbLL?@-!GkT=2SR5Ls?>*1GMi}bc3@hH1dKom6G{h@rq1ImtW1RgC*
z0G^=<j<GEt7ABrm(ZGJT(S2;Adzp^*Nhv@|RSV9({&4p4gpmb-XJsnx;xRTcg*mq+
zEnPDlmD0o~jJ3b{T!0?!Zm+5cz7O`R;Qjroc<+!l-al%H_a)5nq12o4dHa=|XpnBu
zF!F)DYdj+2%aD}WjEH3Z=AlKf_bY&<R~bybs$k+%1CxLTn1(dNjPcbXu8$q?Fl<ha
zv27B3QT;HAV1*69Fk}FF0sVNVm_$!u<%Wi#BrAjQFds$5xu`A9!w{i&zq<jyjCbNi
zTNx&jLor!bipho=yqcNDR(Ux_3kt9#T<f8t8eJ{z=<6N8@c0DU26|B{TtinlPj{dP
zr@OmwmofJ*4<6zhD*p3JJNPZ5>%UyTi@)BugFnA`hM&KCiObh_@c89ZTxM_h<k1ss
z6OyYNs~8^cL19_~x^psdW26sH=0|a%zZKK$tw*ZMac!^@r>l$5R*;4AoNS~e2-om!
z!_d$ORten87q+pzx`?5kCaey1pgk%WEhijMt*eVF_FzrQ8W?80EhnflH8jxT?t#mV
zjd-y<kEdss@Xf7lyuP|FUIcwIJs#PPmIyP`gT0;#EcMl4O;fhwd)a8|!&F5J#xy%!
zEp0+h0~%@?&{EcfmXapa6*Qo#s1D&OW=bkbP*PG74@@Y^D??sZ5i(M;kdlys1feH@
z@96PkIKpWA)mM9PkRWrkcSc@rp*Yu`Fh!4GdBQwp0`yGGEuhBFe}K^Z@W3H_aNx~>
zjE@e#f!-bwdKx&YtOIEsQ^*+-dS-S|wxI1<J3-UIjnE5(Wsm^91VS$j9;v6`C7gnm
zkqZ}L0s_J4Vv}JI8VSSjXqZIB!z?xl76~b^N=kz@L1<0zIcDX<Ifu5#*y~9v^enDs
zj3tl=EO)NE7uCS6xRzA|SAtOhnj3-UUe1QiDC_>V+!$xw2sT$f?kc>NaasVR2SF*8
z@O;4nX@qZzkJA>t38=8vJ|xq4GDjB)y*VUw3?rhxQ{0LFW}tH;VmrDK+uDW1)_!F7
zP2=1*e?|4wHVS(gi<>5pUO$Zd&IyzbOrm0B9L)=J7(Bm;>04K@c>fNTA3VbHy=NHQ
zx`FtL9=Ilz!z!u()(IsD=WEMXub}nX6Es}DPxxI!(dI6SwyvOH^D+w0UPT_4nQIr2
z#@L#+aS^F!E+TpD98xw2Jwh&t>q!J%;>uYhE^i^>^d^$#&mv}U4dET9VV_UiJ6Qp%
z^a?m-S0bvu8%2!sV>>sobo(*Z?mWZ7m4{fo`V8ae9wNJW8Ww@+2rXzp7NKV!od#`>
zAZWPyLeb6zO7<?0Hn$e9mUmp=2uIkC52|U37mDA@HoAwv+bboH{Y*$YCKm7$u8}NE
za$#u-Q$kOeO~u&E97d)#P*gRf<q>dS2*Xo^Be%kV3*l(3D%+<z-anv$_Ze;9XRLi+
z!Wth*+v7_GCt98}6m>meXb}hxpA;mWtU_UFFAB=~5uerw&#+S1`d7frhtTt_fmu)^
z%){DX8Ql$=_yO1_kHR5!g1^BCEMkXX9y1KH=wX;d471%1;hk3RKr}=pprfc1oed4B
zudP9SO%)BQ3`13=xX@aOR}<~{X}$;Nn@b3<R;>1P<GcA8JnZenN@ERH2{^&Frbfol
zOPlKI>BGqMB>G4Cu`oS`)zKlWcXs08+#&&Y8$Vsz#kZ`Vu3pFAZr{V7Zr#Rz{`eif
zeR?0)ZtUXaw=ePH#S1(Vt~ok9f+<Gj&Xzi~RTg5jtN^<-w;wj=u{+R;b3@&jt1ZRV
z)(Tv1EyGw<4u)H5G0``I`pPCW)iq&daS@joub<w%iSuVq<IF@a2IC`9Z*74VTPsxQ
zXrbT25`EUTXtcA(U{nmQboAoo=2^VDc@1CRy@J=*&+_&Z9`w~>COHI!_U4E()P|d`
z8f>*x;GnGqM-3ggY8t{`jj@(Srb&xaqDd>Msz6;y73%UTP?hCv!cAUL9<mCukd<f2
z%0Z5eNmf!0GLnq8$0bGR2^aD{c=(WbJ&dpR>=R8eBq&0hl}4Cy%2GJRhtLyD&&t*o
z#)RG*(~}dS_dcQbK4b2OhmPZuqtf_FQivq#gq}8`X98J63qsEp%9gY}Ye#51xwGda
zOhY2!M39|`Pr-@AR5);HEnJzNMrRrk%NP*_{g7}%FAAo-&w_?0K+iJqBr6pbESr;A
zu%q!gWfzLpcS6|TmscW|Bi9{rO5sR@bfgVB6;{Bxq!KPPHdg_1v^!Up3t{HUav}6w
z2s~Fp(~ZD$C+vO&rW@~fE9L9nzV^-Q2}Az&{&m77CC8A`KZC@saU^z(h;c8phR~~T
zK~x*9sIv!gt)0jePKWCp!u*}*7~j2*N=C%W;nQfI-Nf+rRZQ+&!`Q`3n7+D$)kk-*
z`Qj0-{`>=O{O)(y{qDEedh#8bX1CBVcb+D=jO5lagj99GC9e)njF$d&JxJmE<t>~?
z!RbpVT;4(P8UeUY>s!8ntd;Y~TBk7*fXR%o@k^UX5Fkg`#d8_Q+cAq9h*?-i%-jZI
zCpQqvIPTXv4xff$##=`A${v(WpGE1&3c~a2VeKCeJ$rwcyGO#zEeg8!A+Yj2iSSd6
zh#>Tm8E*}J!lCKn2Mt=jyp<En0g}cR1fDTttv(Ji*6ybi2*BGbtHiV*Ku-Y@3aW5)
z^Ma>;04>i9hGs@0^bF062|#OT(()u^wejg*1-$>cyf_e9IO{?@^J4GY2|^Fj@($DT
zj$7aZNjrQj<AkpiTyRv?4a$U`xpgS~1JaOsvJw>){iti0KyG0-LgH)S5LCfLQVkRT
z8kh$+!78#7HZgs$PZ-9DlnFQ{Prxo=6gF`q?4*Zb$)x~3qew!pKQab=d4=eyZ$M9H
z7n&GRTbf$Li9~zbn=#f|jitdlT$t&=@<=<z28J*-IE?e7UAT360IM_I=<jXB_}Cy8
z=I05hDGc-s5`MinO&eUF7{gLWJ1%tf;MwdVet-TF{zmZqcIOIy*u9EBK6;2hK6{M6
zyn2pT_iy3p<GXnI@)@2!dW_q<*RVD*i;c+{O!RbMqN5RKN4jutWeU&Et>DV+FxF^p
zmxtT%sJ{+BE)C%7f^d#(69)O*%Ax`^Gs0|dZsE$sYk2kK6`tI_jpeo$4EXq=SV<X$
zdg>@PpxK!lp~>6~8;t2Mrx)<uh3j~F`4(<my@oqiF5%|f7_N0y<63<t<`aEUW^RB~
zZEb`Rb}kAEaF$bs7Y!;@#{#}OW-wtK5YD8KQ&oTrOF@~%*epyMASWjaNjch_tfY9L
zLNGh2<I<2iCJo7>1m00$Ox`i^z{UQ92UrKinN`e8tq>L*jX?iU@iaBzG#@eE3j6eU
zTY#Px<MmN_MMCc|-rIjzgx&`SkKyAZQX=#Y6MDy0bRel?1Ze|PDDXX1gkc2^E--NS
zhM8Xwtb)T~$97>86${G<+FWP^41>cM4+y%D2<Y>1W8N>637=;Y8T-cgViMl~F#aT?
z1tCZ)v`Nht%Qo$l2tivKqjfqfBOf-|MX(hBSU~U<SKvfxC7cN_7lP|e)$n#T?-z>e
zESD0&3~S+9Boyx}tb%h9Ut3(x$16ohx>YbDGe&z?36qivWA%pNU(tr}x^5)24kD7a
z7tur@vSSk_8cn3_We<!Xe`pL%OY2y9@C-9o?_&P?ebJUy?>)it-3M5?cMs>kevX@e
z_z8FZ{0H3r(=WL3^Y3xvr(baUcYnk7vmbEjn?K>~*MGp|&1V?7@dRz>Zy{%V6%ma?
z@Ga|vcX1bdO1co+Hc8+zdJZvu&u=4d<svdqZ{y_R7LreIA#sVIJAFn3UDV>bSP=_r
zh?rkR_}mI2=T;HM=pH=00H3xA1h!10c;O}rM|WToQN~Uz651?H^B^ecdq6?g1uDiK
zP%-s{ys-;xLXzMTmj!*#5UAMmemhslumhAdw!|@g6C4(Sr!8JKbPs{|6+7xZGKz$r
zq8M#;jM&ZtLtyLZ1Y<^8BW^!N=Eg8I7r<u;Ib~h!KT4yc<-Pa0B0dnt<GqF6`}@`L
z;Xy5?5IuZw)D#~|+Ta5zM|>hHOb+UX<ErjZ(eZ-0RWN+LlMo-BkGz~lRFwB2zpx82
zCmZ1qQVz3#3K$1e!927X)=}NCjTwMr!Wdjqrg0)^Qd}3xA#scV9D#NGFx%!Z-Wkoz
zM!#^zLt87x$H&nv%&yYejov<jYoHI^ea-0VuEF3yBPJ&LF)%ca;lWW%bTwk0aG9R%
zM+c2-_Vg^~=I5|HFPsBAis^|-oL^tV?$RPQX@=(*C!R7M{Jg%2Kb*gaZ#K5@=eu|D
zBkk)C&j`M!5AgG&yCV30di4V5H#V@jxQxpyn>aVKfGcb3xU#;CXIIbT^70JMFHB)|
zxEE)~`fzuC1g{r|@cVP~xV14sV=Y2qX0mt@^KJrnk)P@Er5m_T<rgLnJ<B-W%niN6
z*c`dqYA7(#L5q_ux;z<a8F^l>ZsG+Y`Siwp+`MuP>(f){%{YmkNPmpSd!xz47<oGC
zNM&q|V}x}hw44bQUwI8g>suknzyhYq>QGl!f{Zc&rz`_$MV1`FCPT=vj!PfMG09_u
z*)j2Y7srkebYdM7>&W55*uQ^24iI>IxfF)7xO@7-*FS`IXD@<IxX6ld<ugkg8`wHH
zisv`#7_)~}6ygJ+N85Xs%l8i+#V1E3u}4w?N90r?p`r~*Z39T^8bOYYSIyc1dd?m&
z^$vh#0N+0-j6G-&8&t4(0dQU4K<M}eK##Ze0z;t}Bvcq}F9JqF#C;1rp-iI_U=p1u
zmU%*|2tKRijJL~7SfymaA~{Qho^@7%2tE71L$68%oHJvt3uCLW6ksQm6JaOR3883s
zE-V3fPK?iD(H32Vm`vb#R5ikzHs@U}Ojgi~fa*3lWE8`<sGbHkg!raj8eb10n}kVB
z`;pZ@j8lC>sGna#ZqEQFXm#hGe}ik^{fyf`{gKO`aOveM?7Vt~+duz=M}PY>p8os4
z;?|%3MDYC$_x}2CxbpL#vHJ8|oPP2xw!Z%pwtxFqoc-OuVExCxVD`n&7`ponI(MI<
zX8jflr_a-Vt6`B+0CSd8K^;7+JK)>YkEo$}WG-Js&Y3GnB?#l@HW4|qfr#mKgwL!a
zjLXoeRRoPMAezP(N+XRNTt(UX10;8ELft(J3YKwDHIIghQ3#X`{Gn##2NgrYPWJ?)
zH0>avWs5`VmQXNvgU*RS$QV07-pqkZ8ywd&#Sv{`OrEYd4sWl53W2A9uVm!0M_L~H
z2)zS@o`Z`g0@x0W&CFn6Vgw6YOX!;z!ql3UM`#|G)5I4CXm_8=iqLysgq|`jPnp2e
zVEWL(2Zs&tp)f>6$`&6=zk%LfC3hTE^Q7td5~g8ra*jtpKo;WSE0B@ZhV)aNh)M5&
zZD=)&18dkmYsCF|){)(?6QCDA3YX+5I44aId=qd^o`g&46i%F+Wb7S-ZPF;-8LO<q
zU~?0Cy4o->Jb;n05%l%<vj))7)rn3<*slIAw0F0mtCv<s;0^W;V~{2|$>$6XkD+gL
z9CJ&{*gkg-XI7VKd&7idKQ;)d+vm4%wyzuOrNwyE)s3GQm+;fdI{tY13VykD1K(fY
z70-8les&E%U)#kGtZ!~y#n$2+#`=4)!-#qF{0<&ozJa?tSH&Z{D|2%=Gdqv%**Prr
z_u<j@2JLSicUI<b@7yLvyE@QRS%Zbic?>ePTs*&v`|PQONk-=;hcG8x_07o@_56Vw
ztnDxq8j6jA0!*Z*W3H?e_f9Y3@%A<zU%Y@Trx!3#Qivog##v1jq#J4>Oy(&3j~+z4
zx+>yndY+7;c6$%Ni?$c6XABP=0~p9FKwU69Mp!9%8k`KxPD+BY^%x|M3!o$LjuLK+
zuENysLJ4Lk9I!Y<yE}a7AinzY3+&mmhY?twy{HB3glpTLI00k9?(77^6K1gx;;m3N
zu&}X*E<rA(qAnh}{pWoL2)#r2_?RTVph@nRlE-nz>*I{s!j-F}2|YCnTj)8u!N|o6
zMsD8FclClEV~duH2ZW1&YrFeE+tW`xElG#))1?XO`Ui`rD(TVqgnfHP;W02`>@|*z
zgK<=XSSHa)Fo{WqY3xau#WC{6r@<;Q1J+3d-pL$TrscpYGY__mzV>;ga3HiCiYnny
zTn$G7aE!7}jIoZiItK#IzNiXzg_UC23(qYqg;P-(PSEmPiWqB)8F@<@;8D>eMq97i
zX1=}x!NRO9jIR#DOfl)jNUG~YYReE33B6=mURM7IvU&vIt)j4RjFEN*mtOpchXmG*
z*FR(T$v3$D%};nqxV`+-U-073f5x}}{_lADZ~uvFfBY-1{^?(E{lEVQ&iwo*bl-e{
z^36*q+uB9l&Mh=v5vG`bB+jhT|L7ZxzW51K-~58v@BWB7cB&cEiwJD$BE)N9l9~^V
z=yd2O74Z8t!@qeP;XS7jH%jxHxrFr7H;}b<7wIdvku-k=5hDa4VHeZCj>Nt*Xx?}R
zecwDt5p*gRkx(=Wg0hi6WOdviqkRGj`mS6*f#Yhn1fK&ADO%#Vx*ZNFTd)JP6DOaL
z&}OXFHo+k^TAFZr8G-kOw5&KM<9=Cre93n7)iEikvVF(KC&SL(Q9MUnXop5-CXBTf
zFlIYc)-u9ZhZXVh9t8sL*E27K{dr2XJY{@*SOXs))yIcNjq#y`1wNLx#V2x3_>9qZ
zkBSF@>jNpB0BBjnz|1KP4qkb153YcBcpU;`+Yp>OOsnf<(&%RLXoG232Z7fQr}#m!
zP9%=NDRGpB$4VZDL&_xVl4yMdpL6OA-f76qMMq;Dx_a8tJ=h~oD9}g48yFz;Iy%rM
z3`p+nrSS>(4`FC<1Ox0HI~!VPbc7wXd*<{yW)~J2Z)dT7dJfwg%UE9F?YU{3oo6E&
z8N}u0M%-;_!E?rl@7SaM_S|`Vd+scrudm?2+!P)$2H))M!8QJ^=QP9%3$uh?55|Xw
zu(`T{^|@tSzjzgED;t=gF>cJvp|7eMRVm46&C0-1dkYpjT5)B08E57eaP`tv?C#t^
zb8|Pp%NTYq?Gk=lSez4vy|iGVFdwtYDVSuP&d<k2HElK{6U_+;7|P4T1;TW9a1it5
z<!B5KL%M|pB2`roDk}rGy?fwsXg@;aWe~2U0FNU_U{Aofvtf8>Xu(!R4LUNiP-Tpj
zlNIbv0G=d{9T#HjQCi&*965BD^%iz4Vc*>Wf^Pr*eb~2euL!*Td-p<;K-JLHWOOx$
z8LiIP)*1!`u(>Tk$HpeitRl>^V#UU0W#b^8*e%Cc@);wtV0!NzJWS|G;7bW<9FUfS
z1fz-s8?!9qi@c#Jjlc$4HjdD?bA}d6)7}~C4lGAkXwd#N*rN+GZD@M<z5$<)06qbF
zVYI$*M%_q)FA4@>(J&O?7a0#DTAonGF#`BfMDUpte3l6ruu35Gl25@hmBpxQlU)eA
z+)~<JIiXi20?x6d4h}4P!oZFZ)|N(RTTsE}uT{qTOZZ%ZkA~>X*SHqf!@ZQiD{mH|
z=U!D0UtyAhre^WfHIJN9_@1gnTEj3Jrq7~ybRKy_lgRBKN7dwM^qtv7!{iD!@4dp6
zmp|hc;q>5#KjXpozu@ZA*SPZJC0_jUC;a>${}W&T?Z4wLVYfrjo%#M}%zyozc=7Gx
z%?rq0*+PzRlyzYPX^WdkTi8VU!WpD5p5g5c!s`q&mo`v*_B_hAFN#yU3zu_jxpp79
zi)RtrH;0JM8N~E0Bdl`~UUd_&Dd>TDZWruIhvD8Z58t*Gg!gVCb@CeGySI@&cm*kq
z>(Ft@Cb$A1rymG;Jzql47m5b%gq;&_I}>~^1fD&P5O~LwY#^!TAg)WO+dxLg7Dv@h
za74`j`;|1rNGnV}D_m6SD?;xRwxj)$(l{t70}ooir?($0EUjQ{YAhbS5H9a%Y+=cy
zVa`rf6QAr8Vyzth`BQnk`^B3{1>V~yT;frMz|&$G5_)F%=(r`JXGiFrz^9D1U#WRx
zzh(fW3?raqkpL}+3>doR!^EQm#vWy`32cO`u-`6y7B(lRVIDIG({M)H=sq~b4Y2xg
zB4H5r2}7{q^Q;Ly%cOBwB#hGXXn%~p@5K0dqaZI6O`T2X80tpPa3A`GtA%lc@1>db
zGVTry3}R$x7~><O;vnR)!6A%~4v5n@4ULTwe2X}<v4)N11;S~XmPQL>9Nnd*U0d70
z8sBHJv{alV;K|?wUeB)J8IAAe=qPT{7QY^yz@OOby=?Et4kOLgjvkyH9>Ltmus8^M
zmS!jHW1QmiI-1+iT3wC&_-LH;bVk0HD{>i4dwF}hBp)}H=CO5p9wXypSUGzRJ&fZc
z<BY@GjJ1r+)8oTfoF2lZ)oCpDH={8p1*I{eDDd~eNgG>)5g1Xb+Nf}JM{iIR8cukk
zz}S)z*aAs}SupJ^h~V-&b_5}etHA`DC*!H>aY=Y6D8reSWhzI@lahvlloX_d%QXrU
z1qerBg&9(W0~3c1;K06pI6%M&XIbn&xF7qt+{@d7+3or2D;(In4-))aDvC<t&=+Hq
zH;v2G+TyL{2{Wr$!iK=Ju@uS%HjKR{W>)Np^~GNK1L0r>8|$b1Y+p&q;2=Mjgpvv*
z`5EQ)j6~=u@q4M5SwoqnWM;#LW(!3tJE+*5fQo|)RGr+QF2Ii#_$&Cdy$L?X>|epB
z7aRe-5W-IYUql=XX?kJ_@T2XS#1edQ!hXJVf-e)6$=NVJnGK6{+8!gW^{Ham()R2K
zHah}N1Rd>7DC>esSml?q%3z)MrfdpIVN2lImCz7NDn&CqQCbg|vPQVm^gJsY;ak%r
z4sZ@*=bTNrl~1f9yKNE~4P&StIfKD-cX0moFIao@9d^F?BUbLb#Q3?JxbWl$0`HG_
z^80_q!=L_&mw)_MeEp|?#na#a89)5%f8r1S@Bf9Tw7y$^_zN!l_<Jlq{Te;OtSDO-
zk-v5pr&iC3(92xeM%KzXWGtN}?9Sli{5n$R){w-upER>Xds{@p#4Js28i^D0NS#?m
z76DnzSX;C65Un>~qyEZE6mHx_{Mb48w=cn?aSrZH^YCn1ViewhL*5j$f*T-Zn+*k<
zY^YjiLd7(J9a<2i)IG&vA9C8xkkYWl5hcb})^Sw}NT^uihytxn*&2uBOvG!<9#=QR
z5mf^mRMDp4sfxh+oRHf?)BEzc1U_L~+IL(EDr{TPi7BwOv4a`YhY4XPP6baXFd_t%
zwGFW6h`cyS=(}IY<DVIAh4X9Q+o%3!X5@pa`0%hcJ~?Vi@LA#G;{={GfhX^d&y;-d
zl|~>A=!QYcJOT1Hsce7Q>~ssE>RbQ?2imYlCCmvpm)s?|6|BM`Z3>pr{jd|DN85Ac
z@&tir%d$)!Wty0Qb@m*r(x+jUHbc{!#XB}yY6$dkLqSO{n!8)kEnEv@upa}xgdc%7
z*w>Fy0!XZ(A@N+>DMsL#nGuYQ^D#niVPOFqjI}EZGgz7$!^YeUF3`wsZe76b^)0Nl
zcVH+x6H_^-uwC7N+dacrC@#lJX(g`IHREo58y;4-;%<2(uCl>i>+HkDiAgLHTmu9B
zXzlDkZCxF@+uPAtR*8JtSbC5*;_OV3V{48wT2GCm9Tu`PaCvM1y$zL&<E@z6Si#)N
z3Rc(8;PUwkxU@kd9qPr*P$y=`8QmKhcf!07WN!{{T`hR4sKE88B;5BOL%5_8^7PD+
zrD1?%1vTU{3Z|=TA&M}IS5!olqC6sKc|o$W@RgQ<pQ18+)HGpFsOYnyC=rCxG&>>2
z9y&^!J91czuR?S^M7SN?zn9S4OW5tj9xlHU?2ZPxhjzD@bzHcZH{VxNRZX0XKsdkG
z($XBl#Z|2NeT8)^#tB<O&(^{UHs)4b7aF29GzqkQ5>ju5tsFdr&m<(p9{ZqzA|&{J
zlBybz*3yHFt|4UjTv>f%$QqbJ*3caC!bQL>Y{YBgD%m+fnbs$OPmRW>{&(;NK!>&`
zo}5nb=>~;E_bvGJX?q5=J$;se2)-l{e8vQy32o0jJ_F{7nJ`Pqf?3)rSY+hElD21+
zQvxd*pLJfj2s+FB3b8DB+no1X5{g2J@GBtripyYIQV!eFD%h9Sz@e-T&gBh+UL(BN
zhI}d-5ZBs=Izp$9Fe@Dvj%c<aAiV*R`JE`}TEObPAMohU{}=B4?msbo@jhlQ-owVt
z=Xm;uf5)>w{2Lzp_-DNS%YWbx{|E5nzyF{3`Y(jvFMq|YAAX0+G`!VkUt{PRW9-=r
zC|f^^!u4%JZ%eefH;`LL^6VNCXI8~Zm|jNQ^b+gz-{F_Y^@OQK#E#D+dUO_1BXg`p
z#Eq}g7B3+0%q<k3e}t0r&ro>wDNe0FMAq^{B#zvGbHOxDWQ{??s|-5sCD5|ZhOBNl
zlng?lr0-9Ya}<H6sP6~`U0cX#(&|)A+4<>XpOiih$S{UVXtOlgHuM;E8EX}2d2&kP
zK0G1Te$JHetK}VFT5xdpf=^(GI3`q>;6m7^XJq=u^7Ks2Aw^64biXv-|5Ap)lfyrK
zu7uxwrAFWhlL`n!R5WRLdiacn_t`N!d@9LUE9=H+>y0l|1F=sh9EXi!A!V5YCC5`x
zcQ1m5XDO6iOCaq~0D0FkXfe{-WX{5~Y!mLqYp_WjXMz}nH7(Dc@zy1M3a)vl;aa*5
z&&EseY~F=?-37RopM`VI3f?i7mVm9ECZa+EP*PfemgWW;UKe`WJJHj@>h4BAqpo0o
zy#u{m@5RWlF#Y>5<|oIoFg=H*dB)kr)0m%`!oti1&MYnB(#jd!K64&77MC$sUyHu%
zWQ@?NCUY_{pPPsI%sgzfCx6jBf@kdmxKdt^8;u>fJ3N6a<5Soe8^_G(h&ZobNqHH{
zN=wA6ob@oO6i3G(nsyhcr-3*%6*N0Kp_!J}FI;fEtPmYV*_fLd#>&<b*0)x%y}61D
zOLMrkG>bJx+p+2*v}C5BA~piy)@Jb2*TD%w%!vRyLD0C+icT;d2FNQSKuQ*VM<o!a
ztb&sonuwE^N0f{VZz~~;>)wQ#J55SBf7zWdF_V#l275+XMpy}&-C+XnfDl~|9>l=|
z1m3>=tbHQ*4(uo7_I-sf_I%0uN(A42f>9WvA{^0GR#1Y0jxH=s%?LDt&B_9n7N!K?
zn-jV%g&8f__(bTL(fSBHGsasTg6uFq_or;EAF)1X<J~JK#|Eu{!%E6HqO6LeDjGPZ
zrVR;AT}U$ON)hVP`lgT*X1C@0C|cQz;8V7z@iFSE()QF`y~No!v^@QxMcdQz4TiQ~
z2($x2p&b|o9fD7nwx>ti(~XRSUKHU++cO~e3<*9X#$4mLG?*l2z&JS*CbT_M+MX%F
zXPQ|6v#dgx(fZ67MTIhbTLSbf{ti4sv9O%bW0h3ErmTup1H1A%I1zMi1fDm+C!GCJ
zGrx)cts5xnnTK~WjWD7R?g=&UO0GvtQ7`gZ=df|-CroZXz~J&#EMI<#yWjp5FaGc!
zcu3%V`?vpzAOG!t;pLzI6%T&<C*1kr_qh4p&j8XuEx)+--EVQ}^$$3G|1tV@uAp)2
zJSxweLlI3cdr63~>u>FDLj+&aTkyrx=3)uGn5ol<8ec#RV|C2rBBE(}5o5Cm8<|GP
z@Fao;gqlV`-xPehCgIz~^`1q946Gq~>^woai@2e)NFBY1h^7UYM3mEp(xJ~%Fb-l=
zb!W8ofSj%ajoy|^D@dprvDnFJnL%2^oQ9{5gEIQ~;+Q%<Ii!R=QtAYsmU!0A7m~6x
zJUMZep3m48KiYQy`w2W{Z9VvfM8Lw<L4=+#@1voK38Sq!p=Sj(9h#zq0zTR!iGTh~
z3jgghIsCWJmGMtsspFsbY2v*@I{5IIK0c8&VMlC@&m|o2xwJDrmvhJG!sMW8K{%)%
zjbo;XkhV&NiVMw-z*8mA6<vxUbD{{cE~Sw7s)0$$C|oPfz`OAxJG?bm<TCE&&BMKH
z9l=dIh#z^3xTzP2U;ZAkE8io0_9cRb9}$38@XiUwGYfecxLcYcDIyFNd3oq+Y{EcC
z7kXL3e!Z^#9`ub2&;q;B+1-YrzAh|HjAD6w9P^Bhb5rwJUS7e<8sWFPh_#hP>~5UJ
zEjEJf{vmXyr=l?~7$c_=Fkg^?<-&YiY3{&oeFvWL2Yxm@jjNsgxHdG33nOEgYHP;`
z<Ln4cQy4~5US5t<*;%M4$VY2oG4eyg5W%RIWM+bFU48U$qn!5eLp`HftFJdYQ{r)!
z&^x<2gEPz1Ses^~?QFogzE*5DRbV758CAi7C<_cks-pwq2ox`Mby(8OEEz*B7+(zu
zAxm1BhqN^O2*XH1?xdEMXnA3LUX;870%?6tgqZFAgK%MVv?nBWXn2Z;4?~hAMA`j^
z4q_kQXP<B+md`!7hnBa8aO3;$-SY*$WQow*w+DOq{38UEaCtmcTBE+M0W8ccM6d}n
zC=1i|SXl~_8xVMeo^ZO474NrZFKbQcSrb;~gq|*;Cn=|hFApEX$27e!BpGiB>OK4(
z0{r&M$>V^$F!_Kwj;d+lxNt>!#u6z#W5^hpLyop5&zLK3X(vt~sbudA<+tDyVy*x_
zO@dGJE%>wuJ}qGmHiA!^wx=B$4Q(2qPDCtpXneXfKB4poKLZ+{0gcZfF&zfU88AGV
z1;f-+FiOvb5#eV<_!(yv!8p4ZCbU0O0?&-VvtX>X6rfjF@urFhzv2p5(e$hcJ)81c
zI8@YgLl9<DX+lcdFv>=jak9D}PT{$52+fBFf#;FZ1iy?9MCA1#Ccg(6HRC8~p2ozP
zJGk@B-|+I6|A~j+)A0WM@A&yY{x5>>KXCuYUvT@o-{IExzr&4he~at1yvyJHg!Lyc
zFn0YmIxg;_>dZFsme!HAxJDD(6irWnUBdJV;-?6^sU<`cauGDT@UeMBO)iTS$@@ab
z<`B%78^8|VZ*UYo{Uh+`9)w5d5NiZpT@&!`oQ8YLB;1-N;odlbfVOG))DFTfs~*0^
z{4UjFP_hk&w2?O?b)0cT%?grQ*5av#5(J&3mMN5s?4WLTf<SbDBF#@;+ZxA|jR-y!
zd?l%ZgYp{qQc@nDAD6}#?8x8WyANNo^Z$$;=U#UF_O2dq_X%K>wHARVSe`HtS(q<R
z9Hye8g|80F;*)){cwab(u}3)NOdY@3r-k3_*THWN>EZpO2KY$A6rW1ju-;tg=Q9OD
zPg#hzK{#v}11XDSC^}?8%cB_Dgq{kct)dglxfqJB<xupjW*ZuSL+Khjg=_HWxellH
zU3m50WXJL~vR8kH+>KxUe{B6_cwFh7zKc5PZljp0N~)5IRm@CcW{V}umL*xTWoBk(
zW@<w-cPHKH4s|CToFtRXWHOV<WHR`t{ong+xURF$`ShIUw`6CM+2_N1EwQTBTJQbd
z&w2^lKlyLi{LTM?_22yOnEmX(V(q8@6|3L>XUywkC1^D0(auUg*4T)xD^}vvu06_9
z@iz&dtJkmM^3AKba+}fe(rH{ge-aO_U&7nBuH${i-8UXR#gnJ6;?=j`#2X*d_P+iO
zKKS^3eEZ{X;NwTH;mwn0ac9GN+*rK=57v(3-5ne8<)!oZ$&*)Ubg$vZufB!vK6-+8
zX>_mOxrN6!uj2thd+*#OTswXi7mlCCspBVb`0yc|Jaz<^Xnos;N3gA>4SN~G4uq1p
zoUF%Z%ck*KYcC%64&kkJ>+vHR>6foQ#Cta{;FYt7@%YdVeEsx6e0pRTUS2baqy7Cj
zJ2j27(=#|RF^RRUZD_XJ&`@bbN>_$tnF>`by^cyO_qs9d_hDl=gc;h>IKi}>5q6Nr
z`odzQ3kp$B{ZG>poisHAjZVj-bW8~FifDDkv^y1VA~k<rT}Z$c<TJ(=D$n!O{C`@b
z0S5kUyDtDAVHYRBtN8wEq*s<ulWiW;RgAb2ZyRZJbydo$7P7gw?0Z|wmbR{@o~Bn#
z=*3Yf14?<r%CneUUAY3jT)iH-{6Bewb+N?`watw(8lRTnD}M$)Q!J%`&yuNyg%Fix
z!exJ4dm}4LtAMW?Zi%|x1Mv3DfzRJR3P0nnZ)gmDMqU513HS&<-||WLSFD16d>R3o
zUy$Gn(fmTI)*`fe143&yBD|J{LhwZizUZc%h;P}0_||<$()gszLGLiC2)t@qUNxg_
zEsyp4j-h@(fp_SH0=+syuaTvL(Ca&V4l9pd#rWO}XrI`Fn*PlM-$ArbpG5omix}Jg
zkWu#zcAO)8b}{N+d>wBvLjIaD_h-NR@A%EX|L^$c|KtC|&;JMh;&=Z{82&T<g24MR
zf%oNa{|TS}?eFn9P4B~B{S6*5-d_C1H*tu-+y2J8*!23l*!ac=jIMLATlH8>?rlu6
zOt7qY{07EeeuJQUhoE~KqYvN22xILqEpOn#WAxp7sFVQ;K-cxVXuWU~E$42b<?J1_
zpL>A5t2DxUA7J$Q>*zgw4_ybZp?l92EI)h?+C)E$sR8H+JzcOFYHu}<wTQNlqHpys
z49?P~yC-3fw;@v3i%4xZyvb%-A7Q1pqr~K7oE6|1X?W#WT&zM~X&H+6vDNKWVr_j(
zyD~INpeGB~D#pkA&J&6=T{x7Lk#p2G<do7NwIO5+<a7xdUJ{EdWI+KLqEdq#YXfpB
z2|ar|a-H4C_x7VWG>Y=nDpb~QfWLhwqD%K9+Or>lwmtB+u(a~yuA>MKo<m~tHfnaz
z?5_U+eNX-h?N5G*fw%r1Yrg(pvGHsFjg8FTW8%?ou<X%aWBBQ>G4}31VD?-81Dn71
zzhhp%#f(;(OSj#DAy&5O##Zd+&3a+)0bD(O8dtAe#Kp_!aO(6C9NxDBXZG#J({rcs
zjjI>&G0UshuHf<A+j!@VSMcG7Z{gh!-@rQ`yoq<-d<)-T?EUoaL%ebLFy1`08y}xK
zgpZEw$2&)l;QiZo@X@QU;TvzfiMJSIpFF(B=zIfb&z{8j)2DFb<T;$<jeYIxC0sgx
z5$7(R$LZ52@yeasxV~o(_I7k)Pbz~mRrR>r*n#(!j^JT!E3Rhh@N{$x4_8m)>t|2k
zjnf3)@qKuBU>je@PQ1Nu3+}9)!s%60xU^*}&TZU;y~|f%tfm?X8eTokH&dab?Uf;v
zvjoZf9JG|GvC`$l#z>g4)`=;bot9@tFO|QV5R2p&Ak9b{CR9v>o0df*cE?gk;7M$i
zXLTiFcm$rRpa>fNOc~8iZ!*H{u*2c^!xxJq#v7`tv4t(C>}<vswMve2OUTRaQYo5V
zOh&!eS0TZun-b$wHeiu0APK!HiMKJ>d6Sk9dRdGnvf_$>FPl+3M^E?>*0K&-iIukJ
z^rFn|hgSB(4Mbp|C7R+Hm<e4otx-0E`48~55_}zSwskS;_QBId@X`2u1fOqU6kdYQ
zOXKs7jKj;Q>tXSZO~T8_>!tbmXnuae&rkRTrq>}bvmU|OO$gEWB8<9`jl9t}??#+4
zH?f_z_h-;MqJU3<pLAyIA@uehr|l7X3iwXULGQ>JLQm$Ho<zg)ZK&?wgr;SC(7x(8
zhIZV<)R{N2=Hh#J^2I;n<G=i0@yTEQ4}9?BKj7;>{XKsCyZ?co{QbWZc>iDg;eY$T
z2*5w!Cx7=31m7R<<!^q6AN<`v5`O=V@Bj7>gx){l!(aXl9(?fwoRhs|-}^fDzVkI~
zCGgfUzRo^<7c);7V_#-;CFI6gRxrXYBix4Xze?D>jiLK*(d=Hw;DguDfBy-3Za+Zh
zjoU1@(01h-S}$H@xq*&LchGU+9@@^*{w_Yo$j!Gf#?p4=2I_X4Lv(5%O?xZMjpNW~
z26?SUXy`-GCuP;+F0@Q;!qomV7}|IM^&@N0Ff@Z$Qy-En;hGL*K6b9wOye_SiCTv|
zEgMJPGx=pYWHFhP@ZPCzXu{ILVbnG@vq4TPmN$1#)*~Xrq-<W~tBuGiHnE{~AiK<m
zobm|r7-jQK)yT8RyxJxd*jrKPYDc!c3E7SoWV<_&>+3~+U=XV4a_Fm9!``$RfsS2>
zbniu^Yac=#`w(p34`1s6gu725z3eK|EAJq^;Z?L>{63aG{Tnnt{%f?o^0ydx^><i$
z^DohU@^hNtr>H&f5o(TpjMl5)$M`$H!`jdO8|Jm^%h5p#=rowoSLwoJG=mMzoj5Qt
zjgvce;M%!UIDhg8_HJB@HGNC5t+yYq?%IW)Up$4MFe<)rbPt}MI*d>6-N0v$?&9Ou
zA2L$DjyK<8<bCIT{P2@+;=8ZEiEll5i0{At6z^ZZj>qRO;lsDz!Q1b=qih5D=+Q&m
zd;9=*9^b<Sg7o~kv$%WdI<CluL>FnBH*VnKjca)I^;hx5*S?O=pFF|Y@e!=?J8-x<
ziQ8QrcztLP=Tj*hjYe^$s}tK91^4xKVc&{j99chwv%46v_iV!X-5dG7*5km|jo3mk
zP7U^BprH;ep#W+KyefSKYIQo)YBY#1$wGYLB6O%V7<V`@<#J(+F|l773C7q~wt%&4
z2@{NvLB?5^#9CUOMj*$yS|FVhjI~891%-tuEGXdjR7xj>nX%c<sO*#FrIIP7=POZL
z$5PLa>yTuGNH8+Q(=<AQP*y~itsRpzy?AW~sRja&B_W>^gee)b*Vw?Ht3`}ArHN6y
zlr451O)rP9Kc`HK95Fw>ZW+RoM_?D4DxtD5)(H5X+g>;hLo@|LEUjz;V`kL-Q}D_1
z;f@voUk{w^y>NB)!zra_08U!Avws9G#$6ZT=VIJ-kFJD^@N?1p+-yt*e%_Tc@X`4E
zjJko<8xdT)h2YzU@P-|TY}$<&BW|4WHnHO%l8m^iXW)BoeA2<uu=}`DYWZ>f!4r(P
zjJOA7aPuiFJ9Q0%yU!vsvKckQ+t9r7AUfB|+J8^5=K6<Nck64o_r>pNYX1}8{`>zE
zAOGs#@zKwIN0|LPe)B*6PyGIWW2F7Ze<JYyUMYX^_XOhKN%=?o=<ogs-~Zb==)L{Z
zU*h(+zQCCezk$7PyoW7M-^PX~Z(;WFTbL&7rf6|1AH0t7`>$b)pc}dO1VeWpqyILI
z?)IyUV^Us0?`;C_=0mh!xrO$tH)wb_3A-z3IDH<?XD*}V%r!KfxQfnmcQJh9b&TGm
z;hcTM^KccBwZ{=%eFW(>$B>#L6oxm$w{#j&f-S-HQ$4vF?He|uae5ZP?mqZw>GAGC
zgc~{uy`_lOcR=qBBTr{$F|mP^bWnyxgq>JkKJOos&4HeQA+&XMBas%^F?mS5rRgQu
zD0_l&lxQtjl&`~rd;?y{H8R%PmASRq6$unrYf(hNsXSfK_?AKw?1v^IkQ+jAbOgn*
zF{qOh&{eO2rD+TAt6d0m@8|11h;YwggnEv^-+37Ru49M}TtH^|EmV!)N89!fFm&~2
z=zsVdG~E76)ZP3A+OPj5x-b3(TK9d1hRyFHz4j^68=o>Ayo>fT-^AdpA7fs()r9`a
zN(?dXEVH^W?Tcc4x*oF)E!e$coKbfh_N<-8L`N(7BN0r56FAee6hB~0{PCH+c<taO
zJlMMl-?@AifBEDAzI<{I@7%eDJGXA&6*0MY-^bUUJjG|PKE-!s9N+y%_>i&poj2aW
z(^p@|qo+@C^WJTozi}C-_%}|SKZWbG$9q?<<Id&lxJ@Ix^H4S!dKvG%_bxvF;A{Ap
zU)#5|4U=v&c0_$R-&l+L-JQ5ZD4eaY#hyeAqh=#k#ll!c8(P=ZiUX@BaB|mX9Nf7c
zv$8qR*dV$(+fYYqstWm0<#D3gW<@olevMX(T9q2rG`Q;Ae6$m4gN&}r2&o~f6}<#j
zdxZgw1W1Meix74m##sx2Cze-SRH#JSeA-=JQ4vj#(U#wnsbzL+CA@AAf`nZxo=|p;
zlI`;5Hby4Il$%MSt@Mu7D$8xlQJ$T|2=!Q1Mgd){CasiYeGM{A1SP)~;d_Zo+-+)7
zmao#At;i*0rL~q$jcm5uOO%C0bp)Uu`2?RBpRAtoNALw@Gw?a^8EJf`6m5_8{O88k
z2Ak}I+R_bs`%>6DdSUPCgRQ$Cc7o5rxa(xpb+9;xSHMa0n*-k@JdC?u#$AD)Z`C^Z
zXEq`*yZIUPb}G<|ZrO|2w*3nDk~<G6(0gutQfhV|LBp<NXxepLF}?=cUZZqo$Wl{B
z&tv@j9jrQYA9a&^P%F>!Hk`xA{ySK6<sGbl@Ci0Q{tSoS`~qj+`ysA<{8L=}=u15Q
z?$7X(fBm2M-T(1_)8_sKfARZ2;K#rH`)7rg_cy=Aj~Huz{M&!V_X)mFfBic={qm={
z{Pl0*I1O*ttM6bFVYlwlo0xg<238Sv6Zc-j_+0|;&MO$b^8~|$-N3Dv(RcG@EWPm<
zUDqC=^U6bX%X7OMjI}rJqw~@Yv@+eapSwcPokz>5OK3ZF6^%zPqV>deEIof8z2{#>
z&HhV>Y(4?cs=e?`>_(XHHM9BvS~ed=@9vZ6+{TYLZbkLXH0oz(P(8I05jOCVfj-3h
z`cboV7&V=Ph&2fGLW<?(>tsA069J)@N7GxBU#O&oNHUGy!J)bPl=tY|vpg2wr;#L6
zfy0lS;tIT&Rf?}>>F||ogR)TQi$yNX*8~`C6UZ^wp~%^evOqs{kx^8nSJLXHVW?jV
zQ`07xTeiX0u?Mc6gG>&`5MOp4;i1#;_n(Bfm!<zKLZerantmA#8{S0M_Kz@d<hvNX
z##np%Z_sl6S7^BWbF^LjInU3}(0k-_v~T|qb+eCAv*rP6*T0OGov)+s_{W$x;&EeD
zFoYRz05c9B)_FqM6is715XKZM;Hvg!4A7?9d>(W<oEWirFyr&%+W1mjogKj5mEAZo
zJBH6_gTK0Y4d1<R1`im4&mP*3`!{dm>9rfUcl-qI9X^OV`}XmBNAb?Z+jz`!nHTl^
zsk1movpaI?DE1uMg#!n6;mY~bczWwLUcGl8PaZwOoA11Zx8HvkAH4exK6&j;d~o$T
zcD1))TPlXr4fVL*(uP|!kju3VxY*Q;8H)wW39qfO7<Tc(>}zhpg;kTdwRZ;&Y+i%@
z?iSQkr4S7T;bXK65_U0`RHX$enqHCst18u?j`6dRKx<-Yq5ZXs5fgYFw7o{g*&12J
zn1J*!###tDZ84*(K#rxDkQ2*Ov9&Dc&sDNvkrh{Bd`%hNFv_BzEHZ^mW>v^Gc`^=9
zHdB$F+3Hjh)igk*(A-k80I9@Z*&&TBY??n`MZl+JG<+>>knpP_{4%uG1pkKgs8u@M
zC=%Eadf6(Cvc{e)EGod0V{W-Y>7^^Fbo>!~5_6w}PajGC(f9;@veC1prU91PCYbA6
zVQy%Lh48bqbivx*11l|}lHjv3^4b}9Z7hzV<*>6j2tU^-ZEu-?Z<-Z#HM~=6;hWw7
z|C&v-y{!nX-+}PP-2~qr#J221e5-V5(DVd+f9%wF4!xsjrs*~ACH!Qz27%WotG67z
zfRz^?VBPgM&@g)hHB$#MwEq@XUw#W)Uj76--~KN4fA}R1zyCd4{njsV_QUVt_!}SN
z=?{K^@Bh<(#ZUk3zvE~B@?Y=+0o~vJ7C)rvefgW;;(NdTJ0;%!@VEbj@BGd0@%B%C
zg`1y#fs=244f|ew7h4Fs^$%ai8p3XpkQ=}C6w7Zu!RYlz7{2y01_`$Qs}HdB%3XA`
zbX>Yiz};tTy^Ge<H_(3iI=aqYXNA3qK9&L2k8Z-Q`OrC36Ye#8PND16E%cv%fQ}P4
z5ZpxAt=$Ia)JE8rt$}@b8o}{(Xxh9F!;JXDM~<U=*G{BXjWhc8!rj{ecXvC2eLaX!
zT2i$02rbVMt44|0iR@CnVtI>8%8;cZ^khB8!V>7%a1Jh8j;6MD<q?X&PM+l{M>fVj
z-Y1%hN-WGR!&erm@s%YyyqIUf3xwVag-*Pv@?m~y7>ml&1Ye`F;g8xk2yJX7Dl)UM
zHf@KaV?R87#}OPkkNC>#$jsbB-G;|Vt-g=Q)NO>O?jS;VB{si_>fK-Gb^Z)JC%!=c
z*&kxy(l5|=<*(3l_1EaQ_$$W$UlN3Wjb#^pioQdipkv!>=-Bi!IyOE)*QN(d0Iy-*
zL@0#yi6l0JVp!|)VZGCbO+Ffu+lvv#xps#YRYo1E^*VIf>=<Lk7`53j;5J}c#)a-G
zFV=K7;PtJW@YAzr@!b>0@OamDT-dz>mk7Gk8#iL#*fMPG?Zjr9)ltTtOKY~`=$dWV
zI=K#ex9-5H6Gw6E*naHay&0$X?Z7LSFW{ql_werByLfW%4&Hz7ZG7~#_wnTA2Y5t?
zK0b8@dwJmwG&bQ3VRt-PgA=g~&Q>?zECILP<H3~Gj02f8PB1Q>YHGn%zJ{yoX0UOj
z2g#5VKCeqz%TDIU1{^j7EhYpj$`K~;VuV<drAAwZ27ax9hE`1gRuO)+gkmG1T5q->
zV>BY5)1p#<S6BcQi!#nmA}$TE++c*w?u0)OL_9&O<IN&_S}4oAH#8{E;-qsTO#n-e
zY(rfJjci~V({Xe#vS!%OIQjQ0Xq+XBvrw>T2@11wQIcN-8~=Zpjg@p_2sD!fWvZ6o
zBn0IsgGB?8C=6B`3J5;Affp62u!J{rwzfi<rY7(!FqoB24VBF~2R_~dI$uZuUwLQ_
zeEL`ldRm{XzF{Q%Otd~jO%n{YEig8;!PwXiQ%fhzZQZcY`mEgps3iC*`-Ta=Ww0~u
z3jCbI<BYqL1m6_g<E!DBoP~GQ`e(Mc1;KUOX?wc}y}gLC^2WC^;%+1Kb`W}ux(f7m
zJ%iqELT|4?PX;s}V?|)x-FF-vhtFf=EUoUwn`qv8Mp^oO`RPa4c>g2pdh=U2^vRbv
z`}xoD;4goVC%^nhT&Lk({q#$`_S3({`@j4RzW%G<;K%>`?+W;S{x3ZK^S|N8zh|WV
z?LR8d^1e;Odyl}o``Pz#=ABQl|LHr}{_-1GfA3YSChVpNyA`y!F<RWnm6tJe=>hsL
z-b3GoJLtJ^8(kM}qT~Dxw4A=Clr~23PC{?s!ac0K@e~^#zmGk#GV>eXz>e2G#>BO!
zXxx1onXNK6`7+uLUqNEaA-HDN!^z0%WaCjeyb{Kq5g2K1?tv8q$7*!$+(YQ?rs=Ff
zWOM|co*sC+3B3LxM0kBHgq_wARF)fCqA_5xiiW3|gI>0pjbo7-MQj*l?A?l~X*RO*
zh^3bCHpRwS8r2M9{GOG($SKm{r6n3duZ*Twf%ydlUf!JPy~KzsD<dz|CY9M2Ii_ah
zTDwr_9!5!U0@~Di80&Y^@)%!-8C_={FqXcC=Iw8xe%qU<+wnH)_kKu|{}heKzl(;G
z-$&!=FVS%7J7_-rIhy%-`<b7j_wui?^wM9mVfi&CAN~&GH-C=dQ=eh@z*|_c|8=Z7
z@EWG}JW<NL@jwV`X%TA~ho&8jw<P;&OC?s@9T+FvdaXt@8?-bZ73v6qrHn%Z4hQ=E
z4s7Uc#q>ZI*7S7X?#vo|@7M`^arPA6psiiswH1d~O<-qN2evfVV|8r|8yYh>F*txz
z%O|mOY#OVER$$}WS?t}l1-m!SVDGBsxVm92K0I+8U);KdkFQ_HgY#$b+RKd0Z@-4y
zx31yr@uN7rX%jZKbzpN<9rk1zvBek1{zMIq)wf_1O=`kr$Ihw*PPR7E1Z#1wxe+%;
z2bDGSHZ5&Nb<hrn%Z5r?t}OfHq3wB%2Ke~?LK+Pcw6i2FElqgU5MK3q12Qzd2tSVz
zLREyGl$gl~7k|f~DuzaSVhO*pavdrdg>AGfpFc#{rIbOOGF?nr<yhv;DrP5p+{$8$
zvX?*_O?4?WWumB;wfTIsPi+Z&B?a))9z%@RVS>u5(ZN=%fwf2jeO@uj^NV4tFe1(C
zS0~`hF#ggItDc#ow8$P==9@p0U985E5)BqHy1Z1PR*nKbu|0XVC$?8&b3rXJm)5WK
zhoKEbp$o=P9!V%GZ&c9w1b%vgucE2}71d49*R?=T>oX91MuN}O)&p~AA1qR_dIkx;
z5mYkn&Vi3nmlf1CItAB?8Ms!iq3x|xY%j2SGlH`caS6SRdlcx!w(M7+r`VoAkH#m1
zm239Q_1-q_KgJ^gxSs%&0m=tt&h9z%p1g_mFMkco&eOKezk=<rd=p2%{zIJk_D^x+
zFMf@OfAu@O{Hx#N+-E<+mGArnPyXU>@Crfp_Rs$YpZ)DW;*-Dnt+Lp_5B}j_@WVg+
z8_U1r`~3X#-~9`|_N%|cqtAbg3-2=`zw$nIFv@OXjGdv?t)$VdxcV|i3A$n0<{+bJ
z|Jl3fJ#&lJb_1ORT>GgjXgx`I@z}^H-gf*FMreG~w`hsC-c)ujo4xirw%z{_d!K%a
ztvrsMd4SFXm(jZK0;;wgLtth*Y|B@na@jbnJg1hvF&Nv2P~OlBT}>BDCu3tb0{!EN
zkFG*=a033$WpLE@!IbQT%9Tc;#fK$jvhJb*3rpAts>Ss5e}vxRf)W_m5Dt&8L{mox
zs+mZl$pqmiWAf67S7qRj#AtaIyp&afuPjvKtBcF=mBo6zkZV!K=)EYLc&ofvq>U)^
zY_s$=$Tc+~&)SIs#{i1FV^D`zp)|3UuX!8O!eInQFC#g94^=XKOlDE+eh+O&K1Bxs
z*Kz&_=(zkB=)Ca@)L;D)wU@t#np2;n{`mLNcKR>ScIId3x%Mkeyz)CtJp2X5E`Eyf
zV{c;9skd?9!u!~N<}GYL`V{l}Ty9K6qgWMToFxRNtyauhtk^){ts~qf?3GyVv=Mej
z#=KH=R2Z<-T8WJ<jX1S-8izKmC9EcJZS`u}-WI&NXEW|@TZ>Dys(rmZIM~vH1B_j3
z(;;kbNZ}lzcWH74N7rw`s<Cm5FCW45_z*Ua_2DF;^l0sBytiW;zPx?~-@bPPcg`Kf
z?dxap+A9z6<iTxRJaz<Y`}#4FOk*|GdV8uC8)+na8M}`1;;d)nT~BzN9_+%gjz;W=
z1+hIEz`>?k?CWU4RBZyaUK`vl2h4nbX1>=-Mpn08k3dBQVl=rlKTa{WCJ4Y}nHCvB
zFT&5AMTKyb6vI#R^DzE8S<D(0D%c=Ylxtz6Ejqj&ctc?%QnKggvpKP~1Rae`%uaw*
zB~i4#8Z`|v!-DZO6G1xSLds)C#$JIMs}5D=DrD3}$ngKw5PH?6<;avVQX4AKX!9Xs
za>8Cv3f+=?*mVYFS^65bz_PirObe2QMP(s^NIVIP!;Mm-1u90_e4`nQWjI2aP66Ly
znqO{(X>Jt_Sru10HrzgFJOOCDA(Z(e$}&S`JjdGb9Q<^&>he?_beRT})A-73o9DpS
z)D8oqu7Th)(fG`aP?nw{SeMfH`Ut*$f{!NR7+ML(vQ=>W8TdBBzj_NoYj-f_{s;Kx
zZ0~>qJsGfEMbnes+ggH78K<|O&^vextui?I@EMl#=sJD_)A!!T<gNFx_T`V2O}lUY
z#cy%@XTQP2U;Ql}{ng*%%x7QX%%|VSy&wM--ucZR@Bz!We*a(a!+-l<@!jA38@}_~
zf5Er^?w<&{f5jL7`0x16Km02`CJ<ly$uDu8vG(|zU&HQ)Z(-BzSFo0LH%YTwapfVF
zU4Dq+3->WV*!7*h%@{`59leV7qnFWg_yU>^pF;y*XPqphaO51Jcmpf=J7Z_=p<~Bc
zq*m@mcyKeSR_;Z^%wcqFIg61acQAhL5qb_?MSLxz;P?jEM^?hpKLSJ7AkX&zjLicu
z)b&D}?tmuV40WslWr;@Q`%+j^8Nw2C5V__kiYimQ2g1myaAJ|hh{dHcbH{)z+FrIs
zkA-Ydmz0#EfcIOhss_tfO(Vv{B#%>)ytf2+b0Y}Slm;L2O0;-kaWTHSs1z?OslZp4
z81U5`v$E-rtdK078H-B!|8)uEC>#E?pwQNdVpl&(e4|i@R-!aM!`HomiD4HUT}R*>
zI*ah~%ZN|iL}vXX)NX&1joSxkKK2>9Fa8Mq_kM-G*Z&4RulyC3-uWfEF8&NHCx48F
z<6olf@=ujvD$`GYf$1CH#Kv>);MnC4aQ)6FxOL}aT)6fw=Cv^1^$`lAu>i)y9?be&
z*y?p-uh)xxEE@^G%|0LFZwSjO?HHsL3^FpE9vQ;r^)ooKVFt&yti_|ld+_$@19<K1
zKD=^j4<qapj`#H9I&b{Tt)19YSA|`)t}|VIxVd^Gqwg_n-m)2M*R8^)=`oyIGmiHU
z?!u2w9>o{O4&vL_&*0-H*Kp;^A)LE%824qB<XhKq_uvt1XlTJ~IE;<qAT~xqSnc;>
zOD2I+gFQIEVh|T6hjC)03p-n?vBF!45r+jULq3d!0_gR5QNx(-X9c(My;x{LR$T>S
zWI4QqS(wm^)A}NWSg=e3A7idOv$K^H!H}N^eL=qR{LVrcS=1_2R_I`_G{env7Knxs
zrT)v5oSM2u+8s+xJ!5S(WIs^Fz8F0dwY+(1Q%F=r5l{LN4?7U|7?H4*p~_T(+KNIn
zX$#R%l8eT|e6$p+(8L?Dwy+d+D#q(_8``XXMr<!(Y=NCOo}E!O76>SVn8o@eO3!Wn
zUBlOvfR*-E;qsuA1}ZyR71HP>-U{@VlvN;CZ$h5YiULMm0bdD?uf!$$n+Bm~+||(h
zN&^HS;a3_?qAXsGGJ>xxQx9!*BU@_1kEUHg<Ev=ugub;KhW4c}brO8tgRl~QwqAmd
zR$}X4!Ad&`8;$Qd_&ihV;ho-uz?y9c{0aDC1Ybg)?QPqS^fU0uVCD3_!>HoDP<>!-
zkaF|klZ1*u?*cjrnx4}S(0ld~#&5ra6W{p-?)~x~aP5ab$GLBQ2Z!GM0EgfI2JZj#
zZ}9rB{}HeL<!|xUFBw~Z^$+;yuYOO!{TF=x`~QY-{r$h;+rRsFeERqQsw^Az%1{3a
zx4uhb{P3GN`09rW@TRXkz!YIOdF?S)UVnlW*Pal3k1$Nj8#r@Eu}%eghcBV!(0Mc+
zI*Z1Gr;*xz0BuLlV(`p$^d7#5x^)K;8rld)%M`4&<FskU)had;tuv^a+|Rgr9yJ?|
zz%jx&xO63~y~}CTLu_RFP*K-I>+V7Yfv1f(p)^v90&fh3z6f#|Kl7b_6gm9JH@lIi
zck!BAkfpU^A)%MW`)467Z(*^HmZu~1N|CM7!OBLjvwsjxon6W^;#fLK+pANSkIK|l
zBN&fCYqDT*o(eB6Q4xG)cri<lmk2yDJ=yq2mV;j;%Se@lkVWX_8fs8rZbp%<1I6wh
zl=%Bm79B%HY88w%>tJiyu1q@%4ID#s_!JUj=aHGZhN{)Kk)FMW>djBlcHkp)o&PQd
zp8gUeZ~ZM+zx^+mdF7wcf90<jb$@{73tyt=`j=RF_Y17Q_Aw4$d=Iy7d<}2D{7t<3
z^xJsy=wr;QqnWfs{OF1LFcNlSO~8#^K_5=g{LVyU*zIs)m(P!#u{c(_Jy=d_SV6*{
zA0NZnnUz>SvJ~r=EybO^oAK$j)A;!6F}!hfJDzUafE&ZZc*KbFuwyB1_AbTsfdO2T
z&)03i<I9(E<??ylxOx^Bj_k(G-J9`^v&Zq%s~7N{bI0)BrDJ&e{so*rzZa)Y@4>Y*
zhjEvdcX#`4Y_D&|7M}F&$tc#xLzqoPu&J>MCuxl5r<Y?#cN5mur?4UsK&zFIFqNZ~
zU}`j*(8S7K%ZTnPH^E865m?v=JWFXQOoW(~fOF`|U{w=rjIG9^0>;QZl;!0>OW^4U
zI|J>`Q9;`?8xe9lk%)zr=W-bWO~%rtX?4=Auk^kWLUS9Xh}FsFDM?1q5JC|Lf&mMH
ze!?zfLPOMw=AePEtq4`6i)q#xw5oE^khKU+S=ne|lx^e<)>v4ECbbcD8Z&CjDv>U;
zBEr@>WH2M{cB3jDSBx*6k@0(V%4R{ys#-XFVU+QGmlF8KU<f*1Ly5HlJlQ5v_QsV>
z!E+3<DYyegb~h`^9QaDy0jLN+0iTAEw=|SMDQ&MbQH|1M4O{6tXshPnS6)l-)we<4
z)WHhb&A3bB>*$A-;Hwny^)T-CE`x1w9F@Zpit#y@&%nKMEy1?|{+TTV-*y_`PK4L(
zLTuw+BnZAF!Ivf|GS6aeN`|i-IF6cwER4Bz2T!7I|0y&cx`_7Ux6pCoKDtgmz}WRS
zarTS9#^c}oD=vQV6YO~PO^jc-f)ii=Hm-l~=eYZ$U*R&X@5I}mDAU$1fAk&P`__-~
z_<O&=!_R+)$6x*>9{=!Hxc~iM;Kp}<g3I6f0nUE%Id(sJ7pt#5#PYK@vHZdvOb~jj
zZoh`<JFgRdjMY~jF<#O7XnX^wZei((Yv?|91sz8&qJv=WIB*sN{Jdx1S)?bo!__$n
zN7FbQwJT7W8bx`i7xo%japM&1Ez@u>T?a?sI+(g=VD4Fkisn9=br19n-6*ear`0t<
z6RCqLSVfp5QRE4ukk>7j5jEfLL>{mG5+;J&ax-$YW`fSZ=xN}!r<D<SFEKR;^s-gu
zC}!PA);3~bbUEUgYQ)oNCX_nfM;SKMvV>F>?<Yc3rKRB&FxILVYfJH!CFOWA*F@;a
zyjzR1+nKDLmo0N`^=agr>X;_ll#RTLz1=7Y(oAE^V9ZRRvYw4z`%Xlb9zt^P1gb{Q
zpmxPY)K6YT?bLOoCTO{n*O8gMkD48?qUGEt7<}+kZ2j7Q!LG0WuNc4o_vkqHW92zt
z%gJw`@5Bd~IsG0EUHAYuZ+?Q;AAc9`z5WGWedU{&R~3q)j)u}4_M$i7#B9)ugRuy%
z&=gM4@b<AH?W0X?i$^d`7|QH~VV@hj2bN-Ee<#Me>ac2QBW}>xzIWyrzC3;aZ_iHR
z&S)>LFiyV3IQ#zKFg{#1hR;^7!R?MN+@F}lTNlpa<tyj$;PM&VJ$D>0pFe@OuV2Jl
zm(Swfk$t#vXg4k&+J)1E;UU`O;nkBkvT_`I8Ev=JH)1pOdK)3LraFaX$uK4xtFdcr
z5IctZF_uZ8Nrs;o^+>a17|oR<Ew0XBKpi7qi1E(DB2&gpjHwl6vWA^3+F1-0W2Gzx
zAXCMpb3#*GgfiNljz44M-?7^)jH4k`^E}li<ETw1P*cU0kgr9a(N#%rDkFeI0ofs{
znpRg^%N8>2M<is0-&u;7o6*^;K{}*GO|%?M;c~PD%Fy5}LW3m_%@w)m)E1yEKO2pU
z7o#aRA5HlsECNG4nspX*+x)02twbchjPK2eWTh2RHe#~zw=CJ6BJ@&(Z?)_{nyQD5
zt-Xd(UCVQ03WNwhxfxB$#z6wT1&lP=dJA%mm8>Lo<XN34u(?r4_!ZF@N+j|=1D__6
zgeID%@l`>WtVKD&S5E8G%X7Y(W*8*yHnhXk)CF@(56o>#VG;N-j#+z#74TX6rDvDm
z8<~P@Y&E<q*TFwU+ne5s(CY076MWJ2yAj{C7s)OAkz&+MZzJ@!9YWonQ>fl`g4V~f
z_Y^YwP9wAb92$=@PM&^9i@Jx+uYQgTpZ^y3fAMd4<9GiP2S50(vH<Dm`Rh3G@fX<h
z+DF)M=M}8K^%$G)JjIR&uVe3{w{hsz4-^O<edB95@X80+{rDYhW0c+eV6MYr-Cagn
z{`}01moatqKBfr2ncGh>d;blkthq<<U44Y1Gq=!r@I0D#%Yvt}^wk9xLT~q3bZtG2
zo{gtazhW2sEwtYBAneHj*rI(f`q*IjI$@6V!Cti-&ZcQt8dlM!S3*-W0bS!tl+|{j
zjOV&E(+G8<7OH3!ii1hoT?_>>?#{vJY4;$P*D%LyL$<-nYiv^XOIgGi`%*~-P0yqp
z=a-Z#9U1f4sOD?R;o|je?d(C_vpKdhIDT&S1(O3C)KEMDt<i?X1zNn6tzo3qU|v=k
zVW-Dea`?CNX?lf~gs6+L*^gXZjD}f7(`-~MPu9#+g$8*44AW>=qOxWU+)Y~$?%1nr
z_ES4}77Zg8(Yo>mS|)FycKjOBjJxRxLT~yOQfqFZddq#Z9(oHiPkxELpZzab|IWXn
z|LV`sdgR+^+4D8@?0Fw!2j0gz!f*eTZ{Xs?&vEnV_i*>sFEB42iXjt?pg!zFcff_|
zfERli9nbNNo%9EBh~V2F4q;Q!udE$6YOlmH#>w$a5-VHlG2BsumF?BIuzVQb-m?oo
z*tG-iPb|lU=4u?H(Y)T<jraTd@bS<vzPWM|uMH03@#F+<ZQg*hTUfSk#JL??aBlZ@
zoZGVl$N2f~nMrJ1HiYdfmSOv{VQlQ_!rJCWtf{TVEGzLkTFkmc0_&>Mm>_I=Wv>c?
zZ?K^Ty)sqH=|tK}0O<8fk8Fq$G(fOLH5ycxX;DkFi<OqauPKE?t%jL)r&SlDL{*3)
zLa&5ydp>nc7WI@h?Q8^!>|Pck>=LO20Y;dy)Ux%ftER~jc-4F@8J4sFkguhtk)=M3
zM8c1t*95=43=vl;s(g&9o&wZ(a**+4quQ6tMyD7ZQ8ik81*o@Xp~aMgE@J_@3BKmS
zY}Dmtp{^hg^@T;KFVWESbm*@1qC@XSrl<l*H9^Yv8R2`E`MOn%)@gyA+^meaHC4@s
zv0_*q4rmB&t;2<KZxF?d`C<SI3B2dflalkFz$eyMEOFNxf?B{QLlp?VvRImtir}O5
zRitVevFc%{YJyREcI#SUYUm*NSb_fpe3s6EKY~x9u48ZluF+|Dme0bsasvXBn-HDd
zfiR6vN@!L9xC5~byOE^zWwsD}n+~CR`*DKrB&znz0k7`JRWu#HgSHb7F!SJ3?0M%W
zIQr3F(&~PJ-EVvcBj+Dr@Z4=|CG567c@ML<o?_zSU1iVP+3OFnk?`9}sO@<82DaUQ
zjUapl>ux+^guRcIw7Idfw=jJAItEW(!O|n=2(UBgVpNr5|B1_3cHtI{?;)mdKEdkS
zuVTfe`~10!XxVujb(;^OYTX`$R&Ij3e>HraEX}J>nI3^5Ok?x2fso>BMTM&odQXe;
zH6tNtPLD#LUIul11SJU`)5FlD+fd4eMjfw4aU_MJU|fl_`R<?syqroGvds=;^Exh}
z-7VG`u}A@4IW3PxBc`Xvi$z*2QdJ;}Ut7XxTS3d0#c1mqT9jwfa_>AJGb+%N6;-?x
zrBZ_fi}K6yLKY2=z?+vv>m&4}yi`DwB=i;)JCRN3<!EEbA@uSHJ=p_8pjYJYgF3?Z
zo}Prgj*T6oZLISEs{3f{gkIZ<Yv`Q3jgD1!(KK<B;G^-;^lDb!KxWl-M&4UUuDyfK
zQ{TY!tAB^dSN{P^FaHJF4t*P~yS|RD9q(i0z=xPP{vp<${|MW!ehmlje2i;ve2IDC
zU>FHT*{YBaZGIQVeO_!O?Dq2pKk1L)SRjJE9zWJFzxNdtqqkUv6|69$j7ZC9GsA7Q
zm|!{7-HwOLM)BI{2p;rw;$R|(y&*TA5?CJ)bRQ9V?+=gQ4gSpS;UOF!ps{tdV?%2z
zHg|UuY&}@p)rnQ@tr%}=zzAEE@p@WhV<RScvX?VzkMXrmM&gXK8BF;7{5XW=jJQ3q
zAnJo|RI?<#4)`qwge)d_%XM&<Y2c!@xeE#qEGR^#M2#9%8A6PUzCsneC8his9V|4y
zQW~8MEG{AJbXqM8vUq^SN?`fsmUl|aWPjO@qJco6?(^dm<E%1Vgy%wfW2I#HaWo!A
zhz*I)Qwfi)0)8u@WYeI=twocc&~ndboP7x`p#^9R%tu3LA#E=Ut$}Q`x|X2Dx(FRM
z+McZt&H7wa6)!-hcoC|!d8pQvAXQv~M%rE@O|Pn0kCdhiF}~jr<9o>EQ&wB4uBo5v
zKw(Q=Lo2AQZAOR<T$xFaV%n~X6{Ads9MAw>DAp+8697ud)|**jtjMJ${3-a9sOuy6
zXaj0Cn56=~XbReB2IT}_Ipgkg@EOGT2tF}BV{<2rO59xvQ+uBRK1=ryDwmGJ*0=o6
zz~>!jC75K?o!W>H<8GADi?7?MtS+3{ycbopzWVJ)_%Y3I>p|4+J%Q?dr;w)oHPQ5j
zuDp&NZ~q9p-ue+Xz5H!j+1D{}@*$QUzl{w~zlPoKehcH5A7B}y>F~*`N<^K$bWhnW
zZ7rd-o(8v$5L`{zP0B_>XRc%T#1-@rVBLq#DXTOq`*Z9)j^>?*QMY9usy6Os*@HUT
zXyc9}Xx?>9DQ$aBpliQub$JdQdrm4##7CBIf~RXWZ1v-)Nc5vD*oiWK3(7nVD0S8H
zn$)1oU58SZGJi94kq(qax@dA8$PaX&Fw{=7tA{F_R_v~Tc9+jMn@f|=wRw<ZabgKg
zYzZ5$#X1wOE3HmIr<TquDU6{-W&FGX3re(jF<*rxC1uc>?FdBW*<I7z^vCpEgDC^#
zrHJ*(sw(b480rc;=I840RRZrT**eV2CIIscEJi|a&h!L&VtP5+D4~~`gPufNcP~oA
zBPfrrfVFxxEpHPoj}2Nc8?vDjs2e$p=J88tVWe%IxQ52@D`=d!f@Z?5dFBS{XKo|2
z`elTsAE9dZM;N&NOAOulTeO}137QUnj`l;}LhqqZu$(|#bLmrTzVQw0zW)hMzV>ZA
z{_bC4o{xYFg#t)0);2J<4tl*<;|*XJTa9C(6b=Pq*hv7c=Z(LtREH57#wxcDlkpUW
zX*vUqRan*8gxxKTI6?@VVI{lNSdTN!{FtWmN`Eij9~s35Bg^m>!S@QGc!R(@+S!Fo
zwe?s{fUV&Po@T2uL5mxsnT!xFqbwsyiLPm^5XdDn7$@*nvP?x27-j4p_IWTI4PmKl
zzer1sIIRdcEHo+$+-Am5Z7J+(6&!_yaOLJ9oL`7kQ3;X-CGafHg)6%d-XaYGtmMve
zJq)xZ9g8g9>2^2~@OYK|YpeMhWF?bY78w_(j7qO-K%7OHI`*t*E5Vo>OGFSQgu?`I
zz-@)kR*s;x3~5Ih>YRjKWg%+Jix^|)p*{E&bcbF<tN$yg_02<l-~}{$USc$V5zUtQ
zXfiKEi#-pmwjxxQE<sAQ7*$5b-^y}iR2pOowX{3~s?|niW=Mio8#VD9u!T?Xx>Y5s
z(AXfo$84q3wWz6WM3RPH>2aXgU_v3GR_^vI$Cn5_*)>gyKu_Xsmhy~m4t)8%{zZ&S
z#S(KB@P(mbqo@fIe4zx&!bt^uI$Gaz@KsdNSO~rf8P-DZ%{}Mqg0ZzndDb@<cmEuG
z?qxFs-x_#Vi1n>Qkl+i?Y(`}D7M9J3uGxb4>=x8)+M|Fbvw07y<vHIDRv1F1_uPG~
zdHG|kd-xe<ZhalgFT9PuV~@~%_zos-zC##(7t?p&A_TAT>zB}f^b$r-$woo9Fh!f2
zk|WDFLAU($RSX~J&m1|Ao&#sl!5H1L`#73*9zg@))<F2yF?v@K$Z@{D@YDtbC)OiG
zV21_R6FjoSR&7QVqk7|}{b(Ynn>HRq<LqH%R_unqYYj}*Lp+ZyG@xn}In&CJ5|uNB
z5|&b54YZ*~o?`;TTZdd%4FN~H^QTedor|*h&LDE_K4e>HcP2Zs479t7Ip8fS)hobL
zV8_OO5#hFopnHk;{Q`cxu(%Y9_^~|4lf`Fc>ZA1di`k`_7@nJ+95X!1%phgGMqMRd
z%AwuS@V-LeeI=&?FXWr>)jURK0&jkiP1(>}pqEYP<r)}oEsVF$E|hrsP#PG4F1&&k
zI}KOEMg-b-Al`cbRl|(8W2ezPd5PsR8dqK-^a#MoE9hK(6Fuwhpnd&Ap7+;Lwe3AL
zAN>w`ulyXHmw$$alV70a^ye75`a`UJ`RCaG&aZLy8-I&y-}wz5{P6Ga=CA${-};yT
ziFt0HAKri;A;#LY%(~#`V{FMcP?xu|1zFFNxt=Y^cD~V#w7%5@-mHL!CNYpmp|^$+
zv$YXBnwqeWC+R3p&IQ`rrQR-F>+i;$?oNWQpJf0~1_p7nryExpcMmi+V@swMYZDo)
z<=<LURficuaDpw$2><RF?PQP<yDt>Ma4dl_LUxp9G3*av$QQtHP<mqB=neU4dLh(!
z2`MY#WvPIZN4udMm9#utaWR~Poj11t;eujB3X0&(%7uf_voFbqt56LewcpJO?lfB9
zbGQ-Z|4j0KRuQ&!ywI{Tv8-YutDRI;^G1+?nssaq2@zRVM>-AS@hH!s2VO$gX(@-(
zREm(joKe<DlQR%(T6B0-==GLhAXtcjST_113(@0y0iE8jpd-8hJ&9~A4J|>t?IpBX
z7NB1DA{x!P=pZl~bVangJXBSbp~YrHeVKvK)1g{rL4)3bDndKLxE`x8BgNPow+CqN
z2{hH!p`ON^N>?kbw`|B9N`zr>x=_OCqa(=VxP*<_5}KY=4tee)C7Y2bhsGz*`0`~Z
zEUOE}cE((Suh_$A%&05iQxmWn-Y@?dd@`&>z*j-T6!29vv=V$Wp05k0|3BcfGwM2r
z2)@xNc*bVnTd@X#m9qrNT0%sYq}qfefhP-)CRT4m<F>uZJ{y7Q^@yz9iq2yfvHa>2
zEW7j+!)Ko|dfr3Z?we@Xc9}8r0cP)ijHx^Cq4yMH(ZTb|67xMYuz{l&G0MoQl;f8%
zLKqIv==x*{`h7II-N(?n^C<rxtxn965j(wRn^ID%w<1X!jjf`s5O@K?+D~ZvmeE>A
zrr;PDhn+yT_b!L4e;k34RR}Lzi`v!uQ8l?2p~3YqH4LMeP|I<pkmrh{NcM|$$DsDc
zp$k>P7^{b#MeR!?-w{KeJ%U1K7zK_X@@;;aIPK2FC~2^<p^|5JjI(7j97THP*^sHE
zXSPf!iwHRxmXgKm{8D}i7PGM{W20vC1qrh_qHN%kyzj;Ao<mQHd@P-F@_a88PZN4J
zLQhPOg~0ntt^qI7@LtHb5PCMH_jZxWjjYlja&$2TdIi=d6uY{h^3rboqcB7!V6R?>
zK<joS`WS6TkE3Db88l9wN5|UhXrH}~4wlX}H_)^87Wy~b$IzyS=-v7R&AZ=3+lf!m
zd-;1<diDEgyZ9Y6o&OZQw?4<zYd^*D@BINU|K0x!AN~9P8z21Pf5AI{|4(@T_kX~r
z|NXyVo|87?q}_Q4ys+1UT1L4}ZxDlQ%~!ZXSm6p_f;abS+SN=fj%k)z0$`$=5tQ+%
zub~#BjdfU0>)AsP9BF95@s?H`Y;M5G&Nf_VYsEvtPRiZBew<`9Jl)ZOJ+%$kNE4YM
zOea~y<W})(lMRg+<q2L!uneZs=;i-g8i}EYO5IHZm4!OH`5Ko7{b;4Jw0fKbiU(DW
zN(4+5a2Ry3mY2aw%d6C=;ba_l<rTq`TZjNH&np}4WM#AD!On~B)#@3A?TGU4#u=Y8
zyy*3O9aX&0H3B<=c<z~oXuiy(s7HNMqcYGqBZGk>0R+5uc<lywDs}K!O5ri75O-Ig
zHfTm$%!am*0ga9#)R`8enU>dKUyPw>5&9!}SQ^Mew{H=;f{W1~%O&9RFdX97ol6*N
z8GnuZzFmWMlZJpRKt`h?<oNeW4X7gYY8Y?ZD*b4*+7PGhg;f;@0DeG$ziS9Siw|j6
z2-WepvchteEIUOjt*T=LuT8-j3_)Azgi3~~h}p@;;H&@(SQPNl__F>4e0c<40S!{X
zSHuWaB*P$hkCX`bXnjhr?lbU}MbZlR<T;;CIykBtpsQ&_c|GCB`$NEI_+#9i>)n;8
zYwKO6jN@|-tyJLWS+)wnm9iG!T12MSJzI}&9f7w2nKheGy_R-0xkj1Vmf5%yqvvj6
z^4b#&ow|pn9T#a@`w5|asNQ&%#`gwh?tFxfBe&78=bX~HAlBBt_ax)zX~xg9=sUzh
z(Dm~39{IX_f7em8Y(GS+JAm3vjIkSNm$bIbY66c27pKWZR;)#EYz_Pbom)C)`WV}~
zhGFX&g|lxA?tWUV*lgzztnCA+?BMr0`5t>#!PPSfOUp2{RUJ^p8CfIMP&2mbV%4aK
zSHqfVfTy7YuDUjuVzp4aqbMZk@~vLvSv<%!x!5Q<kgcyoHi5T9E1eYv1$Gi+#p>o@
zN0Vn+Sg67RHvTd@N2Vs0lpA1ny5VI*A7WjT6;$M2&#)m+Gi}L|@bdYeLN82FQCBz_
zZ)tbg1YWkD&@<wN0xQ0nFN2Z^JzCx(wTJf*p;!K=(CZ}3`=Af;93|MWHPGC<Y0JY$
zP`msDYFC_OIfcfl3yiXt(YEFq0Z7AJe-{IrU&iS6S248rZS<f141-s{#L%@LV(FzX
z&~fp*=)C$}^xykF#$Ww04t?<lJpAqdh4=pH|Hi|g{Q;-G_9YI#_br_H&W{N_zG1t|
z1v}%SOv?%rf++%^LB{FO>;_^<#khK-5%ffY7)Zx4)=-Dxng$FIa$T(ay>)~jEn*`<
zxP!ph)zE<LjJUf9!;8I3@nG2)ZVwLOVpkWA5_-o8y?t%%*uY9SMXOtpNMej9cx7WF
zR<^ca8KE*rLm6N!>t)>SV8w0r&uzHV$k*HC_MpaDi4=`adS+#Hl(fl&h+Yp*Iibf$
zYg23BVpR20>3y_0w>-n63AxI(%7Eg^QhwcJL7V{1L}RGt|Ciw<GQUvZkdj_2{{Oi}
z0A!^Twt98E8CcR)38dq0#A$GTdkI`dMrA`bVV6VO%twe3Hek+2)K-j?Q;jO83e~nk
z)LQcxO>@y~%V&fwM6<IPZN3t;`wGzRT7nMe0`vuzU@TFHLBg=txe!Y&3(?P??{}%u
zW-3ITItTTtLNxM5t|s)V)MnHg9H_TeDubxwDm}u52BdT@1$+rwaXOJudT^DMT56Jf
z4{3yXb321ks3gMj9A&E{=BSj)AvUl?9&8hQvUQ|@FPCvCpC(aYa-hJ%sB810$ib+~
zMp5MsLgfk3MCQP!#9e}~ES7=x8TjUgv=Dy0X9Ro|jsG#M#n>(bnddq;Y<&VgTG8+X
zyo8>A#WVurGs-cza)#iWMS^CRq4fnuCgJQKLwI~OTDKoW?}4-E+<hALn~oAdTVd>6
zjmo}_1mI;%Tzwlu=boTu=Q(7y9!JgA!>HeONO}I&BG2Af+NJRGw(W-*Mfag$BTaJM
zE=JQGiq*xZ8Lt^zBQ(0`_&ORJK{qr7?*IWku##{iuzSZ~?-_xkdqe@8xwRLj<{lUt
zyP&Tp><KVqQx8h3I-#qj>9!5S+rJWlLH<79zqOebT;BpK&ykhqO7<#=boL_F*#~z`
zGs^uj6jc&-yyiK2J8~*$cRCAUH)nT?6xcm`c1Pezd|g<i<~3CldOXTCX5+u4uozmC
z85SD9&E-ZQ!gQ0#D5*upc*^{6S;J8oU&?zw#rh%8lav!n*P>MK#=<-!Ud++c^bCZa
ziO{S3e?V_ZX@K{SJkv`P()DaC+Zk_{()6SQXC-WDT5;26ggbX2+Or#xrF#(S*@Fnp
zFFt$}DMsC@@e8P(yo$DU4=}jvEi60s876Q33M+2?8bcR<j-FFLLeIG$qx1Za(0=g;
zSbF`(SoP$uvHk7eVejjIi}lyOkC8*~qI=J47&`NHLeJ)e-Qhyzvp4T{d*Ne=di;pV
zCO3pa4WndD(1+THA9aZ!YU3farK-?K;@4GW&`#)$RM%l;ss^((qb-cL+dA86glU{%
zv^~GH7l#>tck)E;udm0^?k;TS&rH)mR-_3%9>?nWHwe>y9=mI*(8aGw>EPdQj>S;N
z*qw1WS&=IdGnx=GmLtHZ7u9Kz(3P`P@Z$=22s2Yb0qiAeW!hGliXR{#T)g1k3Vz>i
zfvwVnpwG()D=QOclyP)2gF%_T#V9C{OG*a?i}JjVuP0qA-S&)z(J;JT8@!cj0<Hjl
zOD=-e9E7ad2%49`XIKPx`2x6f3*e`v#q5PhJ4<MyMFd{~>g@Swuos|-uxug>Tm7Y2
z8r5Pbtin(rA0y#>jHik*8qUQse-1|MOIY}EKtuZ`__SGQEXqO)Tj`ck6HU*8SV=iT
zB`U-S`8u5)9hL}UMOLKr9weM|&n0C+Qkfv0$ix-ft75|x=jS@v3qf6me2tdyD@B1;
zmKM@tiHe2b%NE-s_;Pe6<k2JqeECK@TWz-jzGC~IgHPj&(E8*#-<<JD&#pEp1DR_G
zK3X3e%ktVk0UwP|W^fog`jkP;mAxa%%ndi8C&e{14)5qBqO`rHt$Q$X;yhMgy@QQ6
z9%J^>eaxJ{jSbgc#^#$(F>&%H>Sy=CxpW%3hGm55B!71sx)0ny|A~93+HwM^%}14a
ztmzGVQBAPbY~GK$&GLNiAR0F9S3p;{emAPtY)6&Y-1HVCrZyoyIS0ES<7;qiH9{kV
zFoEYM?7TEP_fo=I?5=|**VYFsVP|Zh&C0MHg3in$&o_0MdX%Q>p-DAEm1yGkyAdPY
z>lxS6<J0i>4a42ltE^cl>leD3+m$UjlRX3Q)we>&h9%D+u%p?Pi`hMovIYfs3whlZ
zl$2tzm^<(L1-$PUGVLrB(6LTr$-)yVH8d0ii`}lsjz7diAhU%Cw`4M@Ku-if0Uzsz
zZ2psEBPq}Hs_Uf_uL)%aFBa#U@DgLKK<}%0W(9g*A@p8&20a;^yqM5iqK(i(Q^+;f
zD(mUVz~oZj5OhIaquAW8XTIi5&%n1=Sv=I!z7w9Vz3}uL;<;rbHguL|eH&eS-oxnW
zFED-QZ!mu2*BChUWAq&PKDv+p0If&AK<(jgq50(J=swG@9sMTSx4nnfb#I_~_I2JH
zZ=hk@2bkyJO>eW>VX1V$O8D6bIy+5ImMiypd<YRXk$@jjLN6BaAsVAS5PUVs6lw@Q
znS3m(f^@Ql7|hgQ8R55_t-(}tBW7rUn+UlbJW)GY$#&7?_7ipoySuTDHn)=2G|Y%N
zK$r~C3I`d9yBU{Tt21a3*b#^g1XE2ghz#v6S!t)unGn#H!&|C>M^%C#D|@0;hZJLK
zn2>XoXkcM0W+^CwpI=LvY>4SC1f&5$TO}gV0Ax3IWjYOCyG*Z9j)dCt$@&TDlwgrT
zn6i<v%xU#8{s$PNorI>{OyDxg`b{NplrDxxw*)rT3-FaMguh}DBGz2OrU+F&f+=A_
zYr=^3xE@VGHR{}XXbTiEmgl33$F6V*I{o?Rb!B6~m4zW+4u<`VxxPHcel^BDg_!WE
zu$<6a9xOw*J`3&YEDRWo=;4juQsF?7&`XwTQLWP}9UH9`epIO(NNe3lSUre&gGewc
zrD<BSL3pYvsZ1EJs&7P;*RX>3Ko-xLlw5)@o0TR{Q;uATxU5JD{8*`EP^*A1mr*KT
z_ENC8P)O@2{C`*<<E}prO)!DdaB9x@2tFMRp@PO&!AheTAE8&lO0DHlNBGIKH5;KP
zGdCpa%IuBc@)<O4-h-8Au3^uUw{hgHkFe+Q+gN|)3C54!!16;^F?0GZcHem$+pfJv
z8#qW?T1KlH;oo0|@QUqd*>MK7TaF>Vb}!=V_aer4nwZ^z<eD8w&F)l6TFUC}NHVet
z<PwBjY~?efTRw{rK^GXEhHsc=M=SI6E@y>Y23HrYu44%HwtiH$EQPhHn}F-2t+gw|
zc4UsSJloVJYET-hf;yT)2~D^tkVK(B2@PYcqqPU=u}Rv@8pMW|!`a>gTXP32buF+r
zw8F!4;~+3gz0tWSTVDAeW~abThN4t3(#kVg+MRS*uq<R<SV+(<Cgc{C5PB*V^0Zp0
z3<elk7d(E}>11MVM>F2z86HIdWTl@p>z;fpD{~9@!~oM(b2EIz^z@bha!V=|=)IUH
z9T;YOMOIfSwByAh2j&+$2|n7OT4rQ~A=A*Z4b{kF#4WJ56MRc~j|@Q{TY<{VG(7d|
z5NO$oNC$1NYd`!Q`{3(7%=mu{ww@Dk3|~ZS_7U0+eTtDwKf>q*M%z>0MK3>Z-~ADq
zcD|2>-5;R-z}L`l@N1~s{Vu9EzJj_L{?3|L(Yodh)J?yJ>UD2po=tC7rej$x0zM}!
z4iBuf0~^8TWD)QMXnP@^pb)_qjPe)@AkN4s^p6t^(sS8D_3dwH!a!pa`WO-W36o*A
z7!%au87Zt}GpUp^#%=?@KhED7s!E|Zoj^|}iJn^8U0t;Tyas-)HW5QrID~}9!<b0e
zF^YyNj0n<10)(3_Cl9vl9C&gI5a;I^zK<Bc@08xxf<ja>%DQQOVWSmEyBoE^2pZ+0
z)}#@o?Ii2O=H>tw=LHvd%R&IMAV3Y_SH;ic@tCq2i`Qv`Ri>L6%3;%$!lqTjsa3(N
zQ_(E*5iF<imD1`A`AC^*bBwgrVsd^RI--P7P>arxmPV&TcR+>ykQ%cMcC4$nV|BF^
z({UrFgL+K+^_U6pW4{q=f>vyZI<UrP#)LzQai<om{02<$xXh$MZz&B?YeKWegj9Yh
zGK5uaSp_4n0d++-v}l8<FAos<w03&{QGbf9_}uf!Y5}|w<!OJsiCv7qYM!gC{36WH
z&cjRD`B+@2#**SP$OCU#b|{zsD~~ZMpP<dt+mJ)!%QZP^d>$Gff#)FboIz;ZVWo1G
z3iO_VPebd|MCRb9OV%jxlc=k$V_ajUE^ngowGtMc1V_&hD!Year#ZQ4QSI9gVf+2p
zvG2)ym^gI}b&Q$8;TbqPC*kg#fv0m7tzZ(Z)BCXI_#KQgzNSYvA~Cv|*0mC`@vUgx
zaSDkwyWyW8$kuLGdV`~ks?w_}B~F`*Pw_apNdcWaqm$?w6q_4n#2%c4kFUi;qjT|f
zIT>9YjIIuX&eqb4O2$`fLl+@MtE*{+v8oCBbUn(GG-;Y{Dc`#$oPa7AL$N=MB3~HA
z-U#xY0cA{`eBD&t2v<uNoGiAccIat@a=zr;l?795NEi`KZY<JOVo{m-8SLhUqReNE
zozECMzgWYfR*I~qvzS)5NY+!-XcX8L)9$o{o!RYzKO9CZ5m$gFV3Rr9RTKd!8JS%q
zg<q5J2^0l-sSHh!CB|!Laq#u4D)EA>pO+;a83dmaZS6EZ+kZe$797w7u~-{Jw!R9v
zmImb8+ED23Mv1QvWuXxm6XUSc_PmXo5o+5-<2!(0_Yrt|j>FY=8m{3>2&}q~^!9hr
zc;s8?JVlc`^s%zzS<9B!(X{C`v}}JHt$W@_%l`M#wD(=qY<Y^B*@tMHeu%c!PYAx(
z&@l5VstLb&7M+0=n%Ul9g~j{~e0IiNg2Kt-vwILE2t!Vf^1M%ad4nN8OBj(v5@A|e
zn9z_R9bL^W=wys-=80<IiRvQc2Kh6~SVjoRk!S?VBT=k~#nCU%2;)(75pJEdsrGad
zEvY2xc_C`}y`;24yg1>?N(5*%zOr(7Xds><#?%58Tv_?><`f`Qs6w2TJz*de4JKuJ
zm`Phk8#5wAVAk+|)${+?C-@o(Y1u(cpr)7{E5A$|ld)`d(n(Q881Qw*80S4U8|+4&
zn{q~D!p>2qhDWR6>yX}F4T6M3jQ=mr$egUuGFleUwg|n_T+|SPbu2A3!Oltzx++!Z
zv#T-Y)nSy8cBQ8btK2%Qc2;1vQjgXAKhwMh)4ZXlHFivu*sx0D!mJ^HS!)nej5AZT
z)mfg4X^R0v>S8RVmG!GD&_v)>Wf!BCH+mDzthvOB>TEN*bWt?xBB(0odF0=T+LK6z
z>QJ3-nA>-Q?^lN2NMuQ*tCZ#C^JxYzF3QFWi*xY8l03YaU5Eud*NX^08L=;u(sQ*I
z<k2*8_?qM)dV$qT+w&=<h^2&pEpY@1z8J~~y|O@pm6jEk#;2p{=m`R)LT74G%E+yi
zsH?2d(hWlkV=iN%xs5>S8AFPY>fd`B!~4%6GqD+_#(w@@2P$g%P+mO(ef4reW*I6H
zeX!S#pn7m4S|)a(V`e`#oP8PnJ5Cdldl^x8AUMuwI<XF+srB%!m_cZK?H@~+kPFe~
zg3D-g!;F}NjI9E?en#uw6^hZhyGIFH!kDnL@ikhTx(K>XSnI{e+F`D4fhj}SCF`J1
z5bAut@_bY6kD|m&$a#V&a{Ez0p(tQ|DrCKq@sHU|A~J|L&(8BmOP1+lMFe7@JEROl
zS;BL?NN>jitraiH3<{MI3)DtsoZU+TI!0Il-b<_(^NUoBv%Fq}od9n!EiZ?Se+lc4
z-r<0aiOtJ~JVc9=grh`Sd90%pHu@BgDj9D|;K|3*ILh}FD6(z{^y0}3!qGG;oqUZN
z>A@{mIy1hSWBg;p75K@@$#dZI5X=GEUKCm7DdZUIkW1UkcXptF;8Xc!QPDAF=!&y$
z6THnk;cMHEAb}U?KZn4wtF-dRsN3@qnvZ@Pb%b8+zOSQ>psSgAfXviA)UJ7qwry`K
z`{1?_gpJz>w^@FD`ZgL@-9_8XL$t4Xg61^@AptngK$w`d2AIo@FtcKr7<J5yyH*y5
z#Q~qiiI7#`<3@-O3=kx;7^cr3P#%8zlW_!CQUr>CuSQN10alkzqnXg_Nv6@yDBMep
z?j=wL2|X!Yp%B_>a%}`$8!to?OCA4SN}@FXho8{#S*`Gy%nJ1Uvb2&6BqroMc_r}Y
z79&zriZ~-|tipt#(FCu>0-wvF7-^cXp-N_F@PbwIwbZ0CsAc6AuvHU$vJ#1ON+@<m
z>#2$}^7(`CRodWSi|91yQAx--$~6d>%HikHt1g7MI1k>Ud;+)_J~iXDst}QqJS0kT
zP@~C1wJHnMMT<~Vun_e{i_xmerjZq*Ut54-9goFJuv}Y2h-tB=LXQolCTw7fxwgcK
zwZ%ScDhgq9VHlfBqS#m(#Vl=Ywa$fAC3<Wk*f+?E&;;>tVG%~y!nbD?qa{a$HUg_n
zMUyObqA|~o*5UvfS*l9BjQ@UukI-|~A|0XarDZ>jhCklS)r4R;mOzDx(UlcvNq#YH
zuRwv`i#bJ%FC{E0yqM3pR9sGrFd;|JifU%%t@P9M0w}JWD{2Q#&lRQhg`svul-@=S
z0jTlCP)Y#mp7rdOGv?}OdwSZQu~BAi5PJ1pJodugF@nU{TBMe5Ky+vgOmzb&3^YI;
zX@Q}-7p7Xq$(9+$#|hZ#Ct$5&oQ-rtAL@XI@M{{|hJ82Q#^IMg!pMP(jH8UDqpJ~_
zTo3o?gz}8gKT3lenJYfAxgj10CTVlhp|Bj@r7H+J!jAFP)iJ18ot?0&Y!ujaz+Br3
zGb5*|x(P-$BnHB+B2J49r=b<QCFs<ikP`Weoqh#!GQc~((nZ+GxvOMU<@q*NDzhdQ
z>#bN+VZnlO3+B_P7nEDEkY#?E1uqhEFQ^Oz9-}1hgBN)XUMv)=<MmKt>|AHXLSDPY
zybqMkph{_WyuZqMKihnM_$d=nHuMSJ^J%6TG5zPy>*j1vo@<No$>)S3>yCW>M<_}s
z22(*YRfPx*)MWLcxRf!Q$znl1U!T&M!DBunF2N_mR^~J2E-0q$l`!I#K8v{eYUG$3
zkX_k~Jc3VIbRf*@mzaXNY8`A1+Zp%w(eh3pJbDEgM%re^*3P5fLhG?_A-(5Agx5Vq
zeBuu8ksC-XzokI0bLV^L=J#5+zJdC6FQa<uChAxkr*5Hn`VQJ=Uq<V?$7opl81r<c
zWzd)EXh#(=>hv%f%rG%ZT37)j_IizW1dId;;TPZq^U?S`G(ImQub1%ihJy&iqKHOg
zbA=~2PIF7}Se;0qnWa6MM61{w;n)+6qK9y63kJ~~2%<3%Km!3-%aWqq1)UDK2|Js?
z02?jJ!Qw6_@U%2Nbs52@fu9%K%ZeCg^okiRi15M&_<O;Ck1bJ*Age}g17jR7u&i6B
zEVabSEo1FuQ|ubHSaKmH(n?I1`0lY(!X<mn@<wr%mlO6z{%=CoPzs+>19xcw>;;Pm
zy(|RPIS7>Ts9l1nb`cU~3(#oDMwhh!J(dD=XtU6x%g2yjjb(;1Oc*OL!FasZY{C@J
z_beNjb%e$Sof}(J9&9P{U}J%YunS<1DvCp8NgOd|u-_QNcE-~KG=QB(BM#V%*kRFN
zvq_Dqk|K;Vf({URBSqyHAOJh^)aWXzL`$w6jahEgWqD|QZlv@5NT_*DjJ#>RjI>cg
zPlm8Oi#D=ky$nV02E!=lbuZNDkk67sh)Tp2(_2Whm|w_`OBjPo%~)7!!Qyh6)9Iw~
z$&UD})Mi!$f=^`+DMdpY(6|UY4-3Jk^%HuuK5dBBC%w8Uu|5KWmZYz0Wod)8u@BzP
z<)~i13C^ZbM#DC!1I>&<{C=Fksp^M?mS=5T1xv#;><v>eC5KTFATUFWn2|2Hss_<D
zy&u~yJ;l_CTZk=R53fLtfOGVZ5p<IT;W#`4EVMaK-#FYfI)R<5YZ)AbuCrqp&bC3q
zwU2SL2R4CSJ&o=e=nRanVy!Z)L??5QWf2B{jGzn8J=-McWK5$B?<t@(NXcg%$~D=M
zN4v{qJ<HLXltJK&DIE*T=E@?i%!Rfnb7B|BT6Y?wGF?pO$1YIGJXxIrJY`Vvv!e9Q
z5_r-BFWY{~rrbt{3l28o!FU33Y2;~j$|AEOB{H}^Bfy*MsQA-}`y74(y}7Ub$1}aT
zF?+R0rfXrbF_xEEkyWV2g1ic4-tAYVH}_A*>dh}C^om?_5mz0;5^WsW6;-@v>X2h;
zqVW=X-ae><!%&4+pe(f-#)fUMbsl6xbRLQ6yJ*_}7TWiHh~_=-A+z~a1gG!8Gj<iB
z!Apn^UPf~KE*dw!j?P^lpnV79@A_9zyNZ@Kc@y<4jRL;uTWF&BwQL|jHxPPiwHjKg
zuwEnZ)1rd0twL{rk>+GEni=f~K7$QjUMv^QPkMUoE;l?<oGy5LUIaox#@-MO&<|fE
zj4%Ng<S|C0t74J3TFoe1mx!W;AGZcW1YQUYw8a{NGsBYbN`!X8Mc`RzStiC*J!7g~
zqlQtfQg#_L7Zt-+Tmmm&YoN3o5t>houOUJ3CQ`J`8eXVp)4`PO5vAp-mmXOel*=L)
zO*&s_pJ9U9Pt$a{Trd-`Cc?<d;*=g<lb*oUz(ON)8P#y=i{RAe!l7D>NJTyprXqxl
zIY`<I(BM^}!I_UPUl9iUC0G_zW5`vAcI|u&5rD&ne2nS~u)LxGTYWS&TRAq`8D|YT
z%$khY$`*eQW6nO7gRB@Q96p@&262`R!U?|zM*?=7&-igBYQu?u9=j~L*j%1X3oOAj
zqvuq98D`WbEGLw@78j#+Nd+1fn9-W;Lvt2wZ;20eIU!^Ul1OQJvpedM4ik7O8lC`8
z&e5}q5Km`d_xPZxFhZp>qL7s^uT+nMax)4m2t&OM#YQL8Y~>5gjKfyOHA1d{$9%#$
z*Wl(&&B(@5XrTdG1%P3~FRFk~=TFkcGSG*sP{F8M!5C=DG_r+nVQE8UO&7eaBZ&19
zP%X>(Go2`L)9^e^C~@-W;?KsG!c{*GTk~qzT4v#HUITMt7<z$Tpc4jP3o2NgHAARf
zwi%u44<WsBGh9o@8JU+Wkn;{adp0NI=%g^ZIy$98g7KB0v$Zh7Hun>DG`of_R7!-c
zq1nlt;#3{2t`_CeIU~<0Lvdw{qV!xVR>w#xLrP>kV<90YV;l2~Hl^gUE-7=HE6jwQ
zO!w0(i#;qVt5BZLNr!|i^R$SNTR@vzM9@iDP^2Y5Gz5uit`w9g<L#s@BJh?JtC7b>
zN<)yEDJ4#S2*FrV*;Gna+L1<0uJN3)&6S#GUw{5R5rZVHPwI_yc1Z$~F{$#5Tfiqn
zR_3OqNj<7#olio?1~03id~Rk2WA2OjCIx!(d~be%4GRhx%ZnIsiy3iC7;!aGf-lW;
zP)pNg#I?04^Kc8j1IQ07LrHQ5`j%a&>^p`SqiGcZ)wrGJw&hhMX6_&`b_t%LGjR8v
zgtv<(*nb{r#@nW?Z=sc@SHJ0XR8K!Z)ymtbpSZ<B@DX~ot7v_z@1kMdLo}@?^oms_
z$_`yxDFmQwOITK}g;qZYKQk*pCE;SFf}1QhR5%<k(yE*`JG?9cKA)R$)J6M~Uf*B<
z?nnq8f-p#Xi?Z@YX>j3?^!^5rp^a5BzDjRwn&6BRbYWS!xRQ296EYh3e`!TpMmV*s
zdZH>qnWh-!ni3dG)wC}i0v0pUGK3`*MoOYJVV7=T#ch;l6og)i@tQ50bh$?vbEDMt
zI4`Vhx)X`UX{dfUd>&XBg=OJSWdTsW@5*u=oU}40<E+hC%J-~-+fEQ#is3R9FzRL_
zY$;?Zg1sV#QQUx~^*(f@%xI66qCHT6fp`gq<BYMvYz&98G2mT*mC+omPUhoaw*`Be
zbvWEbV{2ApcY7(0EY;(}pcS_UD{;5af_wcIJmAN-`^>o1s>8YZQk=+?;C!P2$CDaD
zuo(MoMc7fU##-L|8;UBhzSxL~{89|%Xwb1(kLE>{=+2LzGbf6UtT0-#W2nvLO<Ph;
zGp<3*O~VZ*cz)#BjTkRmcR63I071rFv%`(zGCgu>dRa8cT*6PFS7fk5Wp<;KaaI7P
z_0!k_X;cI=sPNNv+;OOEArzW?$|99ga`lXJgrPvOh;Wqnt6}67@R<^Iu%sJd&9uN)
z#c0~lk8t~P#Ji>#S34Mu8&N_?s~wFfbG5)2l%4yBU`vlOUQR+^GeH=wgdsYBa&H@S
z&PM3?^QJ%>jNuL#6STe-n%&YV*xHw|jKbBu9JUVr4MtWemF)wtO3!QS04iH(k&V4H
z$fdB<cf(TK33GKD!A!ejgw-c%2;vMQdh!|UA~cntvJivJOO|PRMU`$8S!i}lNHV-9
zm**{qku_UK&}rwaP9P`qoE6xy$kadSkXS&AlO>*Bq_HoeRWB@{(Xq&L)%p2Fcqy-d
z9~TmI#aKev<&>1MVX8nWW3q|Y%*EFePNb2P7|Jvwy{*c;Sw>SCh9Zw~l&O?FN+)E5
z7AU*%xx8MAj1!fz6{*_=H$_I~Xv@;@%J!lxN=F8vrwmfg$mjL^S(<#g9ofA9UX*8h
zd5qu%W**IWDZla=_#6bbi{SIq_`-azNo47&c~3Mb3yKyvx>4XAL}7F#O6xYk*mDrJ
zk<$pTxsBM`dx)*Rjo|oY_y$kGv-Ajj-G|`sIs{+mVT1?IBD4D592mE~i8P~dV*ECe
z%Wj}XV(*HZs3!cXSKUF?%spi(`r6emV_rdVk+L7Fx|oGFr6yQZWg5O2f{)fyA#+j*
zKD~i(Fk4Y(t3)}A&0>XzvC?gu1EAaHB=qKHcUXDiT=LMA(KW!&y*!~V55MnqAt=w>
z>^8<n3;eV?507?6HzR?gC3rOa-)fc;jclFB*GAasD#~Cn8~A^$hy=Wh&2gnuqF!1o
zTAcL6%FbZ2IH$6)4li&fkyI{jbweE@Y`q+WvDHmG^#zrSE&I%x__}11?Xt2m7_=JL
z^<{*c77mjJ&Ppw8Ruzp_4ZAf5UV9Gw_AF$=DkMW{M5Q;jD~xqB4Ol+lK}Ssqs=P0t
zC9)V@i7d<#T4Rkgy{ZM+I#7yzV_KY=G2!s28duhtaA|!R&aElNx#<#Io>JlV$|Brd
zk%xQZd3Z2UgzJNaxZGKYo9$&doyf!e9tX~)wKyFv#a`OuA)4@3f^KWE8EXnmSe9Fk
zv64y*<~z}s=S6p(AMG@}j_d?lvIw=TDui>=@TiiE>~Ta0sZ^3Cz{o$hY|z}fk}-<@
zD9we}J*SuzF1LvBrG%AGqLfxyT{l-}A<S5*%`Tc;0D8t&OQZ_cXf;ehLeLwB+8!hT
z0|bDNmg8oWBLoS*Ji<>gLwP3T2oY>CXgrL)fht;DGb)oEuq8VAcdKYhDgJ&28iK6M
z$w=wzgd;x0___j)rq$3_uVlO&LYc1vS_0DKYet2$mf)*_+Exv%uSu~*ORYSU8-ulZ
z7$(MND^1SI*lKAe+!$Sz(%1ttL1(V(B2YVEAczg=W=7Zs{tcR4gs_vIRGD(;i_q)@
zc0Lr^<vE>`@5iA`)yrpeEuhsE(CYFkc%Ep<*@Rsd8xEP_qO5YF(f&tK&2>b~=lPYg
zP*y!*S&*;BOS#3E&vU=9pqQl)3-hH{y#RSC6^cuB$~FKd);l*-Q#g@QU?)q7rFh@V
zGrKC<yF81NM?13ePK`j2;7TSltP4JrTI|ZWOfkg;`Gx#n^7o4fQ~rHjIYH_`FdF0E
zl_c;iDGBt_vVcVcqS1^3zEZsd3-fgN%91hy+ral><<Ua(vk`(07ANKx`Dl9~zW+ED
zm!*+aQLF5ik?-h1acCJzt7c(n-v#TyG1!-#g=@t{IG3G=ec&W2yAQ+OaR8o<{RqfV
zme#%S_Z&rd<N`9Y574mvP1J0Cop$&Dk+GYIjb2BF;L9w#PU|E1R^3Hr)qNx<?jk*L
zAM*+cz5Id#6!OiM@XeN}WLYKI#ft#aYEh;whnA67S6+c~#y}k-S%t|2v)PPFGYycX
zQl7zCE0uj*?L1M^YS?Lm^08AXY(ZG;gq@u~Tgl%w5~xxPVw41)7^ZBKDCSvIQjAg^
z|2M&_H|pUaD8r1)GLA?(O6qw58mQTI8MzQLc$j*w^u)@=U~9&&HB#RrG(m@Sio{~b
zG&Cx@AEDt?S2e)E8&$`PZf2a+GHST2Mn+lbxX6dUl9AS`rp0Oq?GnU-1&rr8XsInk
zTT2;Q+bXbPB8ru(0$8=$im9nGj1Co`t9c=6V)GDj%|l~qAvUjaV?|FsHjEYH<Q6^7
zZZ5;cO=Y;Yr3CkP=i!z8#kjM!2=DH;<J}zwyuU+-*Ebd7jddD4Sz*9Kn&IPS7anA6
zER{IzD8WHS<%3!kjxz%7%hO{cZ?5q?8&(=qn5Gep6$UYw<v{ZiBO0<DXv*Qa$Ztm`
zuLWL$&ytsbNgad7N(&A$;;>?8*lG*-(rh%+b&NS|KpbrCH3Z+HoC3_xD&S4c%9k(3
z!HP)3l$a$`)@06Z4kKD#g$YHbN~kMcDCIdVrTrK}{9lAcMKFbO7A@^i<%%+@5*j=f
z+JY=JLu-&RFaV7`2CXZJVoQi63{_=}@JgV>nj-jWm8F&{<AboLm(vi(X<nnyhPnw&
zMqfLBhCg3sl@doe!KZcBp@P<DO*5+24-;tp(AV|A*g(L^5R-bEob<fb(&l8SNi{*o
zxNS($>XP+J$Ha5kX#+GnFJF^Py_31evd$nuCo4_o@qNpx)45DUxs0y4<uZ_a?kMLb
zn>LmuE7<T{$@n=LFubUQpedAdSxkE?Q6RS<U&GHCV_9V7lNa)e36EkL9__BMP=TGA
zbyCMVW2JZm!y(A{x&+gVJV#d^=}-Wa<il$z3t)())yvk6gjFoc22WmlsUC$i_8f~B
z**xZ&J;*XR7|s1C_sigC>47bQOij#XG1^_q$}5>_)+r{X#s+?z$|xnu7;WSAF47p;
zKoWYnVyhB!WlCBlUMR5Rr6M<rmvHxEk(#!rO=3Ys4HjG5P~abgu4)?QmaVXM?S-jp
zKlFr~u6ZxYS@g~OU}mXgak99Z_R`+>!_jpdv57m%!ULK0uOPbW0isj)5udn+sB~~F
zyM~l(3cmars>W|9k+*u~U7FtmnqCo4CR>oa`~nmZd_~2wjvS$<rg3QqzS1(33j7Em
zE#ai2VU+Wv8wfxtMp~Nb83Zk~HreV>ip^$Ij#ha_$FIxx<!9x0^m;ui40`47X$ZR#
zHCq<KPNiYI=KoRtKH=&hJfra#YU&8MhWa_pmWQrX?fR-Z)TZZFf{-l{Wq`3fmq^j@
z;*qGbN=k^)S~fK4?CV4C&<I-Ex)Ah6`1L5tX{tIJs>v1LYZ2R&)ljqunoAI>EQZHX
zh+t4fD3qa%k+iKb2dz!CxL!5Jr|noVWy0i)9&0yfv3`RJ>t>6wZcQoHOc}6ZScj_k
zJR0ddY@D%T-AWCPZn5CTK`)N2F2R*8WjHbQRa}`}jQi`gxUsSjw<fc2bLC68zG5D(
zkGzBf$$2;wT7<*SB{=6P#Hq?GTp;u=I?cH43*ln96UR&J*v|N{s>p*0Q#B?m4Oo#M
z!gxUtJp~>#7lasV*)Zg^A)M6$`(lD4FAY6y&!SBrK#-(jExdUfl_(??CR0BLzDP2r
zjMy(I(GYyJ43-7Cg-XPlPb-l5n+u=quqM-+q}MP@wwo-IttE{pFj)yQS){<tC>)22
zK=BbY?%H;^YT98<HY%~W)SaN!F(z8WD0e4e@T5^}WW{7DF$GX;3Zlr&?^iNPdzxSh
z)6QZ;u%^aP5g8zeTA*@P(JEt%-C+VRgfdGMT3VvkMrgSiizAGxnLb9<UKnb-p{L0i
zYuaEY=uFkkFlM9&R$xcaNra78E6*HtVg3ynU@fyuJR$xcz8;CO1bd-ak}SLR9CmuM
zvP4&o&Zx{_5v!9092UzyBvJ^wB`k|e#F*7QcXIy3;ELFo@F)$6^pLZ0d4Y!al0;gb
zTj`ONIkJUJBNgVkI^p;GkR<2?YDtz<CP5&jSwF<sSqEguNgWN35+EHGL4vB%=|Pz^
zOf=1Wtr=xJ_a(s^l+dsXXxa-+9^~0WsGzNDJQ1j!el{9@6biigx{Z_siOjNGS%P(}
znsrcye@S$X#Z$1@yvUah7Xe>hh4Oswt89?JO50Pmf)wx-dl}<{SXdTEmbD3ao~2OB
za`DxxVQkz8L*o``>o!AEy$R~fCX{A2^Lty_AZ<fs?RGfpcfixI6RyVHjJhWXI7U}S
zT7jQ$<SGI*yEws?p1h0H3Yy-E>qw7XMP}?8j|AU}JIIXRBlKu_&r3exCr4!)Lq=VJ
zk$^~DN&vD5ph~r63Yc^{0+6M=yc`uYHUZgl07{XMpF{3B=;UYRceI44{4M$00z2io
zB0twNYT9Uo(m5gzPX%`J@HL)HAdyO<ww|D4k>N2(p6D9dUM*WO+4(<|h`}EUA{-7Q
z8H*ECEm*pADTbCU!|;j~=p7tJXIC#{xQCI|16?Hz(@hvNej5ouv$h10$}*%JI;6ZB
z)YX*nBIl#6(}00tGcWc$boXXq#k2u^%kr^$Z6Vgq=3>LF3L9tD*i2*FwYe0V)~GSs
zw*)l=UPt3XtXihW_8BksOb2mha~1ZksKoBk3hZBD!iiNj+}jbs#VH-G%@pFw#6p}N
zoQG4rFW`KC0j>>c@nE?gZ%;e%q_+Z(nk~2)bmP3qjnm~W9Mw3ntDq7a3BGj|ajYs!
zU`3V>Q$-m@+#0myH=sVh9?6^r_!qarvZw*YOQOhK;(>+{xKbBI#9N1SoF>3VL?W&j
zf|!DAUL*rp%Z;qiMYND?!Y(JD(92h#H=h7pKnszwkXG>gp}JJ+#VW00HZtiwn^CyT
z6^4^wbyu}Ax;C&h!o~P#=f`&bZ%4cdu6Q$oRb6mJ8lf`yQC#6ck=}zsy$=O?!p2Of
z(cUTuIq9^}hkF@o+fhPrt7v(pjyPj&jMp`aQbJN=B&;m5&0!<TeQhwtx?oBZc<FYO
zOKj!q`J>S#=7yV;%eIIzXP9PJDrV<n%y$R*KV(?WT<1i-bWAY9K8Kwwv7b%IXAyM&
z0lPV~6WGbDhy^?^(!;ty?3hJN`9<2>OEMT-%vpinGm|Ftlt{~;%TZGZXxkN%q}+sE
zD8OhMSKt;W*rHWz;CWO!?4;3Wy^uu$WTA|3B#uBh1~cQgmT=avh-tpSzni7EqC8rM
zN?Nb8sgKZaMTtKJ1FuJ*wHJ2Uu7)yDYWJdoW>`+hm+`$iJbps98i}O*pSdDQOIGg*
z#nMbqUINmnth4u0{@mJne+s_&GIXUZf^0(ug|2oK`}?4ck3pB3Kv{AMs@NooB9kZ#
zPog9?gVOkFUZ+`9R;^=Wxe<Z-O^7yaMznQ1szy$tZt^N>Caxhca2l2EjKH185E#6G
zI1Mi|ep4|#0bhmyOfmK*m)}BS`5ny5%gbZ5EkscfV=kd5MfqI7M;HnCo)-mx8p4m!
zSDu4O5rF+?_$ekQ&jbIY$j`{n%Sy!3WBlCg#P*D4lM;vh!GJPKT*eNG)jdBpwKOZH
zC-9S^Y^PYq&)MQg$B8rGM=%~$;%#edE4n*6F+4bg;o)Hgdi|rL=<XXpH=$=OH^WBr
zG<x(f1kEr7yvnX<WyLvgl@}pl)1f9*hPv83^o`K)dP>pWqr%W=5oR|SF)>?#nT=)G
zur(WN*S&~U6N|BI@I|Z`nul%c3vg&pIks<9WA#J{W+#kT*>A-1P76lrO&F~<Vkn`)
zuHhh#tV-kb+6->(tihWn>hSS}6h6O}#3v^`_~xlF-r4QJqxEJyT33OaqlLK9k%ybr
zC3qNd;vy^DAy%qGWhU$`FklC7tex5jHmKs5%8FrCK`Vyx`_Nv{k4#}3{Dhuk5u@9J
z1{BN>V(|-BsI%NK76;%lv*Jk1%``lFz9+G0ZW9>k&6OKnqcdP}9xEuJC(nHrFcpaP
zEu@hwVr5-KD^X@}6wR&VAq!H<P>MXQiN@xb>-b=#RaY|38VNI#mmddEW)H&XO`(#;
z;f^*Tl<9&q)PVd_CyLAYb1a2~pMbB($Y^OzG6K^?9Cb92dPdA@6w?4zwm7XMj&c^Q
zO`s=*ao1eM|3R=Z78}AHG{Y7`tpVDYKu&sKWw?oeE~5;ikwMfli$u*dpmH-}i|JH)
z7-3xmogKN)dSG*u9@sh9En$3JEYVegyiD|eJ`GHaPV*nIBj}z3?nRzUS=LFStMW06
zG#*O?bZj*82+I;(g_1@bKA8a(RaWa(#-3)9iq*;BdYKdurVKpa5<nWRSd3;zI~DWN
z8X3JcjMt2_MOL0u+3d+1P<o!r<MnX23?R@s0&7hNw9#64o0p<$U<CowssLV|P3ma2
zCQkrrnyiNJ*BgvbMkpFIJ!M8#QsTAjgIkM0G>r<22ie7X-V?OGd?Ss|tSmq<zr>A2
zr9qy*6dRCw6uJ06+#M+Lbfb{Qm*?(7u6qFa-Vqe~$Dj$X<m;TI(XWE1dIr&kwWw~}
zgyyB&&^ovWebeXAF?Al*Bgf%w+YM9gRv7Dcpt9uvT!dd};2fgEmyi_8TX7TV@jFP3
z-$io#zCbS*QVI$LczH@GEG$5=K#xUM5mw1FK6#!%SZO5kvixW0JqO?OBA}C_Vs!+a
z{QmR5C4WaEtjS`A)9q58O$f{+x{BEe=%grP>7*+?EmLCV%<nl2rDs^o&L4{)&X%vS
zvy~CHA7jf#v4Wr*9U8{KzyJ+z0E7HmKTWTvvj--1IqWpi$}r=0jPMKlVf6=4Qk;j<
zyoGRS3z6}w(AJ)ZrNjB?9#CUsxdF3lE!eWvirI|@Y};qXfm7w!drXC`yH%K)&ce#^
zh1ju8jdN!$xO~Zl^QS{Nx;Kvf8=J6hD1#0C8PwZ~Fj?!zg|+>7`QRuXAML~S9o2Yn
zAdLrmoOrlHi@WQxaC_APJYHXnJF5zCcUpt1Vtq|{xRofxE5UMH(-+{FJ`cyuWjIoz
z!+|^#4ly?DE~&zX+;&Xm3}dWl2CYR?@DY4^M%(fQjFSs!GYccoF7UvR6NDkp2dgHC
z08J^yn_d<gReI;D=Z3Jz!|O<b(Z%j!EYe~TqwR}1xp*lzPl>yW1bVcR#f07xLQjrj
zEYk5Iwo+7XrERgI8Tk7ZP88@Ij8=?=Iy<t|2IOdrDAd^qI2TF{0aSXbVR2?qpmCvC
zOX%s`{JxvU$4E*0D>TeS-6BhjC5}ScUy(V?LL0NNVp@Yx%d<dJ6lIlF3iNa|J#(-N
z<_Igczn=Ccv6Y|;rIjL6yp&lZGG)#yW+y{W_<w9ZC7LT{$I72WJIa363!AN*+iOC4
zT^Ics=;j6r%QL%|2)DT+HutR1?xYChUZT}0<(VxD@ML^snT)ZsRw|u<(t|3`>|{!0
zwe+y_<22!yWL=4+Qiv0F$!gi5teRGyfWsZ;?|S(+oKRIb*!a-43Uv5taXIGmb<0ZE
z?zT>ZmJZSGS{V1)7^Iq^X4EZoMPZ`#`kK1fK%^A7mk@j^zW-ta8%Y*<25z*ulqssx
zxgo<{WaX6v8^>f?dW`Gf@yGc89mwVXUQ%RaBUnk}b@6-!u%tYK9AgUk)>`D4tC3}{
z!4gvq7MtoBcUzES>q3F6k6<1`sec3&p;6dqg2C#QsBT$}o`J1cF}WYp>rP<pwyPMP
zK84!eUGUbegOxUDuGtJr{Z3e$_QKwJi10g(VE;M9Mz1M@m{Tk7BR%y9^Hi#0LN7-t
z`S}9BIp`G?75))=e-1vH-hYDLe=dJ&bn?4mk@B}nX^0Ya<vE{NT__S(U?&h0s7VpX
z$>-0X<<&RN%^Z-g$@c`10!tYiD667$6Ldo>$1u2pVB^QbLjxG2;R*Pbu?&rlV06WD
zemsWu#uivrI{2m6IckG7;eeUYbi~=hS1^(-dI7q^#R$1_v3$G&9eoS1e8Pl{n?hK(
z#)@6rT{v_&fYs}Y2|G15ZZE`|b@`Z?F2ttw1{~b$!l@GuoIB&d-d&~Gz0HK}vp&p@
z`mwq{h{0+rwhq=|*Jur6t{-PMMR9c2i{tC;xVFoN*N)rq#q9uoaMO)%oYdnR$1He%
zw-s-$G2>oaF<!0H;LS)ep1AXI*^!Tnb}i1R%5f~G5(o3**h#BfTeJeBxf58LGmiS)
zF}QOXU$WX^TG$BFf(%TIx9031?1ga{vVG9!d*G-DBkZ9)(0Woy8OJrZCxXn>5by;e
zG3YIo$Wzf;m<(RZ&c*y}T3;58FPp~4N-M@E(37J;Pxf0VCiIFcY{+Kh$u2R`;B0fH
ztP;5zLQh?Rf>H|#)RnC84wRcBFjx}=tefC-qOgob>m)E~9TjfmNilj+XbD2a7+YeI
z;W2&`7(B?=^JjQ0G%|`BX_b~VG}daAyP9DMb-@^HMVY%AYFAPjW>e;iqtq8uK&N)|
z=Us%IBS6^6Gduo0Sx`YtNl&wr9$11-hMY(*>=K%jEZroaTOje3kW*GOd1iG=M~|5C
zA7NL(h9Vy?X6NCBtXv-R@dD4=OSwE}Y%B^0yHc8?#qNYJ7*g(iW#Ye#nd4D9@a77^
zS0z2}b3;;OmE%yHHtyqjBc!#oENNi!*|?}HLs(c^iLa_m_-d&c3uxT3he>$pAR@iP
zD2-Gj*Ad2Iix2rc=2(5mwR(6y*r@RR3+Od6HO|QE&e&dRaX?2~R5Rx4EH-#TVg7%@
zSlMlluO%h*wuVWqN&%ln$7`>Z>rF#u1D34|Vv#n41!ZB(XOZIqZ4?W2aV)CHAj{N%
zyvjBd+Pi6_y}TZMDEIYKP6iRG8bf2pYAhSyjm=w6;lQCA*nj9QCfA%oWA_#WYGz=s
zS_5nKMp){$!_u%DmCXm>Xg>;f_bK@L&ci==8Nre3h_1MYc~+~LFj5mt@?0<PPm09c
z|1dt4fUmd&8kGj6YOy^Y39&yd&p{_}d=5<cnt)JXC%wA?wqUVX?2iz8UgYyXE%NM6
z3{Q%D-^U0muoGCecXnWCWC;BugXkTdEB!1({CtFFH_Fo2-Hp+aVT>*x!_>?)p*4%1
z<~F!_(+15pnxYrZc#z;r!p@6T;r7s4mY`IV4Zk-JD+s%Z869S4J(!trV%Nqn_H7Md
z?+!aQtt-RSSRt18WuvQM9(o$*W3pe5)gx9+4r(#8q7WC)+i>B6fyIg)Tg$L@oeM8t
zpTg;V-MDnP3wO^p<E^{hczmrLx6fAN%3%+lT=e3@o0a(PeKWp$U5oEs*5lIydVI8A
zkJoyN@n&NYP46W<w7-bE&SKm!Rp3gY8Rzq2*qhUd+5DAQR=ySOn$>9F=e0S_NaS`R
zn%jl&;wHEkrD4t`^opWztJ0{<3&UCzgwK>h%v*zah&OMXmdBVh2R?ZiJ~yPr!^)^L
zn~|$#lw~9K5-YHPPZlSbpCi4yMTDNL)22oifhV&$3K+k0O7#4m?3iDHtYR9HwEjhU
zWYJIsjCm>p@=DAsR;aYnV;n$kk&Rzx(Xi66;uKV{;*_({APQKq)s7I#Juybw2rK5?
z(3ldL4rdLZz!XHDAw>I2qKpyP;BRGNtaaBg7AN_;jJ~uQ4Iw8BBnaqA?9v-c*h!=l
zv-{KWbqeU_p3}){GqMV?vQFJIqk9fH>4jAaAt$}93wTtD*quBJWkVrl0pYihU&|sO
z3pM4?8LhB7-0;YF)_7b2n~X87VI7d^lNlOaN(v*S%$i9kuw&ztu7=AUK&cp9MI|a+
zVVFW01?a`@I2`pojI8y@bw-hGlR3Rn<+-K3rUmYX4qBBTFY>%DG_YZDhG|_cEGFRR
z23Nb;K=8Z}a`N6~(Fp-x4&khpSz56aOtSE3B&y8zP@u=xDsfk)sU>Ni4tEfGLbRAx
zpHu2!BjLhJDmUg8u@t&!pKgBc#X?zur96o&eGPI=O|-yvHauNWJC{Po*lP~-!JisI
zUHddfSMI{rUFUH0<X!C9cO4VcN6^%@5uw^?I5MkYPOpQpYBNmryBKp1!QOrh&aP8%
z_nd{h_Z&R^7vUeihIwI{lhtaWooNWEB7!a-xw+ZTtZxo{Vrnv$PoSqRVZ<f)1a5P2
zSIqCvVJAN?p9}1yQ^VnKK<4X4A`#``r%doqKbr#cpJ69fCnLti0HwwA`h7}|RU{UD
z2D{;BWe9_W+`uxzkH8yb86F<QiqTO_(CVhf$FXK+29v9%FtaCu`jh|w|MW>jK~!cn
zwrt#l!L}~M*@7o5ZlwHCnr0GCwoH}0flRSD6}%AT)&hhXBUi0;VQRGrn>K_nGw#Cb
zWe)VU=3x0q7N#c(F*#g{bt`PxJn6)tElFHG+Kmeb+Hh)54Q`yR#ii3$y!%!Z4<8tD
z>$(B2J*vgqPy2A|LKE&^X~TnS4S4HO7hb*FigzA$;@UAc-nbsdy`#nW<fa}!ePqR#
zmrC*3-Xgp^z675PtMO5DF+ND<;x%VJ9<c$qsqx?{4ezq52fK=vW4vSv1NQZ3F^{87
z)rq#k7S!c;peDZ!>1<kZRv3<wIJ_0L2x{x#Q}JsR1e>=Fb@2|QV~t40B<9X76)rci
zTq7BsFMFTbyfgu$f%YX3D^e=2!0$!ci&Sg@-y%jjvAwL~GJ>z1$8v&@M}lr~Apw{#
z<H7`b1czdWWymTd5DSbbDs@3!?kDhQShT-9jaWt{@@b2?dKQ|OEMcXiN$92cSdm!_
zjHiZB3R+*16(-6E6+)gV2AxlOd)o-SS`=Gj1S5^k9^%Jdg3^Z)g07fWBZG;Hcwfjf
ztUSh8#q0<>Sx!rq){;su<xiJe|0C>_=X7&Mr}S)#)$v@2)ycV$vOvxaVYh_mERS_Z
z#@NX`XIa$4ABrfe70Z-JWf~akfW%k<p0cnluY>GLma48tC|Lt{Bn6u<3^gHKDCf9{
z=TF(r(MV%*pv0esrM3$ZnMd3?in2r_mQ)5YpWpkcnzqV@Lk5P+LIz7%4{~L>d7cZ2
zzcT+<M{Cr1<s3PXqvQK8Ek~XJV?dn0&WKzYcEcE7Ow+3jgkcMW;0%W0XT4FTx$!=c
z&c<YN4$!eghB26pimVD##F(9L#8(Qem{(vY^c<L%XH^y?m|JfziY2-fvUo4%nOk@b
zJJ}F*vvKO-HD0QW%}g{gy)4^~HCs+$&!L-4SSK*FY&UA#))IO&z4U4rlC#ifHpASo
z2l~1_urwcnqw^$zcNX6M3-AqIqU{lSa#P2Xab<`8a=lIgUvaT?Y{=L>>CK%hh0?2A
zP)PWR@y)?a#`sB*adpo>mR?>N)*?er<awRIN^DJlCjU!`umoO`t=HT}*!)~tG=5D!
zPb3oX`~8X)27^ImBc9IAPUZQWbWDtlj3~$9;UUHB6z~np_Xs@3*_rW`n4MUKb<?Y{
zVcmMH+pr!xckg1{+>Mp}!^l*6(d3Mw&Xz>no<O|1307VxP0$Zr#0Fbjg{EF3R;+Pi
z{T4sguM1+&wi>Kn;ls)iBi3&$$EK|Ybhc%qF1-jFro-60#*ZU=!?<~&8jo)_;P&|_
zj_uIm>M=duec6GxA6DR#H=OwBsTbdPwHB{jPvf-*ZTRfNalCq~8jsEg@aUuyPmWdM
z?COPh<4_sCcg2dIT+rityL0jB$`|qT$#Q%>lZ%i1YP`kP_K~U@57omsw|E#QRjaX9
zy&QdIU09JGz*1i;ma4lks2V|2Q8zOA)u>ekkyf+e(eYffB@LC<B0}&rur*CNS`e}^
zs>EASSKWr1>PFfHuZKL>V?~tlUos>p&g<t61yN=)GkWSEYws<l-AKiLVM&e>-Cm;g
zEfDxI>drmeW9%y+;8^D8mGS#!JeCp?&w#=2FUXa@BZHx>$StuElor~ck%qvC#L8Lf
z3_$A-ql`rtW-N~;VU%Mq3JX7W5Yp1)D8pb%eQ8FeAS04YcWZ$m+)S{=k!zsM$+Wmi
z#yOf?5sj|UB(o+6R#tWySND9li9GnvR$xc7dj`5c&U2;7N~eSj*Lg9wfH9R|<56Ju
zV$QSZ&*O_(Im*WZyT$p1d|#9VogR8d!%ByXX(xeHLe{X$FxAwuZb+v<95SOw2H1<;
z$(9A_Of8}b!Y-7Cj#gV>a`T)p=JT~>%hWVR<QEHhUJ1Py*g!0?g`iEfz|}sYlzb0i
zVx`r(6Kn{4$kN&1OgF)ps8tqvmF2BUJTm@}*Of4JX6oSP|5R5xmElKu<pvb!jcj;~
zY=HDo@!ks&!gZb9a7SaXdVGlSJ<P4kCsvyLqnSn-y(`@z<jbN7`Fgy-RPvR=N&?Re
z**s_gjebFi3$pao5?vBGhFWAB8(ErBz<adV(G88ehYi{Q0`*g99^Qf8sRI~Vbr4;H
z+tA#-35kX^aAj8U{md}FuSa>+R+QK5grR;POw9*T$%yM@#1-)Q1}-2pa)r<%Xrz^q
zg#uh27b*<p(3GlBrqw9+C(&1<VG)y~=?V0TWCjQAPI_PkbkB#JNautMnsj;G3iY4c
z+n>f%`CPtNSI798oI61RJF!3+*C$pe;A?JfMn^{n`uh45$c@tA1b9P3gdK}OZ*+7R
z%a)H~#rPN|R*qx!)D$+(%wo&zdTigc1v_`{#NLAkux0x;96E3i12nzfR6Saq3A9?O
z(d=#@>?3gVMk|f@ppF<|tIEf+^?t107RI`*VeHyhh3RQ8dV37$A1KG<dM(y$)3c><
zVbAVboI2TwvuB!c@<b5#@1$`1svl49#IS323GSRR<NZeteD-!FK7O+dKmL{vpMMy}
z+b;+3^>@4Q+RL=QOF`T^Zo{?xw867}+}xqV)!7_;a)h>b(u^N%Da4OxmOmtjKS|p0
zhAo1JrA@e1IEve<jX0S#gZ+h5Sf?Mrl(!u-4FsMyjS>AY+Om34ozsfeiZ-+vlSpga
zh-q1w7$HLHYJ@d4i0T{Quc(5vJcimNO}c+2Iy?Fl=*=x6DfZ1$!-~if<@NJ~!!X*N
zC^eW^`ATVW#fteY;!&PmePv0u0zX-$<%Mi{rbRQ#XT{9b5DXf;kfX*6x%?i>S96Lm
zZ!yclVti$>hCfRSR9O_@iQ#D&w+x{;tc*-%Mkjqdjq*sGwikrf<A**Nfki4;x*nEP
zgVIS+U}uE&*AS95P|IdJCciQtSZ0yP^e~w%QDETT*UJE6S`3XQhcPZkhMLgmlvSe%
zyTuCR{=DaPelCqJr|?gn&oSl;^pprY2fYP3^2{@buq!~m0=o)jHj^yh<qt;@i}PB@
zST{EIDZ)a=+9`vJ*}zDzd5rZ!hK9)VF{d{Qo!NtY#>B;CHrkk58Qi_3%!qlp#RP~7
zi_6(q$etPYF!BX%?i5tvdK7pwSYY%c%MwJ6*$<T~26LnaiOxaPFPlPa=`f7RY8YZ^
zcpF-f>|BbP-T}m$+o5xM2w4{l9xt?Z2g+;?7`%RXcnz8dhS1VKgj7=tj4rn_yD1_I
zjMP>WbP)yGGIU0wuAgbjXb&K-tWp8rygUQGDub8vO+23#%$NDOB_1p)rR|kxkY%W6
zqt(o~-G-dXPDbw@lm-T2ubxDtV<VD1oB2A|)BIK=%2?}4O~M?Tgf_AYrLnarO>IV5
zW;-hC_rTD&AJ*0*uy-7XOL?|;4#B~Tn3s~)<k_<unQ$Bk1z>Yj($w^G#UR5_=6Yi1
z#@NZw6Pc+oH`Y$Z?a4r68DA&Q-6WF!sntC%&wFM6*dxmeE8r8r$>3!v(&}}0cPr(&
z)k$Z>bKuPx-rU!vmv(%`7$zrIVn%spw+5TnZNScrTd{k~4(#5!8@u-I#h$~5aN_(q
zoIQ6A-OX(nY3e|&IfPby26d(k{M7kUw*@6ZD=MlCXk1~zn!Qo<jvH|7VmCJKiC}_&
zi)I#~rad3)5Bo5?-;NU(+Hn2$AkJK>#>w*`+<jPw+jkN;bWn{wI~L*5T{}%KiEn-_
zif?}1kGI}X<I697_}-U6eCt~geEdNQuRn_6@)<kM9<yWPngU!n=E4581voX6jqjcH
z;+IF<_{AC%e$wy2C+RqzI681uKa10)yKu5>H_q$#;5Z}go}y0dw%21{bt^X1wxYYT
z3SDJ=Xf7N=V`(26b)BdwPNPP{#)p9OlrY+>GYFKV;Vx!lSDHX&X&5!}PHbMk7dv<C
zSH^lN57WgwYUki5b980)ON@3NW@F_H1eIYGMZ72S)EZ=$lqkbl=6ZM~;)w~#1A7^N
zCRQTQBk*!m_$s5Zlox1&FXflwtJxaNTcTz}6zIrI28#kHdA4Mt&6ydODtMGFCuOto
zQdU5<%LA>K-)F_L#A{$l)I-lot@b38sci*}b=i6svdZl=94nzA<LL;wauafCbh!jw
zwuaIB8R-6KbrN6m#Z><Ta(`a<zG$}3ZB8lt_(j5g0WB|!iKsxHeV13jEK@Z7(i0nd
z7GrBrD|?j?dUZUHO0TR8B4&dUmt$Q$0`U~AF1|0qLZhc~sVxMB0rLoruar5EM|fp3
zvCN~5zEo}@7(y%&<P&<i)-a<gW2-favOp~X62J?kHZ0_MDDm<-^B!rPT8Hl0&8Qoj
zgrlw*4o2r_TPK1IP4Ls=g1q)7S$M*1fy!u7N|D}xB7+eozL&b5UNrR&Ak*3oliNev
z^C~lVBoRd8QP%Of^d`0^J-WeoHS{*dSm{m9)#4?-m-%@{789?%ti9*Nd{qDoN~Key
z8jERri)edGtQ~x>z0ic0DVu^@s;6PfOu`UfPQxFAHM$(e$T%Ci38=%<P!oEZ<R+A+
zw?S9Eb1vdG9e};<h|-hm?Ky=&|9Q+yRuMdz8YE>zzS0HFJWkE_`aQHD7aXp+>0>g(
zLW-T#cQ~9d6H;Pz5@Dq$^?5I>08PsC&!0n1>`saRP{!E_>|~aPjOml*oqBtF|7db@
z92^|{qt!hx^7ZA*mt!TZZfbG@Gt*O8yLJ|vHf_ZAEnBf?`z{>Zy&s479m3ILCvfcS
zS?oV^3Ma2!#_3BJm9cxfW;de2l|YB987WIf+1W`KcA|D=H+r`)5^nIJXQB+dPc-uP
z`p`OBh^FB}bQ6Yc%M56r)L`403?9F?39r7l9=D$M;Qkx!c<_2Nw(Tjw+6^z@)QMs|
zx@X2)Pwn{TM=^ZolNi4B4Ie)Jh7I5PZY93;9Vgy+V!->ahVkHX5Zk7Uapgz?7j}hk
zYSxU;&bHwvM?3KOstmr<k;MCqAy2JsxKuKPOBMTY)_4*}^}BJ@upS4?hOk>#kNx2q
z9BydGx@ZUbjRc==5Ur&{Xf+O^!#aREO*5kTN%%?<@DYA-T^*dcVdxjR73jsBP3UMC
z#qRCLaO3)Yw6%52JzuI$@p=jL7+H8z%QExIGaTA^l=kkAMwGeMI<pDIYzT8zYGlho
zmd^$-E0ACWQ(`RPCNWp$mdb2{ujUsi(32&rUSPzXx0vR_@&W<5L}Ot5w9)1QsG!Yh
zyo`2sFXNb>l{dsF7KEBMQtI}jA`nxa>FH${iZ6i@M@U)kZxLgaEYGy4)c9<NuyUo>
zRR-zG?1&{Ykh)l-462s$XI4kp{pa!=deU(t5ms#OrL1hiE(`N{Tujgl?8NLWtS2t7
z58+sh6`$u%2JtGB_E|qzWbH`-pR7hC_8ev1^Yi<5#&#8Bx0dFnV;t8KVloUTkCA<$
zp2q5kqbw<VZ8V`coK}{dFY+akZ4F{UxvZ@0!IBD|BM1L>(+~}~osC2lbnyn5YiV$)
zMz~s+qJ7OK3~btgrWKR$H@4CM<Ba-7EaLSmHX30I1W?90o?ljmT&B8gCcPyxZr5a@
z?XjM>wJWdJ77QUkfzewn2t>om7*C10^7v}*nQ*Q0JklFZq0H#SlHzhK%rEEp)GN@N
zPw2@KRP#%`cuB?gpiN*Q!MD)ZK<Krhz|#j!WDMH)N*<S^I53K08Peh#;<X+@S#UW@
z3B8igDilZ7K%Lk~+uH(dW*aIPbIlEVVPnj7wjG71`y_n5=V*F0O-K-YvR6V(lvJR_
z)<;SzlO()ih{PhwOo!)1qG*^my9^|j*!tY$<oG<k%5anC9}D!JkFOIz%0Og^vSM}u
zJFz!;W+$K%usw(0bE^~68yg!V>{epcs#RD+t6RT*odUb9+h}&XcVqwF{WyB)C{7+d
z#mIRUXD?sD;S1++^6pKXesCLCA3Z=X|NfaH$I%e0MciEnpRW#uRQMbpEw9dpNS_~r
zYdko3wh8NY#4xqlkG?e(XjoQ+>OK{Gb&Ij~P(98*oWkWN8}anx12}nS5J#@pW9u;!
zj$X3j&ch%+`Faz+_h~CWeKUb?zn#E0UJc-zZ-(*VD`tH0nFT-jo(tdp$c@LB^muY2
zfKw|BxIE^<gYh`tnQp@S%X{!%_i{Yu#eUhg3O6+?aYHqZ+hwcp(6RwHOw%}3F^Juo
zCT!Q%V4I^B+cVABQoR(@_7RL5$I(aV4cV8Y&pnDZdmrizortRHX?Cn!r48^DCEzZM
z!dIF?(8z1!YeP%z5H_yegSX!N5Cen53h2_A7{MoFC=)ai>8X=}s05!px8e1aMp6=j
zj46{%X7n__GFG0VGTNU?qVL?m<;8{kn6S%H>5)eW6&bB_56WdZrD7dTuLScJ<SS8k
zegSPuR(E0aRN0)+xO~cd*8=Hbb%m65>$C))%I=1m@wbd$(>Q`Cw)&9Iij_^PQ&tu(
zk|oroXq1@|GFbTefMJ>5CC}w#I-L~hh>+e^WgH#js@UA07G;=8mOQJQdwwZqC%}^^
zyCgrKF;)dFZAf6}l<gIiwIvfuucjgbJZFmi2n=LEZ%j7Su0yi69)8w6E#G6l*``eY
zQq%BM1fGV#E3?HIH3<w)6;$3TRHWMA?H)sDa8g<1R1<H+5*y<#!T1t^w@~MW#!VQd
zyU;nc4Z~XwqiS>wf35)~@p=@+>yRHzAvQ3E<-7J{+16czUJKL?7yoA^0c%zk6sfRT
zVfDI|V?mjQ;8W4|nDqF$UPOn`wnw6H66$eEjGkg7wilx5$)hs4H<ab5GV=ILI>Kt;
zp`~jLmB=pC@w}DOkj41SP?n%7nd{AcsWe1*Cun-r%0i=so*t-!Ln!tSF}e@X{Q6Pg
z8bG0=7bX0>#77AFSD-L3@&BXit>d#g*6wfWEv3a>f+QhA+}#~w#NFKpNpN>}cc)Nl
zKuf7Yaf`b<4yAs2&ROre_D$M8&-4ENxITN|nmc>;d}pnhYi2OWGzCK}GBDga2czt!
zVXR{T)Lmvli)v4YHdoJ=&<h~+%xtK-9Nb`KFI&5*ZfNPWR;IQn)`k+CvRg(}d$xwN
zW#RpAyQ)+G9tC_7VsquOz)!#^Q_Ex&R`TrRyQHt~pMw+se<9XZrR?I7l$4~vE+;1k
z)28L2u&@9#X3fByd2_K~;X*83vJ9(NuEB<No3LTiHf-3n8>{#3!p0AdVEqS&v2@o~
z1jR&QITfIdwG(vB+%VGA8SnGv8#9IHE=Dl+Gr+7BQ?Y78AOge3z(0BnLbEjxk~<NO
z!NU-dNvnEeFZQ0Cj}7~>aP;h4tlk-g#5^rTB=^DE%?7w|&L0;)^1}D0J@EZeCtNw=
zjGHIiapQz1u6^K)E2quy{YfL7-))Fb)>&bDiXk=!nPHQgDb7cS<CDoD_}Dof2eeag
zSaSvrjGK#t<7Z>n=p=k>LeOeOVbcgttWonq;b2STkF`Ldi5s$58Db2lAXY6BNfS~L
zI5q?^=82eU5($U#Ua;r;t;cu~dXAXb-wI|UX{E-w!d}}OM&le{Jkbd*c7e#vnu`k;
zzDHtGGMrtd+^v+bu~RM1s#<qVl`kfhxMvVbwhEMcK$MN-n#sN!np9$Hnp(=zN5jTy
zVlb`PU;=5#SXOQg$tBaph>5b<xFpi5(fW!#9**~E+oZqmb;9WF;Zz#qXyIgiIxQpK
z?54`<ERtd(0}`@d$AEFV1gI8z52JDzBug9(gUn&=NhNuW5PO{}>@68_Cg>Ev@m!|U
z$%t#usOw+(bX0M#(X#d&C0-@uUg`aw%GNU8WW3Aw52OkjN!V%9{TP~3wGn7GGFw5Z
zC%jhDpH0xoHIi$lY%@w=*b)NL-!(={AA>bywuGTF^-U6{-`6n28^fr8hG<fi>QeEV
z5{5Pys&7Y|=MCe)SUAQ{gIP$bvPt-B<4o|LmNoilTB7#^8SL?Zjw|cev_*)Wy9RdA
znHXs9f_~=C7;5jWY(eRokdANyFEnR5j9fk0V6aRyC7E0u!^e+hS&fAHBrPS>wB)%l
z7&vAGRp?m4SRHx<x*q>m1O9I&{NH3LdJWd^i3FmoQ>j{;kJo_zvkkc-+Y(3~pB@$a
z@KNe)C<wgwWl1X9-2PPLgQ@n0PQshR4e{1!GrXtnK)dda!TP=!ILRM<wfv}N{df#S
ze*&++b^woo1Yjrz=tW?FVJrrjBw?6Ey3)5d$~qfr1fPcU3{0fj)Am{f-O0<T_6WU+
zdKNIYbR_fye$L98bF#dU6;+;0Tvmb4Q8gRF$w`SGRoA2Jq+sRv&*$X*0=S8?vI>=*
zjN-a^xWhj%0KqR|C(5oz-Tm{iD7zlm{jKb>k)N*`oR~d(Hs&puhlPs?yA{i@a?NV2
zU%vsHH*LlC?YppT_daYoZ~&W69>x59+mXI<5qwkQ5tI~<Wvf=f+|dnVTqa|X>~)Zm
zf$_1C&<JvaZ&m=7txv}E>5iBhG7j$kgW()H5U!C!F(p<L^H)aW=&^P9^rNl#^warR
zyUrK0W?G_Xx&iiWn~aYS1>(v_5%_AKJMJD2z}@{*aBI5{igyL#`W8Q2+Y*R#OWbj6
zx*bmCyW?brFLs9nV56%)HdzH=r%@<Q7$)QBxD<ReVGcgiT!4>trsIqO6`r;i&R7Rx
zw}u<mjc~$RHDAmf;eweOHkf7Zi6mn`1gR5x<H8X?F%BU{(QwzB3I{FP)QOV`Ja-rj
zAdrTd!A{c|lZ^vlr|t%$QFhQDV+&Jl7fg1K#DaNi@%iW9AwD6IWhh+{wk&fh7F!FY
z0hF6rY>Trrf}-}U*s8Mv*z+E6WZ5fQ2a1(r#kRDyR~EB0w6KOQTltClW~>;dP}iec
zBg952b319HXypb;S_&<#<nQ$vGLp)#A9@iqy=1^Zl$q@OHF_fLj~3pNWy(iMMH$18
z9sI{rEfAWt$kJ!{YM&lUDj<_YU>R8XJ4dePd6`)v{g3}UUZZUl$o=n-RFroJ%=f4g
z2NA|&#!sNS)mLCA`^EgV1QY9sRCc9q@EY|r404pN98)t3j8iA**Z{mgT7xQ<AfP=T
zOa(T`*baTP%+PDVc=URIEM6TT3mF&?`n0&Tu!9V&G2US^41D4+$s?B5ZVLM9I-$S5
zGy3T}Vt}zTCb$Gr(Zs<cV<xO)vY_P~3C+n7&<Tjec&|`Q3W$QCZ?LkbuEmr9=u>Tu
zAhd@QdZQ<5V$4K!jGUkbwTa`E&@tA-L_=K+Q5%UN<3#OgU=l@*G0WP{hu`7q0v$Ry
zQG3$Wp=(52&(~!=J^`N{uZ84%%EF~Gb##(GAwF6U?-O)y^_{>*hmF-h>Dy!Z4xfZ~
z#u%f&rX3qOabh0mt3fq8&K-T&c=y%tRifWSKlCFA`)LQFpKb_ka1;g^#bYRK?r`%|
z+W8EOwaJ0HLjg2hW>f7ggpSX0OcLmgnqb14(HbUJP73&}C`U!@^(Z~*lvITj=-;78
z_J2eV@Wk>;0<nzb%HW3BTgk5r2@6GdByDR%IKm?$5gJA{7aonMh-gHHMIwTBDw@y7
z#l|2ZAs#7dgk5GDa&vPCA%R^H=FFLc1q&8n(c;Bev2q1guO;j@ZN!%C+p%-!ZtU8-
zAKMQd!RF(~vH0MAgv>62S8gWUGm~K#6$(RlXQWXjnz_0|!*43oLL)IeFbIA5|F~zw
zVf%+WP%t|bY3X)I&2xm?R5e)p^oQ-_LGTRL#H<C;_~wh_ID336PVCIbrp5kPG0Pdd
zR{7)OJt_ENe=@$<5sV8Pz3|0yN1QG&#^IDnIGk*NW9g<imSKVYN#@udW{hoqrr77_
zjx7$pIO3j;<94|?pp%A=^@{Ml`BGfaosBc&Qt_c?IF3zp$KHu%*g>VUZmcmjX*yxm
zSa&Q`bHO|v2h29}L85LTLdFLnStkMUdWo1~5CJ<KKiKQ|!baT_MuVvChFZbP&=W!S
z!En^^g7Fwf7*1ddY~~L`12;BaQ?Pj1CVchn6$A!{uq-VIK2htEl3<~9xcBr)SzB1Z
z-o{2rqE!-UUrv#-RpyAQ)Q#NqD(Aqz;pD2^UluYez{U~A)^?a=W(f^FBaGFWgfW_`
zrM`xa8K>m3$!M@Buh$5gSKl8<0MP#S9fn^02*m!Q@y5VW1ma-4(nn@Ls9^V6pTPtZ
zmE8OMtXNVRU8MzmlOPk|NhtG&88YhH12_pef8%`_oDenFhatzmL;CP!lv-^2aQ;SZ
zT|L6fT!EWvI+mzlNyj4WMB$0zvmx|sMDek1@qd=7V(QxZN-}YO+307`SoBplfT2$?
zi~}NR&7APAt~uV*vA`=Mv{m5iI~H#Zl^xLdc<?y%8a7@TjUA-xg3;yy&~OaLSchN?
zGA9sBT`}4<P}%Pzdg&$v&RP!J#5`#G$6&CnH%2=9W4tVZ?@POAY>9zX;(9J@_}ttv
zTDm3(EDb|FjMLFnU^iB4JXQKAmeE8^GS$a;U0Pdh4UC#N0fu%qFr(88WPSAr3dBSc
zS(rcrCanLmK<^}Df{*u)^aCqw;I-*VQIjca>awzq^gj+D@MH}>IS$gKve!i)Nl+he
zh5i~2ygp8B?CkOOC<pWz>xTEodE#B(`~BFEzpu$4{NAVf>!%xvfd<jEx^Wmnt2@dv
z4Qe*3WMvJPnb35f4=tZ%(Dhq`USqVJ`A<4PL)RLnma=@b2SG)M@kVwQ`%0jRofYLK
zuoDGn#};28=qw|?GQ7@^bD3=+qr5UWArt=t0s<7}CMxcq0}|3-6c!$jh_FONhsGl=
zBo>KbF-VS%MQU;a(lU~fm6wj`Gx9LKFb@mn&%xq_gdKsmW{r%*uEVBHo3MHFX6)Lv
z3w!tO!~VSou>bf6SbF#nV%M#PPGTJNQj=gzTRhP}09up1*)qCf91C@lwI$S9Nhdhk
zLz9JMLa2Xx{ReE`u?j(97Vw;^4YxQ-^blRR`e`CM!44k&#yomq$A;P1vtkBv{oPSG
z)dhtSme?@E9~%q9ux&;tRuF2d;&iYrc@lOf>El?YJq{*YVOyj&4rl1&!eT32U+0b=
z7I@-^X+ikZKNO!hB;lNC9=<i7kI!_c;Uk?SoYEsmjlFO}-3BK#ZSaAiGj?iQVx@)w
zmTDSexrQlL>e^$yZ6NXqu%rnANS%~~(6Lbn)=Gfm_^EIw_-rS5!F0GS%m$ew)FK#D
zOs2p=*B!be2{NM~IJ@%((V}fKVcB{IV#D5pIC%0D%i39KnI$n<`t5AxhLy(zm@Kz%
zZAsfpu*rh0&eH#DV@c>)!ATZ#wK3;SF3TOU;@gQz<hiIICGU*)6kC1Ou3IwL^)h9H
zpEofxfvlaVuBn01yoZL59)m%{M&do`7aTB@3XN9vy+Kr61M%v+1OAHF`1tic0y(O>
zcl+Vh-ZHBCH{f~_sC$yAWw8Y5qf`3pdhD#gt@pclo$=Z`@9@|gZ@k+Z@AQ2i{bg|s
zSy^ZTm7Tr;%q*;w#W-E~Kgq^ZQb&}LZsnZd<Mpw#VI{Y7<#lm|uAUW!%T&41lQ2xn
z5@QJ`NvC;_>TR?&8>E0}n1>}`ysNJ=KrzC~8NFG~y@rfMul}R(8e#W3<5fcPjiD3p
z)<|8dK6|{bW`$vve$eoVfd-Y`SpO)Dql&Z5EJVu2eaPH=5Dv-H(O(u>aQ1^<aICUN
z$a|(%cwbu=!>CdxSeUciy)c%bnrLba9dl!-8)#$PBn^zyRmX%$nwX%gLEvdX-@*tc
zcIMDA<27J8+j_ZUYHSoHhXp~;(i8?(W>kceV90YFBOdu(mUeUk3iw1Asl<`YEHSqs
z_;gG#aD)aMld*W4fPZ^175oT&^c!n|KBFxNdJEcJOR9Vu^d9YiciBk4JBAH|nv6<Q
z`LQv7Uvmn<=TGp_@)CT545Bd9L<T*QX>rrpAmn1aa{<)d=Ri}LrLmmXb-W|gw4L}P
zU0`hC%$v_ixjCi7)mc;>;U}_O`tJmK)?#^G_&k5SjKE6rus}~17?7D6@|-{}I5_yP
z5!W8*$x$AMg-0SJJeJBY5iyL!m?WgfCnGyG4S6}4$e*5r0s?Q|!dY0bkm`=mTeE5<
zwr<*tjq5imHg?y}J=nYN0QMa@g8e5?VAqKgn73^!{ALtFFD@2(X{neH8;j9_Q(?rT
zbzBr2qe3vwSciqH3j=!#7L*ap&5W>q#ZoMrKNtRfF7OUAfnBf;w1XzXDa#R_aTbV3
zb3s^=8>WU?VMewO;=K)!;c15pTUO%9olo)P?it)Uy#;qpuEW)X)A7yLIQ+0B0XMg%
z;@VmQZI(YyX4&D>A{$)a;Ec~_>fmtL2<&zqh2zdPIO`UIk1a#-txFotTSw!Fwj;iA
z@xV1tZ+xI(hNBa#ut(hjtJEf8tCc<0S=(Zzu?1F}5xVLQD9{c-wpKKv$HgGVAPc@a
ziSRLqgU$HKFduG*Df+(f*Pns_BR^P-v4Jtm+=9R}G7n)3&kAAUg7LPla0-vYqRm^7
zoSRE1xv=c1UZ`B`SjKh)p4d0Bw#pJl3<pU`V0lZfnWSDg%9?IYyl-4Z@tg6pRtosU
zDLAoFR2F=dIhvvt2|GJ^o#2ztITM0T{0>oz*0e^l$&{(7IrI&Tm6;3Ur~*fhRmYG~
zV_8u~(_*R?ICzsPPzD!NA?YKkSrD(j?0@^;(F4EN-t9|Z^7!(xq@O6cbyR%s_U((l
zvS{a^ffzDkIMgPLr;VMY?Ea_BH<dZb1e=qbvmQt~k8}k|<rYh8Edv!YCsv2neY6&T
zvkv-?(Z_pK>3uXTX+xbbz{Cx`^&K$M(O*$wx`DA6>k$b3pcq(3r$WmskapJ&?@Ta2
z@9_qBcY-0_Q#YZVv!Ft=!$>n)JUtH#FrNZ7?<m-0&cf8?n=pC7Dp+S0A$ZYx1kYQG
z2_B&sZ0SK;7Y!o{41>^En9$*jHs|+fs$-0i0c@xE(ET`Jf{`B7jj8sGbqPK##zaie
z)4&+5u}WyM?igAd!@$}ER&Fdy4;KY^ei6a2^K^!;xgm5+^kHOWLhu=5l8GMl`PiH;
zL9UOC>`ExAn2o0a;X6is5(W&Xg_qn+s=W6Iz23tOm2D&67-EdqhnV8cVHS9Mgf$^(
zi+8E|dXI5H?{RDx#=8-G-n=dZpJo8B4Hd6`1cn&JV3>I#8xPuCyDW@#%4dTz0~(&9
z_LfpHkF|rE7Hy7&C*`OcZ&q0-P_<^As^wCtK4ri{K*w`|pE8)iR!ydt$#gTxvy+5k
z8GI19Nxz)LKUJQbON53~?L`rOajd8*iAcyyMOt1K@}^J2teG<~Z}x1gTDBAm=gq^q
zwQI0p!v?Hdw;o$JZ^fSNdvNObhuF98D2|=^6noBmgoOw8AZqm@n5D*2zI$WhWIq;i
z2>R1Pjq&$|QG5iPQ=*{hKr3Z57Q-eCfTsRfsu)WY<fP)tm*=r+;as?R(Ei2S!zfA*
z7SrsJax51?>w_?PmLn2odmt^#8U?9Fn3HOPEek_%`Sc3Zezg<L7k8rl%N?jWvk2er
z2*JfI0r+lnA}+3o!^K6xIF(_C50WjhD`)~vrRw5P*eD#EG72ZX)p3e8_hf)M&L!I8
zO1?MFhneA9e{<Y)HOCJoRyaG+6UQ~Yalp_Cdu*Ms#>yIVjZ87m)DH8EJurhZzep<r
z>6!@$*GPh&UK$)GM#5P)gsLtSkv8E-aEyePwj1mTJCo70T!#KIFq?{rmVp?kLzQRE
zO5rvcu7qCVw0t;vcq$E_Kus2GRg@mVWh3fLmgIMFQr1+nrkXJ_HiLnoF=XDTp@}iH
zsnlfMHvxc>*T?%uRs@j_ehaaC4s;W;G?4@EF?$*0;&<5!40(U?|Fh>~2R`pWD=e$N
z$ltKAvV)0*wX&>{u7MG>^b9ag>!lr5n}7i$#^C)S!>JgCD5)}U%b*0+o8-?4%zE<a
zlr)o<@m61%FRXg6gv^?FpZ0ZtER`+`YOoQXFmWREjEv}jES1?Pwqj!mJXy*~N$O`k
zkZWQ|&|5fDO*`7ah?Y+Fh#59^oH8Zv6)I8bcYJ5GHr`OvLvI}`47T=0Z&Nq)(y_(s
z#?E-(ZVHBbhC$6Y8Yc01aL%3!$F!L+3{8cmPZTD2Mqso{Forn>Vu(WkhS&vSxOFhp
zUE}$B5_E$z;9jr{nR`zmY1?5KC*(qFY61+y(lEw35W172U_3P*o*9M6-MkkI4jhF~
zLINhz@r;#ytSMrg{3m0g8KFg38Q4;>Dd3w3HJ$MoqdktwPaRq&x`dk}rpB=@C&$4r
zG8Fz%p$Lo#gKv1CA|eb4L{nRHSUOt6h~K4WtVc1`qxfDn400`HM3?1kU}}xAng+@o
z-TqXteTK=>^4iM!d#?`E!)t>K2|g1lT(g((v!d#=!`ox*@y=Lhyfe<74T3k7p9*}U
z_6F)lQticIxJ431*<?V?Ax~L$O5J@f)!t(C8amz{qc!cIYb*m9KD?p5X+NZuS4s6A
z=&59>jNGYyw2Z(C@MIwYeSLjps+bHu2<SxR^#JaFhobx<!)X<x5t|l=#5@A8FdI3u
zrlV-?ELzxwSg~j+RxMtRbt~6k`?j6fzGD}5?cR%HM~>k*3+y1{_}MS9<%2VrMkspa
zq(LjnAA@}8`>oCKKEcqRl}0n1ty@kuENGQ&<HKO)V+VaJO{h;C1U<cRD4L#$8{dD8
zPfj0)qpK~9+)ZGc?h1o!{*TN2G4*gdT($(mbA>l1ryC(L%>WDYO>pac5w4$~hp&$p
z;?kiR_<mO&F0PHkcN@cTeRm*%=Z#adU9l=y3!lvJ!roYY9FEh)fk<`i4pPIRpmDgB
z?~LzKEO5EN1;-P#u_e4eKF-y`g?Jr&?{A0;`nvddq%l5Y>$P{BFV?EtW22=j7MPi%
z$jlP6Y^NYY+Y@;jQ<1M7hb$_*c+*^X8zjP+g&yh5>*|$;WCAaMiokZ94P1?UVP)VC
zePe%&F!aDkD{qXmpRDWyVd)zL2mh&X^qx#85s*}Q%DhkMld~6f&wIh$0p|ARY{4gC
z{6uwS+48}|hENqz84c=-{=@oX@Tfrqk{U*iACB>?EP7^otORCCMKHCsfEg8@xt%Rc
ztqBW(nu9BB92qPx*;-Kgza#{La#S{<At>$H5X<0*YSO&}Os!a6me$ZWvw$|CtFEi3
zj1&(iY=+Pl4;(QPvMS8`gNINR4pxqR2{cs<<Ljf87(8l>@_tzwfAn~DWmzXlDAtjU
z?P=i!aJIDiR{Xn`3=3WpGpbb^DqjoM4-@_`MzrU8ww6#gF~Fd4YItJ+ZS#PE=p}mx
z4;+C$W3;GB^zqtQeY~wJ(DT82W}bLO#{s>xZ1JjrE8eps%sir?PJrn}PlrwVLKsFB
z(s5*Cf^RBDy0a3yQ4R83Yic$|I>kcGErAVtFb3EKK_@H|A<MQSVbei6n;e)Y<in?6
zDO}TM!7MCQ+3?6UDHpSL9!B=c_3(&_hRqaj*fBh*)<RR_Ve9P*6DJ$!5?<P7`p_`c
z##q|i@r0iSRh|~h($Uu)N%<Mboi{@PpLd8q9DL{!Cey~7>nmzdhrl!@{LE>EO{w}s
z?HQRHv7BY2Aqpp|Jxex%Ql^r_r*C4d%sLq`LJRK>*2EkAHSqcXExa~B2d@v(r?n^e
z1{*Pq@#aujJ;oAm$&&NZ*GH?{d#neQpC~>*^wkbR|4CsOWEhR1W(hCVUN+TU0VcT4
zhNkC2^ctz*h!M0;qt(q6=uPntrF>T{5~vz=5=AHdFTD9*hJepn5)#LbRlp~Z6PWen
z+zI&pX=i&Zt{er72?=pX%Sb|IZYpw%a#1+92(uT=#p1<_v1a*dY+SVgo7Zf{zFh~f
zW7l5nI&c7ojvvQiTG<aze1H$np2xBeP9t*VD%izP!LgMcWUPh$mU<X6*#(0sw})kA
z!D#Jzn623WrxmLZv1|#vqeH144B_KwinQo(e1G9f+_`xRJ|Te^=VF4fAvVx2^v8rE
zFBq;0h3Ao6xb4e;@0NJDC7WW_JTH87I2GkzFGT721t|V_9!fu&i}DX=q2x$5u5Ag&
zwe^#6bE6MV<?G?YLJRClGRE;~wz$0B70af)f-M2V@KwG8Zp?PZxeQ&L%2vmkkaw{z
zau6;SIpA`b1I`+o<CBR#_(&rhhtxx`ODi19Cj??4ZR#>tFH9paQjGl&&%#aCio!J0
zEToubAV@D3k&c<j3@Aj=)agjH35DMTd$_7u!A_eE87s&{0&j?+7Y0~+VW7RcveEWL
zXPJT&M7uH>CYHPzMU6?{8<m~|%UZ0fos+cO`WUS>67LP`jXs0kMBkxrLql&gCK?bL
z6Z&C<dOr+Te-DGy-o}ve?_$V=cQJfoUyRZjh!I--F-C7Fv@FI$*GdPLE_SeVc7g@f
zk~!}wnc-o<`%N+jBuQRbwL^eQu=IpH?g;^(47y23s+EN@QcN{yYKBQh#?a8wC787^
zW`a6KjT=v(O;FC&)HReTVdFHll}&XtWJ^;$eN2+=OZi#Z2*-?#ri{k+{0(y|R8xi}
z)vvV-$T+zX;7%~Mal#}ETc}MkROYC@PNn%uzd@|5L(!|xAoO}yriqQlJEL?7Jrle+
z!4$7fFvA-fY?O6fF+j&1eJ6RMuhA5|YwC}8ErZeDDF&maWMV>40Rfkbu|C-t?Vf?5
z_6Znd6@wu*u{@8%xXIa=7(5;0{4*J87$1-f>-0H@U4H<9i#8#V@SAh;bEItC5A&#W
z7zf3|DJ}z1GZ(>nY8W)EEMY+9W#C`~Q`r}Vz_amkf!ox{@Qn;&Oodm7FQ0RQsk04i
z2{s=-mq6gnU9%LKv!)?1HdFzgw&^5Z2Ll+|7-O6+mEpuO7*B<$$LEa*P+g-*&@s?q
z`Op@#(Nk=$<nQv@%Dx+xypB5h78o&Z5*52T-g-GjO?Fg!eV~9(Pl4ZS1NHId5F@-j
zoT_iQ1>PEIgSSUJ;+-)r1fM6~9WR3$f#|0rqqz}e0&OnU-e}uQs=aAcdo!WoHV?f<
zX*y!qI1Bn}9m-G_1ct=GXG$QIoNAClCJ@VHXld=m=15raIjxrq0(f|MDEW2*xgJaV
z|5AEleI?&6D>DPra<foWFb#8N&&0w7^RaBnQmk3I8e7(H#?CD}acJKW?Avn?2M!&@
z>5o6bkq=JdgO5(*!o>^Ndh{@yr{!VLWN*A<Zi1mKI1N897^TF(Y(WuWw+vQ?4q(#O
zZLr_551#9`AdKqKX7Xe>(~5-nxMAyxW$0{fM&7KsYz19lkQWR6rD+&EJrE=2PKD<3
zU}!Dzhuh8!80NSleqJ!X_-YG2I8cal`wMV(dlpWuj>U!D+4$jT0nV%m#wV+M@y#Yb
zTv+XhV>2}|!@C#uWLx3u^-=hK^JIKH&m2c+jgO@m<7Ad5Zf|u&=@B;^o~DLfVOluu
zM;mNrkF$n;_=2s}DMD-?!M4pP2FnbRvCuLe3ur?#tsM}k>5UlU6y(}ZN2+-mqS>;=
zm_;GME(S9KauH?h3ts}ybDS;Qb=_cL=nZuvZ}c}J>>L8omx{hWt=ec-U<?0Xm^iw_
zQrRHLl?|Gxd+E=0Q2KSH?@d=<1A|8OMW5ksV}RPb7&^W;hK_p^MmB2Dvl@wE>Tg0#
zuMgCX`(muYyBMbRI)+Sq6>3KBVuE>pj5d6i(FdcA-p5d#K6r1;yBMr76k`p?W0H+N
zOxX}wI@3lvTTw-^Vi2-6w2QVZTS=0aJt36fy{F{9{R4m!{{*7|Q|7Ol({h`bo6~Na
zDq9`O(b&uk#^x4Eh>{b)$$KT)T0%Z6BhCUmOItf-moZUxR)n36iwA9+s{%V+OWAnH
z0{zENQkJ_Kp>NLr%?d+wOweZ>E!s#;hL$oV>}>+?ZA~k@u4#>4<IT})v<Y4rZi3!w
zHs~{vjkFnUt6d}pImV)|Lo9}PW?-0i4u*PUW0dzajPosons+`%xMpIYZ6e+_4P*Hw
zppSD5-gAzGR^&9;W-UVS((Rag`YSB@^gAp$cL{61zJaBmeuF7R3t=9S0R1UJaE(b)
zru2<BG3Ir*fR2?Z%w5^&un{tLw1T0%1=XGvEZyzl?C%MeKp(gfei4~T$X_}SbJi`#
zjFk%&wKpXq0FFK`RD@atpDwIjt)XqA4K=MX7&&1C75`YOaBad*j}4J7bPcHhX?0~^
zYUw%?Q6+uEGSAq;+ED@Dura!LR|Yi*zBl@*<F&r3UDaOeufx!VY%=uLP$Si}wc+N<
zAjdldU+*!lcz2u^)t;|1s3B=A0}W|&U#h(^b~za5ScnNOv(amujx)w++9-=SYEpVn
z35Y~oLK<XAXw?rA=t&+P6#!d&xv|W6!^o7he*#YwoQ%}Uh^?H9s_W^?lXIC-k&uvp
z%*;#_Oq-55(`RDQjCoi&e<{{4U5!nv)??eoZP>MAH}>s2fP)7Q;mDDr_~fHcaQ5su
ze0=sZod5PKtlhH>rv6^&ZNZz%%>`qlqhL@}2m>lP{SE72ymuF@jva;J_8qX^x*g7|
z*1>Jo0vPxO!*OajCWl2Mzo-Z;wWT=o(Qz32`9eE62@zj?42QD^pm%U31}}=mfIK^l
z&N9QqRBIF*n1hPWn|SuD0lz=*#EbT7G~GOlr`6w}?cO=mTtA88%R6xGt0ma8$rE$)
zM`8a4BkWyfgmastaA<}f76<9#NWK?N7W!d-t`!c=pMdWUnxOh~cYL|X45vI?amF+X
zpIBz%lu0~3whqD}V^{1lpNu62e#p=YMutfk@*Uh!=<bIw!w5uLWFy%*4^igP$Z?9p
zELxx#4SPgs*ui&<1>D8b>UqM#XfmydFGkw<qmR8G-gWasA6GXFaib;inL?Wr03!=l
zcHVeSRQYlfN`kGb@0Qk0Ukf8f4M4xaZ(-2bJ{U7;00ximMbPzvnbQbZxQ>R7Wnbvo
z^@oo2d(gJ&11-i_lh-ib;!SAU^ycv`OtgCslUxR3qJ4j8*baf3`9KUMC<jb@3j;OY
z#t5B$m|!#lI_BeHVyjDgWI$zL!h6nw_n<W$he~xyR)I9cHoPa5w3wHn<PeGr;dxI3
zEz$Gb-|tb~$Ew9PRDIPw?~w*wfgj%^{g9&a#L}8e|C6O1CK{SyjIIe@rxoin(E_8a
zJfZIFk8uuDFv`XgBdk3z#(pwJ+WKIm-4u-A^&V>Hj{!Ek?kvkUO}+7!i6`FBvO#ZM
zJG^h?4lTbFSSHScMbcu7@y(&lO(*nnF^JF`<e7tEzSA*k>I|qyFN9X&QfMSBf_mb7
zXe7?T1X^LO*aA$9$cB0945aNmjujWKAaVOaBy8P>%)N)<n4OPt9zM|angaJ&!jINm
z)7%u3tj(ZrYfiX%D+<ra&r?x$7H)*S3$KG08$Le|xQF^9BrP5}i{@bN#+3x$GQ{Vm
z(ArL>(sLs8CSsELB$%?SES$`tYpM+m{RvQ~>YJ#q34PjPLo)-Vf6u_wfPgb+S&}DY
zi-;;^aE#7ER+G`1WR9Vub<k(v1cGlo-sn3XulJn*WlPC^w7vth@z$V8cyq7{ezH)(
zJ0o~4M>*nMH8=DbPn#=A%-RH>UML2O+A~kWNUIDbS$VwE4D_04;08@SXAD;}q6IL5
zyLT}1rq6~OFOsU|6xa#;B)xz)4S%4Kv5BGrWfc@rZxRB$9>9sJ>w%se<Kp6wl9HkT
zPZEr0&zOZJb7*B3tiqNR8?kf!HtgQI6MJZ74;(s(Bgc>71eM*nbLVjG)3f-LarT>U
zFmwHCjAMcHu`^}CN5g!<JeaRt1D!Q%q07(f?LGkQ?R%iJaVy5po(B_#aY{P$rUt`4
zA_{i?K?q4q$G2Cmp!DwdNY0IgR<JKj7R^FN)iv0CaR?I+u7KUiWiVY52;(d(<m{S;
z+nv|(?6+3n&u94U`7?Ak)Z%{C4d7uF{^+X2<EHDl@X1=NTo#DTEG;ZqVuGXl-LZGA
zJyv9xV_B><mPT6O@T?$g&2h)ZJac@w-5R$)os3%tyl^Vf9OvDm@v(6Rjv1xnxK#>{
z+6H1jE6qY}S1hE0E40c+x<x$Z_=Y3XWip~HqLAudfJnPIO!v#cqJR{{YFZ*>s4jv=
z7$T63nY*zMtgNRJKHeB=F&RT#rm~^%#oJUYgFM}#=HUjHNH!X5`E6*|oT&0-OFvOL
z)=o~aB4G54^e}wnU`9V^Xpg{H?Lp`_suzZgV-S3LRs&#QGZ0$lqVnFt(22b;O0O3z
zyoO_v13~EA4~A|7q2ur#bR7Fa*Qp=W2*NRz??BDE560QNkMY+1Fy3+ie`i08))PC-
z_s|ln>k)tkV_{%C5vFGP{B0(%wvr9rEos4}Ar=*=YLHcKMD9}pOTeY9ULwz{x@LNw
zmr#`A%aG6Xe6A-1esb^29w7ofBVH3tePfJN*T(SivWz-yzJ)vb8@u2=eFwa6;D7-p
zRBRR=7-H>>;Q~7cf7(z#40D=_F`iKv@0&=)mIO7wWK8f+f{tIbGL6hIBniGVRw8cQ
zAp|Yk3+<Sh80?dV{+?;*=b4Uyw6Q}1reSzU0Vc#RfL``07*1OY$3;8gxL_;Ha+g3Y
zC<Sktx#M*sM@$TggJ<Cac+OzVoDaLqTnuC*HbS~8SWn$4KJ@tiny^mG2&|o#D_jXZ
z2f7~%7ykdAj<EKyr`ofGjgK=zQe%<3a27J>%|KLcI)VtlfW#;y=cmIvcrxrf?J?14
zA`Gn!*eK}1E6@Y>9@5{cPuNl28EZ3i6zCaS7!i8<(9zR^B^9Ovt*#@>QI@|~fzO%B
z+@ANDDf$mpr{Wujw`Gp5GEY~vK!LI{%Ycc>)U~$<Pr^F{UvFC7-ZHW~#)&H11Mg`}
z#(NV9KHaHQdl49B8jq2dX&7yj&BkdudTHu85qgp{V@vrdBT`eSgyth6CZ2`QAHthS
za;QY@N&lV|?SP4iDSUi<{#y2_2XZ~<VsQn6@|>tXv9uW(87kn-o{fd`7h)CF-TD<O
zb+=p6PY&$I{-acPr;p)-Pfp>ZPtM@$ufD=p7r(`+&pt=yl4TfAJFA`$1>M=xVZLPp
z40rE<_QCxyJpKV}4t@yhEr(&SXakI9ErVI!3|JC)E=dV+i3vkMZZga#J7fQ${b;JK
zz`ot9;TGn?meU_j+ZQ3}+G%)vwh5kJY=zt5Ik3ug#)5tMIQr#o+^zW@O%2zm4zHoE
z@;XXye2L~;pQGdEIdtB+fZ_`Wv36EC^5QL#9jA{i%O~UD27jz6v_jrgbu3G<!uot$
z9Hjl-w7>;(GPUu+b{~APF$CvQLvY+Z5T8s*$H(3oI6_G6HuJ+yyI8Cx{ATDTp+GMe
z(+rA`VvvGV#{i^x`Xa_=Dzd$jQ9#&bItL;~%MziZ^bk$^6shkDPYc4_CIF-D0x*D|
zdEe0&eH^{<K9$@+XJ@E;d%)1g8>Z4pFQcaxws7W6CmX%U1bB022TU|!<<?NcsIkML
zHcmEO@6X1O)>iFp3{!g(BgenZduIT~>%50i8gCI)1lgo_pkpx*+Li-h=B5S%$6?U0
z=nDhq5is){3mv<`RDFFg((FwPH+!AOH=t(O8|pUwp>015lk7)9-%bsNcH^OMF%Ehr
z<Dg6MY3q)K=0r7UYts(vYQlh4$;8x<P&cRjw&A@=z!6qGp<35XLLRHqt|U}*U+G@{
z@m|&Y`Md<rRf^A6StegYM-QXMYtpW1DZm?{u0t?W?OC{Cu!$@B=-A+m@y2*%j6Qme
zF~DmROz?)L1^N(%efj!ud#XUMNR0Q6#(1At7)Ip6BWpgQW-dWQ;XF(&n2V74D-gYI
zCqmZkhU46|Fw9#FgWRRi&0GYHq&XNLHv{9N3!%y5q+~Yaw6gZI*1>(jCIqe61z&!~
zBr*e5v6+aVkceBh9-#}C!7Yc@*ux7W2$XRw)5(bmm>d<Ml&ys`RkN2HJp5^2{k&*R
z33nNAa<EjO7Z@9Y*z9CXBk*ReUV>?h=OTW3Ho`KK6{{N*PZyUE21jp47+C4U*3B9&
zlbzt^>w=)D0NBy$8dCkK>x_ezo+cX)HdanHFtd_eOhVF7M`^$u9at8&*2+v3D{EWG
za$2Sq4j89lfWad)@s6y@^4>_i`Ti)vZwy|4UqVgk_j`AU4&EX7-W@KRh*R;6vQu*S
z-dFd+yOPSH<%j-y!5C~5P4Fcs;8U~BL9a<BZkS~3#vkU$R$WHZCL<?*4&sy3mDa8Y
zdeXAW&0%I{4r60e#m4q1IXMdK<S4M~fuC4f0iGzlY15`D3j-`%xDd;hEyKnQo3UjJ
z?d<N|IB@tN4xc=NqaPA@pPs_UUwn#-m%hcBFTTJ5s=VmA^D!wV7Pbo(z+u~FnC;mP
z{X=^(jz`PSK7#wX^YGkz3=WId!(;AhIA_g-YjTz%<s);l5H~Fy7T$Kqo}GaQ-L<%L
z?Hsam{o$Qp57V5<uw0S=*A3YSKC%$8XE$K#&gnQ``7wUD_Yq1ezsLRV8r*L!MOWiJ
zl-~RrcQ2pE?F%PS_Vr18dukK*t<J;%!azO0+%z{#i?+a_HHr9eTN)0ooQe$pF~|;>
zgq7KzSe|Q(&GS95ex5h-A|_!?nj?0^dg5e21P(if;Ipty9PtUmF&|$Xo*aU$o>^FE
zlaEa8Jml!lLB8R9WSV3m(>)kjzEd#Gdn)qW0+4Luh8Pn&B$_!RjS3^m$`{T?9xx{G
z#t?Y#JNl!yE5n61fs-qSxe$1?(x#JrXklC+8~-@cS~%I*!_k(ih*r-pEEu~_9>&J~
zJF#~A2CUn%2Aj65#KuiauwnB;Y}m38>o+gL#;r@SYu{Sz*h|H}YbmzwU53rO7h}hv
z71*?QF?JnWg{_B{V&j2@*mZK15*rUL!uo^rvHH*~tUOeNRflF^&EeVDaBKlKA76s4
z$CqQ<(G}QvYz@DMv40JA?&aUuwHDiU@VH|gcJJPdt=l$W%hnCpyk$Lu=UX;n`}VDT
zZ725a+k^f4_cHb|_Ve*R?A^Op2|1VN<TZJpD&%?9YkT;f-8|pCZ5!5Y+Jfb4)??}F
z4Op>ZE0(R_f<<dKVd47iSibuh7Sl>D+;#+WHtfgDwR=#shOurRW^6owg7y1RxalBf
z?>vb`2R_Dv{Su#G?WwPE;L960d+i=RzH|)-zxW2*&t1gUuP$Tbx7V@e+nZSZ-CeBt
zz65J8m0{hba;%s5wiIhF-o=W~Z(!NEt62HvO)Niu73;pZh8<ts!v1e=;gdVnxZK!<
zFUsn$`Qvj~cj_Z7+`Au(_w2<g{!a%#K7&0J7VEZc!uI{Uu=C&^>^{5?I}Yr|mOb0B
zW$!lZJ-#2uJ~^qx@w2CK^7D`I;l;B!^Y!OA_0@Tt{PHYLvVQJ6c@W1wI*Of#wqwIi
zmdT+VICx?o4u5a}XU?6%-orbwar;`VSi2Of)~~>}U0bl1e{aX`?bx_^9oDYnJ+g5P
zwzBNDZQaE36CP~Erp?>1VbgZ3Ubhtsm##(8oMo8K1|omfQcR<mnKo-V@))@@B~~JD
z&IY7Rr`xj%rp2|!TeQ0GQ1Qv~RDCpk&`&o21NFl&gf@4Cc{0XWXP}png(oGa2W^B4
zCK*a4@J32TA<{CZLFN$JD4ocXM5<~%1$Y(~RtO1E&5`|oK~G*2TPyQs1$Z-O&QyT6
zVufn@*v4%eu}v=Wkv%wa@(@1w@F+h1^fb<2IEODUeS?oLe}@B?zJpa>7EI^Pg3abl
zu-LO3W(N<!_{4GO5s;qWe}Uk$XAr;(X<sxAzJ>V+&d5SoQaU1&GLV#=kMx2<_=g1{
zH7OD`rI*lLcNyz8q#=nGGk&fuyk|JVKGzA>@wNyk48iOpD{%b!hd6y<KW+6seDlS}
zxXs^ki@&M-##LPU^i#Byl;Eca-FWf18O;q}AtX=}bMxJh8DW4O^FwiFV>0H(>!2_~
z7t_OC5NN55tf`jRH7^0%r>7#DE$^O?cx?Cb$3}Nse3~4J(;-3F?e2zU_Qu#96^7-L
zlTm1&gLKnE<XJ7objt<EwNA(MDFK)h>WO?OD<taaB7C9_;w>GK?9FRGB^K5$VVGbW
zfRRqY=u3sy+ldO#Z8G|~xzpY{Let$D7CxS^_hjpCOABYg7TVqsP6VFJpOyV?qf--5
z)>4P!#wrxom7~0_3{|zosI0z?s_Gl4sk_7TyNr9NZ!AY`Ln&(NOHk8Tf=a$t-EbGB
zH8)VvcpIhlH&EVq6SW=pP~LnK_v)^oy!9rETd(78>s8!oxr)1t;?^4|k!WT#-ax4Y
zKU-0Mi{Htpy@iUZoBaE?P%VGEsst64B`B{bMp?Nk%FFq<yi|#biZWDIR`9*mN>o)<
z@o}YcR6SS4^9nv!{xamef}g8EX;~R=-@AvKcZ+fD_C4IVSB`sC^|)P8hb#9gaQRL-
zez;qS>m{|gSz3=<jO%x6arJf$eqda>Q-kmC*5SK*^|(~pgsW9;xKh!IAIh3>r}jST
zI)6m_;}>Y|euT2dHe4;Q!}kQ$cNI;zRNaBA4Uh0$RTqA!dxUFE&vCQ;XWZ)i1-I`1
zhU@o##nt=2;D?T%aH;h<E;T;H?e=G=dHfsNfBr98fBXY?>f3O=x)E0^<u{b#^1Tw=
zDlbQAbtOv5icws87x#F*uHL<YOE<6JM#*iI*H@ytxds&td{0d&N^8q-ueua>s)})^
zx<t7b%3JGsU(}<xzKZ|%EtIhQ?^WK$ow6G!=CPFJQc`sX<-Avl%WvVvogZ-d+PC=O
z`uDt+Ebo##xKn%^H*a6Z)ob75@(<tO^5t)F{rVN$ym_6+>$rC924B00Yd7xT`yX!N
zt8cI3^NT;=?1jtt^o#G6`0PtQzIX*6eSR6Ae|Hb3KfeWs$w}xv!W?f6w^WjsC5Nx~
z1P{EgH3j{20tvow3^k6$D2o*IGPm)jjQ7GMBR48NH<%EDaY=bdBQWe}ePkDPnJVNU
z{d%+oh6cuPa&#r&RA3Xp{ZrlbsJ|XtE32Xu6%}Fr{P~K)Teof<HnLUTx_1lqpV*5N
zY%M-yt8(_-N4WU)m-y-u|FO%Lu<Fbys<3o8ZCek=WBXx$`UH%R9ERhWPho%RG`!A#
z2)o_uVK=V;)+x~lnvsFftT-g4#Um{-4!MMFdP)j>rubm8w;Oh>T!gObn`mpef(=`9
zuw+XlHlNDDzAu-vWln{6s1=Ilr=XlR_SgSB#1B`_;@QJy;MZS(Uw#Mv{1f=?7qr~F
z4gCHS{`|2UKRvC%j;$G(TjYvmGkvgYc@z$>iN}#u(I|}8Lsp;x0!$6yG*%l0zCl<N
z6oKXb5!f4%f{&szanR2X>%8@`(N`b4r?_B$U@*3a#bRS*CJJ0)kY%5UEQc%<dKRI`
zD-VUVNHYRlFw@^0iH745tE+){D(@t(0Qmbw!N4sTL!73fzefb#a}8lkMPFB640QFv
zFn3o>@Nj`KuZ2CWt*mP6Kr3NyMbOd8NvE=d6IH!~H3B1pQQlO;3Rz8XRilQLp|-IS
zwe_W_t1m`FLm6rb!<t$Gua0101*xeQ_*L-8bG}~1_f-o78;eoXQi_HSzOR{JY9tif
z2qi{2)pJ=(i4x_lgcze--qS=y-EaqGjEZ^!Rv=q@n~=Rp*xewU2tiiH%BoUAung5T
z@>?o+PCyfQB_;PzR>tqEC|3YiTU(2|x;iClYHF0{YHC#A`)5>DRr2%YC}yK}r??om
z2#f1?igB~JLV@0`@>;@<z`IjMz?I<Uy)sthDwOj1GD54E5WIb_ngFcAjnX<?<>PDR
z{9IKNZdEkmR#^j~(8BA}iRQNZXl(94MRgPI5_)&}`t9;Y0;qvNX~x~kcEYC{#f;lk
z9k^cEhMRSrxYhgsw_6|MPU};Ycm0gAj%TQQ_!GK*`Zu~?{0mLpPjS19Zie4^yR;fN
z?h@d|YyfKb8~MN8xpND5@7>~W=aJy6sH;LvQyr>U&ZUH1aV3Gpkn?huaYa2F0hV(q
zk8(eh5P0`^&F@yQVXL}_iW*+$8os~sjsm*d#n*7{)+Jt->%4~7aqG^tmm%jj2s~9>
zCv2|}d_P?M9@nn_pg`~D%^SFJgRkGbjq5jVDe${|<tD!U{u;jg`Ujl<@_U@S@SSoz
z|J5aY@&(Voxrwvi+=ZQ2tkS>t_Hc7rT*==f_{O{7T@6p#+$rd<8-&3I5g1__FYD>~
zQr-E$&{PzkCrmB95fz_<l*~LOF;kSDOeGSfr$A5N2vdCg2)e)3T@UDbz}J(SqKwoQ
z6kzV$xmdDf3D&GxLkYVDJ9qEEz9YMF{KG>y^XUipgckMU#jo(q<*WGM%Wtsq^ci?8
zSp=6I8)0>17i>Q{4zm-7;QZlf_<#5*0`?z3*yiQ%DoBJ!N+=R$W+Egl0twl%RF*NA
zmJx^8AaBG6Psa4L7#!QZ5p}mNqvrNyG_Xawe)A&EetQC+d~*cHPH*S$S%G86wxN{@
z1%Et1J1bdRYZdU{KY$lM<6qqm@cWbd__4blRo6bjk4@j9@`pXRczh-ftPH~ZWK*ol
z@x-akStv>|MQ(x-;sXs4=WPpLZA(OIPsUu^Y^-(5#qpRdoa65+U73mxldZ5LXd({B
zJ2431z)UP~oR1>Ag~+l^L7sawW(7oIHtpt&IDgEDb-^@0ZKT+bLW;FMt*$##{DU!h
zN(i()r(%HnRJ={a_x6-Xyh|HA&^-{tT_-DPDe7*nF!1q!xtBYvot<H8X$MCudpOwH
zvr(g(ceI9$vkmmk^x^8`ib~$x6?K(75)KW7L<6C~N>I(4TnVbb9>~=;5IO=oMtxHS
z>YDk!1{L(G8t<XHsTj2cTMeVCnH7alt85~ms1kdkf{!csIprv?HLx;C)UjgL-c_Qq
zMil04Dm(&^&?_savM9ZygeX3NUQG=vF+-*J$`s|+13rl!)u&Q>e}_D$0FSV{eTRzV
z?mYrb1wB!Ew*(+1mAHJnl+a_`y-U@17dMIt$kGZFSJW^F8UpY--+QI3iYl)XKU7px
zaq+o|THGkF;p;UhspNe`5LOU$ckWf;`t1sSe=VU(#mM8$@&+~#52ysG2p|58`nJb}
zP#f-03Er+GKr5SZv!Vs}YCEaIT2a;d0QZ0T9glwg18onVvcYLVDSu08DgPdS?`;CQ
zqOuz0gnBU>gW?J{F11v>4K*yQdMb1RQ9xG4YgxgvA>=9uy=sE4nhLO-jY<hsp8`KN
zE~WBV;8=beWiP>3M(CAR+{V4q8@P7!d)&Tzjg7!9mJu5uMln_1?b|mMuw5tMt`Kxr
zZd@k#E)#5*aOKKnD!pq8^i+zEaa~m34JyKG`1;$+xNz}1e172@CC-x*=P!Ow@G&l2
zqjQNwpW%jhbBH0{8fk&I$JpZSagOM%E{l<Rp|7?-2I_@im~k|E**mMCXKd~bBXb{^
z5PIP;nMleY^r-ZdoEm3)rL$O|XP{?@K>r{`>Ggo_e?w1Hp4i&l+}!_RYh^mw&Ye4P
z;J`i{J$?|UK0K}rSbX;R7x?1)t2lG%23CCd86vmsgztg<a6NtqW{3B{oDq2T6HGmR
z9Nud-V(NlLNS!+!Nm=|m(-M)9mqg#6iR_$s<Pm%+v3|6h@mM`C3#a$2LB*BN@oRS@
zp3t&2u`>Me%TIXl`~hY8P29b439U6{xcuE0Xkx{BNF{RV{4tc3TtN*hRV!`IW7?t)
zTB-Wt%eeFX30(PnGp?OqiG|rVi1QkT-HXC-Vc!yLpO=FCczeuBn~YFLdnCJrAju>Y
z%ROgcv*#Qf@Gr#IxzlhVHwXt~CgDJ=4t9hZ;c#*Q)`x^+_0$5|*!jpXnt^=hcr0V%
zu_-$gOVb<)Q)A@v{V6_bh;!0J>SP~8O$mdCZx{?F2V*FsH{tgdANQUTiuc|8F^not
z&BY57oZX;HK$?1a!Q9mi<~H`Uw61U<_>{S#&N7Y43I<jNP}3X@Cs#Yv6LfV1UQHdL
zAi)Zvz>grT=FMMOOV!s@sYGLIjS{tu6+Bj>zNM1tp$v5`<x12vmk@fTth@vy)m<e)
zSVOSY5RldUOcg^QSk=tO@>w=G6^&FURE(8WmlbtXjI{*j%cx{U6Qx<ra_h0UqWY?9
z2;90FD$zOuu1C%N4LMPHJ@L=SaxT`ljJ8&wcb7mD=-n-*y(=jv3~2KSsyhVNbt=2-
zrB%3ET7oN7l-H>WZ<W&i^0iW0%Ho<PTG(dV%LZJlX~0#+_1Z?<tf7*lTD;5eDXySZ
zzE^|0r46`M%<mL+NI>17Lc7J+O6pn&yeD|{;t#y|?Z3GH_-9_{E`pyht8PXyfq9SM
zE3R%KAgM6xn^E1~N%*y)s-=zUtceX#E9&c8QC-!@>%ypSWEnK0vbh1Z9nGj~BXn7=
zVsj+~dKGd#c^%7nO)IK+oe8>1K3_%13H)kU{*^5MQocu^SSo6dQN}1&g6f<gynF9D
z?iJrq8YM-ISCuQu`|jO4gdVLc;dbT56<ob_m4Lj4%U3T`@d@;<s-Vc%CFH#~ZrsGx
ztJiVq@)dmZ?Im3N>RWtq;VYcKa1j^z_}s-yw7XZ}<P(K=hfcy9gZ1#{a1*>a$_j6(
z*-`D$=Bm5ny@@```g=nR!qLmh&P%bhMrK|xwvdc`Z-hl>ATd4n|3FXH1k;~nfS^Fh
z&HJCw>p6<$O-!UM%gVy^>C+XJC$@I&+O^oab*s`@eC+5Ee0t_Xocr`Md?CcXd;`ZX
z-@=N|zeViP58$+8H>~&W!=#NHV7_S+qCWTlX@`#@aqR|pB%~l~<{V_CB%vrf4b!qx
zFry$Fd6`s}$x+Cr;#;{iAKTW?#=#wnaq+`l=&ZVq=WW&a?S32Z{73xp=rR8F=P&p#
zXj}jI2|rQAUA}$^7cQN{H<wQ0>#q;t)T!+_c3=~}K7A0y7d}Jz#jjC!?HZb^DdX>a
zi7gun;A}Srjy9uk>hKzDS~?f0LBUuvEg#F`^O0edfhC?Ru)=jAb_UMJzVK8WO9;nj
zdBHf7<$=SAcGwYNkG+Wz*c_dN&HT(PuLLae&BN-DY%B}%!_FLk?3w9~C8=5{iXDlp
z@Ue*Wo`mSYDe&@+g0X81#(0KdSYR{;1Vy6v<N);bka?Vb80PGO5st1<Bk;65ykO|b
z2F=Y2W)4((wk~jU@Swf5hc)4B>TC^NJ0pzI8;kzK-iM>36`ENgngt-Vgtc{4NDY5S
z4FM>suc1X{dmCG8m5_6JPGD9?c-0eLjqTNFXruaSAq3khXmbflg0HcyoSz{G3B$VP
zQU#3FJl3`nlx>wNl}F%JQazT7g`$P7lwc(+Qy?esBLw+aDwZmEUP*XXso+;%Uq=8o
z@K{d>inZl$XQ<Bk+o}BI*rW1F2(X?I=-r{xlYYHh#VV+YO1nzcca?Vc3SoGa3hxF1
zc$>hwBleamtfZ!$5b7f2+Hj+`jo@p=HG=AfD710{Q9wrvd!@JzH>z53i|@O|_uQ&!
z!7Z`61ZO#c)&Afqo>95|^y|OT!Sf1&@NQKDZu9k<1m``zSKxTBp&6xZRDsQnxKdq#
zyNwN~YHuf8Yk1unQP<dxCog_O7w`MFAD^O>z`NUAhkG@I7vEn|%Laz3ul(h873fJ<
zK?Rju6|ZkC8@LAA+9rOcPRgAPi}D`n)=(N9mEtZfyRAgo%cvCPO?XzbOsWaJ@-kXy
zUejBI*tKg{si3Liq;KyQ?J&W2opJ5PRYFc+M~i!dc36&7d)KdCBlxc2haaxs+i$-k
z^uEN|^XGByi!X8R%WswbJqNEayfHunZw=MQ8^et7+6Ys;KFShriq(B-bKjph1p_Aq
zp_iGJ8-eFemFEpJYd@IVPC-O$7828PRPzoUWU&D|#ojtOIV#Yb8rZXfN{_u2g{J~w
zOiVP=(o(4Ma+C!>7cX9d)vMQH)21!hwQDyHA32OuADqP5k3PZqa~E*_5-sgD0`L4q
zMD0BY$4y&dy=@27<}HB1vXuy=>Wf;t5xJ|^DU;!&3A@y^L@X%C#gd|2ES{B*{LDmT
zCPZL<Q6{#opO1}eW?=u$#n`=R4jPNU#*b7OKeF=ucE1(>>g)uGhyVWjcl<%{fyaOU
z`8$66*RN=M-i0T>x8dRQT6A?+qmz~Qw}-9xrM(&d>3$5}h4}r~7wCM{iq#vIK+i}U
zAz=|XarhG|u_Z|N&O^Fo92UDw!<Na5u+3*4b_Et-e`GRtML1(;v>6UiwH-)x$G$jU
z?2nJe_PBJcjm*GWUh`EkDOeaZ6-Ay_*qP4v<U3$}x(=oX55TmDQAmx_$K(JfOo>f~
zmP-`Id8cCN)MWIX8iU?`LFnT(8N)ohF^-|;<bu(*_87-9H1M8GTRR1&j_xd9+3?4M
zuy%sItpjXa`MSLY#+qnip!#6E)$cV}+8CjUH+V}!J(?RE(a4HaOW4&m5)^XeYxT`F
zXlf%oXmRVBsR|jQ+8Wwa&=cU*w^pLLvqp)!R@&c}eS1xv6=?0QKucFSnmTBS+Xy>a
zUr~cS(5s^5t)_~U@9eQz5|y;DO66o_lTg7=1-)v%R!3MjHr6Zf6X><JwlG==zsA3y
zS64?%_fq9ahy@nViOQ2>Svi$wRSi`Pfkw4=yM*?&vW_4jsAzGo5qj5XU2l|C;|}fY
z9V)cjCDnvqE$&sdpoGvXX?jQiJ;1GoZo;nvH>(IDMsZaezlR{KY@q_|z;#0J8Wr2E
zx;EUdZN;4$LX0r0BCOi(KSkH$7rb`egj@ryEQ2cY7LT{&x6o4Gtf7r1^lmrR<8nnQ
zZr1X5H#O0QHW1V`RA$Y1@vr~j$-n-<lmGsSx@Qkj-qnf<zQ2MEgFsJIe7!)8Hn*m(
zN@+->zgD7-*Sx-gjTX;E0m^lj=Tss6sB*n%Z3()PQrcMoUgbSrZ+?!du$GV%=vB~R
z72mt7fJ~rw``&F_xp@^o(B|HF+0S?NI@RA*RsY^CB?Nlc2|!g}pV;0@m#FrnD~#|v
z`#JeR%X{J5>#+9-#w+j19w_>Fb<iZdHk9BSZiY8T*+BN(cxRjo-qY|xe{DbXGO;A|
zMCn<0!@|xVCRW~vNz6k+N)A;Xm7asW0zAov7wGBe>LMsO7@^?=Uqq;~AZS>)%(Yd`
zu}w^fK~`o83JYY#l$lt*Y&AA+*iMzFS_<mu$-}g?r*Yx@7nG%6<J7GiSn~O&2-~p*
z-W%7$eBOMFPtStYyoHF{wHHA%=OHvb8_DUJh>MRx?(9s=S(cCG%c+bC<B^s;6|-lg
zVE)WhESr~s&8v&BeeFzaT~&Zv-<(26#SeJaT#VlzHR9I?&G_%LM}*!Vz<>UYKmJYl
z0r;IqK7*hC`)~aE-#_sCfBwK9fBXx|i#+fN!jT{OQ@Hm6_g_52{8g)<YwU`|*v*(0
z`vD>?)*#z>HC7od!%2@tIOq|GlTmK?tiT6niX8CiOe=gg*A*Wxiov$zFsuuR#DTc!
z*c6eD%?W8(n-GN!>A{#EW{DN?CRmrKgDq*gD3Tr6ri?&Rh#_V#%YkQZCWcH7!I-Hj
z7&SEwLjvP5z&`?m{ev;gmzI_`c8HxVM%dUulS<u~D$kNC&&pM`7m7^amlPENo=i2A
zBxFrvJq(;M1aA#{6R!>I1q)XbR5j85HP)e?0BLNlM+=o)Ya64rj^|QActaCNbxI#z
z1!`YLjr8Y<auYk-PN;QK33gD`wU;V@Yw95Q+6lO}d#IQ5PQr=rtE2L(ZRK|mlr_zj
zd>_?WbCsg*%2~0hs47HVs0L7YE`3hb1fSR{Ryt7@Qo-t|T570ZMD0m5H#gzYqld~M
zMo+(<612V&GJqii9{29i&eGPFmdW??Z#OidxV(xguZ;Guny`}ox@z1K&{6eCAD;|x
zhzh(*#doia!S~;-Bm`+G%bFjcyp77L^%-v0KfsOZ4nnO1ck9}fez@C&oy5&*Rfyud
zUCY<%I&iO^K?oMtx1pp#lw&LI5_WfJoo`X?-Bf_b-@xBg*3^d5#uh%VqSYn5YwA(T
zsNr*W%V^oFYti}mF&_W=Guod&K{bI_-qOG_Xhm~JD_T0+(Aw3;V=I++0~;1z4_<4z
z=F-^I@jeg*FO8cL^;FW*wZUTzkCj!t{*_dB71Doqi`VZa;dg@#l2}{mXH_*w6$El=
zsa)e*1Q=~BRi4BRD!uE3pPb*4$5eXKzt^MqZcy<F@Onb}`YzMze*MjNIRC|0_?-6k
z?3b6};1Q(c@4Y=pTiIRhjUh&OW2mWWJ-$)4%82g!6I{?w(*wPX%$;FqPViZJ!Q9Rl
z#+IIlPRJ$nvI#s{^+J+5ZInd^WWJoXjt-^<PbKt%2)&S((2G)_C)PG8F$USRoU>;Y
zDsyVrtd{lfc4OzxgE(~LI8J_e3?HBUkP`JGKKuSMwtoF3qIYb9_qvr(PfvwGb}n3I
z&qw&GbqM7#EHw+6S-F^zmy4MNnV7$_5c5{$BWHRPrsp%VBd~BrG8WHF#=6D1gk2$a
zuAPAo_O3+nr89U?djl_MtA6gP$8V3?f!|&z^!k%PqjON0CdXg-_%}Hx-vofa5OBZn
z<X1Xix?zYwd_rE<*ZhKyPu;|V89R_Xbukj1H(-X>LCkmCf}Orw@v+}}oc2h^<(x=-
zH`@~zS9s&|RUSA!(-NOA3&OsFIBZNz!ST%b*cwcg7aWCMX>r(|9f%!y?%0%RhE?$z
zSQDp?#o-f>=cj?fL=OZ<O@Vz(Bt`{CVq|P4hJ}jCOT-}G2n_U@iosNJ!#uq(!qpig
z*+8kYfim{=hK=_WSU9;-#oIyFvU9X2>}+Kb2}dRQ+|b+{qb80=pONq3^}(;;6++L<
z%@`F_ZdFYL1r<R<Q!U{k04YZcLDbCGS_!4*cG_C8v3#w*MbsSu*GXI4O<PPw*K)ra
z?GI}hJU*zTauJ1Bj^?g1)V3F+u9GUSi|Vk8s-c|_CG=|eJp#QNsyt=Hm?40zr*aTw
zQ7Z!|Qt5b<kaK}w4J%h2ZCQ`9=&`xYw0#0SvA8|p6LnWdMf6YTiNd=}@Co#)Xn!jS
zzI%j*s60_jCDru`^lp|={gsKG6@@2*61AwRZzA|x2<c{&)in}&4GP>!Tb`i2{RN6!
zo>TQb!ri6^xYN*~va*#_Y>e9k-kq8b!mbl{sSxET;JZz0e7BzJka3UjyDRp#R%L%h
z@!jU{DQRdyWosuY8bv+U;x2z*DM47y*BV;d2`O6O$|{ucf30ih?_-p)Ov|X&OL%R{
zUfNqxdNQEV*w#o8HmRb$fiP@FTX!?h>sh{3dyQqNlK#~iHdqa!0(t%UxT;3LC(yge
z>nDAAH&I?8=}jybHc$dS>3fv{jgpc(Y+!ET<}HHmA93sUHQc#-QyJ`#zP)SLuJ9-W
z8>&<m0iN{nU8mi>a^)JnzjQ?z*tqc3Wk@=UqdTGZ-e|lvKohU^n}|0D84!FXcypM!
zvbALI(YAPRtONR~yP}uAsVxjGs0ge*U|~Oniq8#E@mWY9Y-AZ974%fIF=VMJEp1%{
z2C0Cj1Z`~(^kmdFH6<Q-x#<e@mMvX^b!#_c>lR5CJ%-~aPvOj`r}629Px0wj7xBS2
z-y(necDNN5!6GjoI!S48m_7^POV=Q7;W9+z<RK|Di>fgni{{P5$`$i4Z%H9CrzavM
zBN7F<30OQW8EXpCv8FH`TNdVH>(T-o*t`HYzd4TT+vo7Cr5L|<*5Q|q7X0VMkF>PE
z;l-bS-~}P~lwf=EE71MqPquO|Xr-RwRz)YSS9arS^&?!Z`Vp5)e!(|C{DMzTKgO<A
zHCUQ)6^kP;V4mA9EV5pOZI0{kQScgkLYsO%Jsg(`WASxiI6j^pfD=W*SQ6!m9n+(6
zY<3)WWQJo|WDGVX=Hg%~BPj{HlR~jG%^Rl{`s33TlW`!=2y=b<qsVmxW=^$2PJ9^R
z*brF9r$Zw$6~h8!F(Nn-gMFhhWJ&~v_yl8!rymBpcwi{4>?k%&69_!(z(Cmh2f)_L
z2j()b(T3$n;5j+Ua#5<4Y>muKF?ON``VJp}HwM0jR|oXMtAw7pn-Qv7*upnc5ilBQ
zeVPcore-QN!l;SRYo?8DZsW0?Hn_8vR+S2n(cDcy5_&E7YtZqy9`~O$;=yyqvwA#u
zT8GX@)o3FC8wkF7+S`V1g0h=dxI>^v_0S?pk>F{dnxn<7Vdbi>lhhPZ4ODZ4Tmuz{
zL?dA+QBT!TNBdPP9qv_Sv{I~?gkE)ZC0bgV@ZiCHrEl+_(34}&0EYDE-ID=~;(NGP
zN<|`|ph6O*B!d_wRRoOW=@D`&=t)XQy%J?qU-hkB{C^*!sq29Ppxf0=ih8@-_z-1;
zUU|pQw4gttxb+EcH}L*a`uG}OLa&uj>qJRoH;Nm&l=C|T-aQ3&?SCn~V%pSu1fD8r
zZv}46?QER*dQGDOy?d21f=pG)sHDAZXlX+wVSkrUub^tLZEHq3p;sycQ+zD_vz4^7
z(x+D;{dl~F{~WokXE{q>Z3FLxdNxE21Yc8YwGs_f`Hd_eiTZkJc&O}TRGm>(b%!?h
z29I}ntqDaczVfoWC@a0gvMHu2FCpBn<F=%+5OxARIhRx#NvRQgD>k=hK%)nGw{FR6
zFTr>92EO_32VDH-2b{Zb3C>=@c;o%i%2YLFk%9q}@cJObzrgqQNGrTM#twbQJE513
zp&9he>|kQ!1`7vo7??XDDj`#Wo~&P{+PY4lCo69V^d=#2YB0hhL;r$aXqe>dsW$V;
zNKd8$%fg)5MTFFHY*<ePv|~RG9z1~)gr20Qoc-!NKDc-RbGL1WZ%#g3GN!>Mbs9YL
zsqhx8M%1juh|McNZqW?PoG~2>=M-Y~@_AUis0i74$w)|!L}q3prllugVHSZmEf?zx
z^9jA#*ueLlIJ%21#aFm}^AcLCSpiFV1Kesx!@c{seWM*eT<yTQFY0jN%NAVxvJGEd
zXvb$C)#2Qy4LE(O0v~@|iygZzW5b4vSTg@3EG#&U`Kc$6>$`(?b0794e~vFQ&*N0U
zMw|~>j<4gV<IchqoX>Z`XVcwry1*A(Q~i+UVucx@5m=d$iM@H5*qRiB)lt#3vvJrN
zAB<hGUO15Mj#D$d@Znq+Y)a6?Ebl?cbsL45;jYL}O+;9FAxt82F*-O4qa&wbbVvpU
zc|~BTR~UwS1!07zKZd({VJN{iishr@;SMW*KTM8}hQs8^FnhTUoGkS&`xHo7%3ih>
zwieLTn}`7;Wsixsc$4(Pn}bzh?O{yt5Ja>=&8=+pTd3;9auRxta%`nCkQ<&VPoURI
z*tK@k0(aH2;<G|f<u!Lx-94&Pp!e{{COm#YoB3lS?h||+1YaW+Up?b*=!vy0Q><|{
z|4x-8Jo9fzI!q7rMA7v?PeAui=+)AaRntaQRZ>;Fgr20X2=qETJN|-R5BNm+{nejG
z&<XS;A^A>8DQ#^f6;h32S4HKO5;7$+h(PGcV1|sq-mOxlj}+H5qpF$T-}w-=9rvlE
zXi;lhaZ9W$)m=&JbAs<DRCN7}^3ES|m$vsNt*gA}ZcVEKI8lYggrS63TX|g4NDvZu
zrDAcZ@?=z4as?%drEiaFuA-#_rBr*jsPOLacMA06H<wgaql9+1wvqRITN|pI8hK3{
zc#T^X@RbVar0?#psAN<qz!O_rUUg4V*AkM3R82KkUB_$BGO455t7r58PeSrG>-e79
zS}7k@RM75LSKs3`Cgf;!D`X_Mf@R1?sis<TA#d}V6LNP6GlH)Ndbe+jjg^55f!@{s
z13eY^ZYg<um#^GVa`?`Fc?mAw!K$fh{l+UBh`-uTn~nG+Wf4*VpRCC8&M0d|?e)^s
z(}%8!1x##ZQAAGy&>qo=8H8S@0zIiH$|9nyP_o>RwvIlg289xOVM?D~s4|ElGc%%;
z-OI8Gp`yZRShR4ivg+}sja#v2_aPiPd=e)=_z>qlKZno0x_~u@4!}Jz8E%Q0@XeZu
zh@zF4O3Rl}undVgvyqoyg!!{)VBxGhtXVPxD;5?aJ0lKhw1U~$sVF2^i*m9sFQ)*@
z^5<Y>(LAhPv<&lBtibvs$FTR@_t<ylDoz}`gF{<y;J~`O*t?>fs_-r$S&F5pH<ehB
zdJVIaFCi!DEQ%7oz^u5lSe^bQwxplKnuxtvLlw6%Y$5ihEWnq`HsI#2?YOpK6~3Q6
z8`tK~#fCsH%yl=#sadf&kQ0jSF=3eFF&S%;@-QPT9Z4Rv{0_d@OclI6atc118%L{4
zE1O_}&5^n|mSc|3W;<d-kQ$1dhM>sT6#4NXNSsyxr_@4d#}yHFMHm%UKy{aaAzn!s
z;S~op?{JKE^Th}UH^R;d>V%pxA?FenhQNe)xcU1l3tgI8$r9c6gdWS1>dxBM5>|E=
zm}sbpq2q_(?LqJ0wf?W-onf!yz0t4Y{ZX&Ld5Q&99)Ti#d4xs_;nyUvqxBTvHMddO
z(b@{|n%W64C8z>A8C|r$1YQ$i)=FF3^+eQO1Hsq$7x=mfzNQBPJysxsuaWPq?-b~%
zMsaHiz1pTSs!T$af2)o5s;!NmX(Aj2a0H(mdla8yajRscP>uv<B`Ydbo{ZqCem^Te
zq1OXDiH62TTCZB#FaoZ$j1{m%8LhorN~=qy)C0V7+STeNDkdtp5-KX`&%0Ae09Lgq
zai_Y4aB8FKB7g|IVuI?P0E|{vl-`}D$GF$>6lI-1q4NGODDVCerR|SV-ui&=?;_w-
z0F)4AS4{gWQO4IwMd{VGqNKK&f1B2}QbvgDc-}z8)`pt)Zj?8wpm(pTR_V(t<*}ls
zju2}g#OesWR@Aq(pp-VYvWfpY9Zz|E71dxdN+n-UM!X3-$-}H|ln~fepq1CWkquwX
zOZ!?~OVw6Kh`kH}ohZE8+IvdW*Ow?!M>y8V`>FhDYvnn4AIqwqN|Aq8)VPf5mP$@w
zsie2uAn?T6-gpT>v9n@rRU^3)J>V0S*Q54s+`NrzH*Vw7m7BQm^$&3O3BeohjiA~a
zgE#xD<JG<dpDaf;P)}Ka^v$6rczc8eZLSS^jn~%3L<19LjWBZu59rbE#w2ATJ|*M-
z3B91;FlF^)fu1T|MXYTM5)+e<OAyVPS%@Wz7f|J`!<J23v47uToH+g=KK<l8zP$Jq
zj-37wQ`56x6B-TQj6y`_FGkvowTMfbkEEm-m@$1J7R;K1rE`n0Y;GP_(YDUaO+;Ek
zIC68-kV=J@kQj^1^b8ba7hqn&d@Py06dTv=z?yCQvFYGPShDF9mah63>z7`@fz@~L
z*{&{}+4Kk-@~g2h`ZjiFHDP;7CDtb1#NwEXm=paGR%M>Uj>3yLz2pacz4ki3TJ{CL
zUvV0@Hy=gifqkexz7h8h&Bo=eX=wgn89tg5O*N;5{9t>m%S*tTj5I8bNX5>aC72VQ
zhd>igMB8{H+sPg4{M~VWdKfOu_s78uW9&;ez(<7Lheh_-7^#C<9uu%A#tXA^G7yzj
z1jD%L7!x-O!(!)Pc=Q~M3Ym`KlhbHr<1vCD9qr@`H9J>~wRMD!iyLeM0^vij`bS5>
zFDz79kW*PciRGqDq?gr59Bg1|Yfd$<hhY;2<DH>z@y2)+Zw`44eaF6uLF#W|@c1|2
z6=*|SQ-#*HMzm8IbhI}UJhf=;AnZCC(AwFEwyp-WQQe8cYwZ@L$JYou1$bSwl@F@W
zMn%{8xR&6nqs48&<DZ)G^yd~l`iUTVOn}`N=utIv(<%x0+6lroK5na^Wu}7U-;=?U
zww`DqaA~g_UIMR2<%xxoK0T>WGLTVQQ>j3&p+QD(oB1A9>WV<GCxxYsQN_n)w710s
zosyGB@Co!v6~#ogL<?En)Jo+_d))p&0iUR>+k~EyAY4iCQLU-KMfgzl-Il&LLaU_t
zA&>WQv$hMj>hDwGJtp{mMCF5@Q2p=)Y9IWFiZ;H7V7p1XEB$<T_&uWV1cqgeT?8H>
zNEnvzy9Ij1RGXqYE2!@3_?v1wx>459ObGM$D$uK=Etft$+ExOun6|y9i49f<p<FLh
z%xZXzoB2Q0s?;5?m!tqyQYBZ(;1I8+08dnWGcB$3*$MEfYVM$}p%@KKRN0J1Ia2ir
z?5eBpFm4le0zOfBe1DawygNLX**oP*W7XPHhnD7AmQxu@sq%WDcdz)S0=~QVZvLhG
zZrl{$T_xZ|-KmscPye2h&Z0o?7H-_SgR9qX;~T2I$pMjgr{5^N-iP4pJC+t#osIcK
zW%Xqh_zdyZFjKrU(h|L7iEZ^s#xSyShMAo}&mM6pIfzY4Q~LD8-U{?ot0>Drh8}`L
z1bShr5nNWVh=@o;N5>#JIR$xnxyr(zOBOH0#trMSed{(HI&c`LKRAOg&R@it4?jV{
ztobl^^M_|l7N%y+LF}|8NXuS~{LDotOs1-vu?WlN&B5};GqG&JG|VDUrX@!pFEs|4
zSxJaZjYr0eLgX))kD2r5Vd0Xcm@|JVRxaO+?W^}=|Ek0IaKkBF+<5`t?Y)At>n~wj
z&KWFCIF608zs9lErTAn)Db5u9fYXKNaCG`XoL;mCUu->qOUIAn=DCk?^UJfi^W6p9
z`Sx>^U%r6aE9Y_V`~ehyx)q(*j^kPRIefZv5i*1Q5a#BENbex5C|rys*)y>)B@fA7
z0Z4ImLz2}btP8WpnJjmFJ<A856}eE|SrB**_;i6ccBVLC`ebuViwQ+SS`J)Ni=dM<
z3!@U}LoIzdMn=s+f1h;pbB@MHpLh(L9El;WQ!v`j1v-wdF!S<(^VA>(cA;@Gm>lTO
z#>xSfmX^w(g|l=~u<T?yp(Pcbfteo0YLCWyBi~W_@LuWP3-1hn9Rt<h#t@A+FnHXn
z@CmeLOHX^+Mr+7Q($&#Si&&5LE&{HriGXWnMN;+SwLfS;=c7i#t%0_cuiq#5x+)aV
zbv><BtgJ-WlUh8Y(tASaJ^rNy?aye1A5l$G@ijlBq9Y^~SW+prb=RPa-`g&Ig)LP|
z#c7nvL(sKI1(H5R9tC<bnDNhn3}y88CG;e9MW9z(Tf_6l{{g+~8iIlLw!ErZ>BAFC
zD<iPQw1wqVOl4vp>zYv8($3%YkSg{O&$|c?6@>2Ayc~d_Jyq<i3{Fsy-KFX)ZSF>8
z`xC0U$4ctTo%(KquZybh0ZKZa5PXE*BU;|hCsc!-1Y@fba$eeepPze(G9HT+=!v~$
z@NbI6Evpkd-HwLthp1>Kgk`{mpu1O9LnzXUODO*jZ9A=dr3_+F?bf$8QAIbPriBWw
zy-8(fYf4c;J1Zem3F`^G=GJPow24BN)F0`uE1@cuC?*8&DpB8XkFUvKlBl{nO7uK0
z$J#nRCIsbmvAsRuYm$rrmXknFmD~3>_+<8mguqWVxIxQHtNS<f1cW`%yP?e1xOw|7
zuHU?aAFka-SX2@Q4xfOxWL1{;N8`2kd%&kj@M#l#lL$Tog3koK#;EIJyteFK#0q5R
zP3SoyJ~a<9iD^^^e-C8z^y_KsOd|AD_BJ$Jl9YRvkcvf0N~*HBz??a=v25vLY}>k-
z)^-<;9zH_oeTcK4eohGPhNIJDI8Tm+cX%!Wl4l}3br$XFGUO)|U`6gcEYF{X)k_v&
z*|OP~Ju3_8NuemDwJpd@L`-}rvS;TZWp+L?=FdP5)!>rVD+#`pSigJ=KH7c;m-b%3
zjh&z4^Tmg8YUW;?UGgD5TXGKP*M5hu4;1604d3JJ(u??P{>S)g^@sRw>rqsl{RAC1
zzrd4*@A0(j8ag|!;K8H2c<_*RtLj@6U-<}4x6k8Y$py4t{RG!OJA@B+Z$o@QG$JR*
zBPBc=*>PDYOi4safIH$`%&{cW9ml8p;oHSQ_<p`OzM1Wc6B$<6pKgl-1zuQ|;EdTZ
z{zwcBLsaH0IHb&jLC#8yPFs#)iHk8HbOzq{N+ak}G2A<r&<n>94_}P&@P@9d2ONC;
zF@+6IAQhf$v=kH?3=a=CSectE+Y>t4vh1u?%P848+Q8V#h`<|zexu&QJ0o60uR(+!
zqxUF6PvcDtobU<;jOhhWKXcyrw5QE=R994VtSGIl2%@~2sPr0WeVRM#UgqGjl2F~X
z-j`8YNsl1xUMf4Wu>w39xR9efC(wKHYb&~5G@zMkujwJJ)T1i2Kd#~X2wHwm_agxw
z&pY_NRGiICtc<KUN=U_|+LKXafu3S*rN2)SX{lU#=HSZ6tw67h)=r?;)YQZZs8W1#
ztf3OBmVCW>Nz)K3NXQX(m9(?9RClUqqtzy0Xd@fi?<@IoHO-xb4x{A()mS%5>N{0R
zk2dsPV>^LG1=jTh&4gO>!=DHzs=L-le7!?SR=(SEA7x$7P|ep%3BB7?bvFpNTau<i
z(3Q14LV26m-$%;e#yx(ngs%zs?nw?`eJlTVrvkl_`bIuir{v@nQ{9zRQ_T_P0zCm<
zHQ`=bN%+z3E6{7IQ>GPmJm^pcDr72QPe=+%qYP59k!fZKtOR%hHyO2W7KPV%kJqu7
zs!f!(s5p5~u>w1RVk2K`YA)gD%Lq*Yo%Hny7zKbY`(g=ONjFl>;7}zsOMag!#O|t~
zr|Rbu=&90JWHeWfN)F$xo49)I8sBr5agXpTMq)}9Mo-YidxO;Qc0WSzy_ewYHvzBq
z)1af#BKRbg#ehmr-4GM?Y*`_EU}VY4WaOme>BS~yLfNrOfu4<$uF{jT5*!+$*xLvJ
zA1hjTco^d26Of*siRse|l*I+stXYk9Ygg0S?jiIJ;r!Y2_~gUS5FQ)@8~z>dDe0IR
zIg1LVfXZzarc?1PrA=L#KNA}lEWnCow5+S<Q~gBK&c<R|W)kwU5|K`un>Z~V@dfe7
zT#&1*_ce3g0xT$;hl6W&;gfC0@#%&W_;B$dT-$aLx3_(bJB)k#E}`l;rT5_ye7@{c
ze7Explzny`r5CQEyP_7qJ?aMj^&Efvb|3ifWBmDhCmyyJ<KCq+sQlq0+%NeGPf9ML
z^4vb$JiZH`ZQ6w+EA}CdmNURH0>SQ6kr@$!g;@c}4$#5U7&9Emb)r3<itiS9;zWu*
zPG?$Tf4mh=&Wy$i{x8|d=?G28h8r!tMb<JH<!{8qjI|gMF%Luh@-fsW6N9}H7*UL1
z4E3a?_4J00rzc#3sP5R<Ol5?;>}v4#@_?<CC7f7hPBwOMWXRlF>B}>>F^8tUCI*ff
zfY%4Tssi3H#*kikd(<l!s@ofbHDAL3wO$AacOdjCsPw4f#M-vfB1vjWizFY{p^@Ng
zf6$1Ihde%{Z4p~bl_u#aGEyrx_W5t^3giTK{|xEhd-7`=9{$>bjvuKWpAwdj2~L8s
zxwD+sxdPn}_&1tZd78`k8wkWUzE0TH*RX=IGB)vCMv=wZ3h=6`?)|0oYH8bQ#NyV{
zX4Be9pI$?QDqTf@CyK8}=~WOCwGEAEXlYf_J7mPOhIX-@3aXAFxo(y9&8ncKEay?I
zWp#50YTF*5w*3)m7?lKF8C76q#{+`=N8JDU4?O(o5B{J3;qhN+y8n!pwG($5+Hr?!
z?lzTRDZy7kt6P5m1&TYK^0@~pCD`_a3h+6KsSrz=A1P@pGOAnFbYFpAX=4ZKx*nst
zjjE8)D<SkM_}i<S`1{1(Qn^-8!B$e$ips03t40;A?LX5}S~^u}C_Qtg6-&$WmR6Pp
z6@5dKKu!8h1!N@(<l0)<I1y%Tt>x%w7ZojlD^=ju#>Z{#6%09N8Mg8BFJD)Ik!8k)
ztEbE)WME1{rW96G-g^l^Npe<&%%>LciPgPx=cY2M+XFvQdP?8kE!DyVHwnJX+<X|C
zSu0yg3i#f9f27jC*8{#D#rOK4N$91iV*?!nXP8?1!`RXnlZ;%Em{x?Cge*eO^&ila
zJUuEsU2PTg2)c-fe?l)_1-<-2LQl13-i{sHuy@Z6oIG(99~?h{!~2iIno4qtXAGuH
z&Ou=COeC|iPb*l8xrOtwVir|iVFA`JnU4*dS0QiqG=#*5BT?4ppPr2j9%E7?6zEM`
zHXV733z1bg9Wx8(VnxAXY@f3ZTV|}qj+q<r*#;`h)u&K#^eQ?(V}&^R18R?5#@(%7
z5Pnyv*x0)Bg0=9*`16mS@%x`I@Gty?|KMjleB6lglJC$^_B}c)F5^kn6|~*>0wq*;
z7j~?`2P>A~^s3$1w_q1ysK9*fJP_sOf?0{)SX<zZRPVkxI^O|j7ZQ5&op6T0+aEaz
z+oCM7F(rT&I34L(bKy-3Yn8kZra5b&o4XlmNvkm|cs>SrPQxJAbPV@Q#)!!=j1Ua-
zoJ@u12}56BxP*qnkFfI%no3z8iRjp9O!o04;H(KeTf)v3&UOy4=TYWMTUc8{$4D1L
z#ty|hL*K#cLtjU)!M)H+pvUMv_BD()?uRi(eK1txRrrS45<XSvAly1>ZQDCoQC<#O
zG<7ym)io1z1lGfPf=ogbov1vOg?&aPC*bQ*dYw;d_?jp|e&-`v*Jt$v11+;akFRym
z`bsWeTXz-i^ShosClLAf+o?7MdfgrT9vM85!4z69v0h@AdiwOl;);Thj`9X7nR-_8
z`g+MN<nMk7J!SCXe?hOjvYHmJR#AJE3NQo~e2p-vCP*p>L9w|K${+zl1~w`i2}!EG
z8Y;iq_J<0D>K{Hs%d?*p;63^6-}vzl-bw!gUi|(aJoxcfG(C7q%PT1?gjvU9+?7QH
zx?iC3(XXg@_zTK-E`5Gw9nY00Z-0)8cEYmdF@gFBRkX`h?Svga+wg!&k<hDbB#aq#
zt!-4I9c+}^2<#SyfSZ<<Q75Cegq?&WAGfeP+q!t|AGETJskV7tWpGNOy`vtT-Hm9K
zG$b~1Eo|i4SO%R`+!7tq5Vn>PdKI|eU5oqo|BkM%Ds*;I(RWlR(aQG;D4S@F8wtJ^
z$@3)m8eR@;NWP^cHw*YAk-2A9hccCo*0-$up3=uB%1;(6C@v-hX?6b|;J8i@-ci7J
z^VS_?<xYc@og?&&Eih`l4&EIwS{c~r)q4cNH;&NLz$-5mUoTw)7eddC&<lit*%WB$
zIU^};mI6Ind&%IJlof#<m4lNCdLgv5J<tmamz~hUl>IL<GBQyxeFhfHU##p?yJ5pR
z9N51LADlRhPtKe{On59DEj{4r90`9andq3s*tGdH*00-xr3;o|;jAJoUpNyR)-A!(
z)r*jrlZu$kR3zu;A~io7k;yTLPK`$P99r9ja}k?24H2QS$Ouoz%$OON8#@oPqY1#w
zmDo9dD|XG@fG>BRM#H)5s62WRH@2U`H=7Ouv=;c|56XT@blSF`{(OQ~TC19tTc~Tf
zj<cWdMRny@czVAW&pPg*`ugYi`s6<BUOFE;7R<vZ+jnEvf;Ct_V?LZswGkI+fz5LQ
zad256riYKg#sXv9JQ;;kvkh@7ZX8aBYvWjqGxq1DAvP=#-tlwbki7wu(l<bhN8Q{l
z&`er~QGp9E#JLdtZ89->N<N1BBw?s`5Jr3ZKzFh?tOESt85)7W$QT4i%7*E*#W7I`
z4xXwkyKQZ021kJ&8!Fk!%+XP%5jw!o+!UiU$D;Sp_t0yA*xFaniwaMm*J~)1p4zJz
zV>STl)`Kuw{~h>6+M$&IYVRWSSV1~u;6nP>6e~-+`+!P;H+?G=S=STE!)sO)oYHUi
zV*@LCJDw7151-egi%L#Hz$Z3W6kQv^*UtC05{Rt?Ve5T9N2S<Cm3RL^tpdFV_v;ks
zwXx!L(MosAz=`CDiM4GJyDEJ|D)1@M!1D&a-pESaNTo<?s~Wlee?zadyiy5)Uy1bR
zmB}DRl>$E*43JS=nU$f8<Vwi<_<VV7qcRt^oR+qnmbSY60U91XN5@aU;V~ii<G=n(
z;QbfB{`p`0@-IFn1iOFy4UG?fL`C-#s=TMTM~Ic({|Qx(e?#??-%-P;WmG@-1(n@D
zp}PBLRLQ^x0a)4o6jfA%^$%X4;qgzXzyB1~tyG(>?Wk)b?AqEH1Rq0ySJx!!wONUJ
zs@*zX51C@7jK;S$p-mdHZk8(>ujY1MD>fo6JhrlNk^VeU^#VPywW8LVMbS}pw^8YJ
z3h=tBSstqBQW__!yB2vL)ty*iMw<dXeqMo|SmFu=d<|lA2|g(^vDC6+k3dhgSV2$x
zQ}xNYtfq695WKC7?q0j5N@tM;3~t{RwRaO)*|~6ZaZ~nH7Vr%lqltI=kHRaxhtcYe
z!R!4e;FZ1;(5tVyK+hT4`mPG}^h~^=q2r)HFFHQ+|3FXH)YH=W|ASsAp%?!Wda}FO
zQmkIR2D^6c!2bO^aQcHIShs2=T<zW9?KBm37P9K{Y|LG75J!&4;Kl{ao-tFgv`d%H
z!kYDqkv}UBDbq5MIlBl+1=9f2KrFuyMfim#$01`zF7g-5MOb=10wOXI>>Y>1$*IWn
z%R@FnS`a=5%Lu)>@rBqlo$yVYfkQKv;j_h?aqsZQDF6IR;LqRipa1-VU;q0Pez^S&
z?MkuIpY;6qHWXKWhuWrVyg{y`r1(pG|Mf{6*s&7ZS1rWh&FgSx&o1m-x(K_L&BfyU
zI4md#!RonwD2P<Uj@f2dKV1hOZg#_0`+V`id@EX7dmM;y$ClJMgm?wPJz)_n^L9Za
zdmA+J_d;vh9*l`yi-BIVFu*Y%Bi)Li<~1GT{Boc+B^K&FfiU*>hkLkesvL!&$S6z=
z4Mz~wUTj<(ffocPC#pJ@QO`yzj*hZ<w!JcU+QgcYUrQ734j+Km2KB}(gWn|ZXm160
z!&T6Gb!0CL)O!bN=KY{%+y{a2PG}~G+S%%NvSM`JZ{oQ?uZk8_l8@^Z=t%;yK=08D
zLW;Jw^HCKZJgdczw5~t@A>eDm!ygGOJ|{}A?V$>KlFHI3^KQFX5hS%mqC;#ERRq<-
zBdW(ow9Xy>fL<4&r|Mjn%2-W+{hgewq^`V7i4f>DH>rRpgBYUr1bRKdlcPYdiq=+A
zRf<bWS*av%tW-%%E|ElKdHix<Lq>CzDQ0pm$Co*LrIIgK*`Un6xL4n*vbIlu!h>J`
zB=mrvsPuj%@O~5U{Ruq&_1|cDN}xTUn!Ep;;8R6e_m4^%OVxv)QA^cVOVwA^^#aul
znZ{N@yDR;BWi7PR51ym`5#QfNyDOHKs;-9CysDAboc~vKBkeh1UtKSAqNTsG7Rm}a
zGPqM$MGMPDp;^F2g(qo1EY~I~Jy|KIg(^?dkYwbxPV(`TzB-Bm+Ejs_SX@zh-E8<C
zbh8m7{8Uj+^;gPrqMB>E_gAzw(-uqr9zWkoV0Lw|p_An1dV!wUUj7}zuco#{0iT3Q
z^~q9HcL=x}EFV#RcUd;Kl@QA-qr2B{Tv2SU%-)dxJsHi-%FdxEb5`<EEo|&DQO^WJ
z$4tcA{YK%{J|pq!dun*K?|8i0kI>VaWCtBXXIR)%>6v>&bCNw$GG`$wF5@rg$^HXk
zZDlH&Z2cJWaxFappFl4%Dh!E<veQ{M3JYdq(Zc0adK(nz?b)*hM-T5odU6t6?c88%
z;*JpiOl0J&!P>2#;K-?OaQxIcELgY*%a$+1YO1^$bMuflBL_LNsOV-)N5ZrmL}g_l
zCMOd`%N8M{a5jR&axv967g2r%Nc1g2qE{|beE6B*eB_7ZpwK@R+fwJ?XvTcTY-~@>
z!=*#VP<HhSUOan%=a1X)@86%``O`L3SKY>wXPs>2@8Cu;r8DDJ`3;;o_W^e7-GJ>o
zR^iyeE!ebt7S_$r!``(8ICWqewyqAtig{s34w;12`9?Uo&K4hUx53$+j<~SLAG>FJ
zV@+x-@}n~l96t@7d8?qAvKBf;2cbP{FVs@kVVM6S40I;&9A;p+*BlJ@FT`M<WQ_6*
zhoN5tJVFV}s6<3W#UYI9E|?8WkTUOC7J3MShle|?X<zMR#*ECwPzEhzQy(W7T3Tb=
zq)F&EnuR_{RNh+z-rIO}1doJWuMsc7M*zM#t{3|2zK&t~Z(&NT0~+oNkO&?^uj@e*
z?NBZ4Clx^#q0+^QLrc@j=enLq4qh{1C!@GM&}*j3>7e3k#j{_U@%Tj}l^|{KW0kU#
zbe3i+$+icqJha8#w6t9WXeTYweI6fDLH_ux6%X&%E7tbDz?W7`W`f9cw5DcRK8m1|
zj`5fFR%T@MK(B?CPJq_~Jy|f}pU{*3J(-CiQ_m!1rDRD^E-n-3m8sw-fGZ&sC2x=C
zCFLq5C_t3N-;zpM__JPto-6`T($YmJJwxY<-|>{d`-#94rT53n>5Web!qyl6qN@81
zm5lQHgdY`ONgIJCwv~_TAO4KG`!5t#C!@bIe^*9vC5@%Hse>S-&Fzvj7nzP$i?TY}
zZCdlPT1i)_K@~w)OJyg(lYTr&Jg%&%fUJxnA^n|=jn!ywp^B!`>w47AMog?P%Y@Kt
z<9Q>&RNqAVN$5#3dIuqQzq^jgyI$F9L7>+ueRx!M0=o`rsM_TC&k$8uqO!sS<$c~W
zPae??$bG}_ZjuCNg0D_;{Rmkl;az5bP`Qikl{ve`B{HS$@6lbcx&l2ps&e`M&f~j%
z`xbI?vQ+|2Hi5EtfSEN}rEi9TBQ^2*dm|N_`^x)jRD6Wq_=zT%WGs~=0QzR`gq|%@
zvt}VOHcf$^y@R8YvLb^Rs%dCCgr2OfqJmy{L<pjy!;zFE%Sq*+sBkW!w*nhCZf2~*
z!Gk-ne(e$j`AvbHxjlj>$00Rw9%e4siLHmez=1Pg<KTy9v1-ExELb!TD^@N-eqk1-
z&n3L(7t-2hEA}=vD-*G#`ShjBkdQwIQT&e3u(=2inuG8u(-BADCHtf!Gav=2UJ+Oj
znuf)J@mLp<h`nj~_;?L}%i1-VJ9`Gc{{A9W+!b`U*WlUXPTrFL!K25|*wQ_~4>#`O
z%=s^{_t-&f-M10@j&H(|5BS+bi*Rt?Y#iA;16$TcWAS`@<Ynt2CPEv@;hH$G%nm1(
znc%aH4miHr2^;3QVNO9f(z53ublN8PEk6vs!mTiveFEy)`!P0gJqG&DLVw2sjBuHa
zF+TG!+Lz$-OUH2Ua9D)L!zU&Y(Q&DWk4{8nXcR(&LuqNLvZKOz&7<M!Dtpvg6Lzwf
zi9M`nYptmA%<Sx8=ID%xre+wetBaA7bkJW@4X+Fn@V!p((c->@-m9beSl~Ch7v7rC
z3-4+6f=ieMD_;egc(b<)@E#I6GBPXucA~)exb0ybL!igX@Ygi6T0Ew*Q$ep?0pG8G
zcGBu~;`wiFcu4q3|6R*{>DwddsOTO(ZNj5xO?WJdk&hogtY-yj#Ea)`JW?qVc->u%
zO1@qLRh=wcT1W8pD7=66>508<p@nO0;qPl}QPiH;T!EedugBU->WZ=~JmDs5=*cuR
z0iUcaEFmi=i~18KC`Va!MM+}jYZ9fjx%bLt1_V_eZLW;k-fQkcMdxEQK7N7DAAhB5
z`17xM+&}*txc|$)(e&dVsDA#Z5_M01McpIG*L#j8D!{Iv|AU95WBc>p(a`-IH3VTL
z)m=Hiw}jtc+T2bx*-e$#iQBcZG!#L{qX4g@nl@aP$EqdhsP3xjWIiy@YpWF4DQn`%
z$|<r6O)alkLlqmbHgrGfpxUDamT6^F(K44>rWZD~(y~&~ce4Du?@QML&tLZIb++?0
z+Ex|NsNg!w`MRj_^8XRtRP3S%A3dnYvnS0g&vu?S@^b_xzh4G6WWYqMbSwXE2OG8a
z4we(aCw+aQ_<EMD`X}^6@yQ(B9^gsHAV+R)He|a})&AUcIu1@SwzS81ZDaHuI)N5<
z)XOxM@#r;fq5+I7U106(he;;R(9pG_(kmkLQuzhW3iJeaVwrm8-u6H*LV;c|D<IVl
zm0UJ`^^78!F18XYmaoE|-8*pP@J<v>&w!Pw5hA8WqA+72QsU-d$=XBMdHPFi`rtEs
zc=;OEY~P8cE0&`mF9)+{6(E1kG~~>khV-HwWYFHmr)3~MGY@I=79nBY3WUvAgMiGX
zm=aL{Kfg4jN9I%Ir6Yf8Jm!SOV#oAC%nl2|%*YTVPw~RMoD6*O!Et=>!3oS=G#5+P
zF2EPxom2WlWYK{~&wy(;TXE#XMI7UAI`Y8*>^Z!G;LF8<6InR%aTX4ripQ!=9w?e?
zN^9<b^lSrU<Qrn!CNF%n!4}_caKqU}Q?X@QF!J+b5HYh5-ivm^d&@aktUCeYrAIL#
zVH>o=w_~X79E^6F2X+6Y&<I_Gk+ieJY1Iuvl4*gn5Scv<v8ic@ij77%|BuiR88C|A
z_2y%SpI-ohXA28!E7(!tSrK~Xgq}Ie$(Yd7GPlG8LQm7u61om{(6BMX0BuVAk?#|F
z1l@3gZ^SEjZOof^RqZvrqSgy<jPHfs>b+neWQsZ}vPRypEe{%4G3piIbv<TF|Cp+c
zH*v>fDm$vY?xzh(f8K*<b$Il=PJ!OjpX%|`?=AS{Ujn{P+Pf~i_@fg~erZt#F{J<W
z{!@MrzoVO=lhIn~%X>(WN+o&BzxR}2eMlQE*0xnT+Nnkaddli9^(t%I13d}p(`%u1
zYi*&*BlP~Zw$i^xmDofp$M`4odVu$T0LXLYIX+fkSRu<Q@iC8OJeG?!uB=t^=9C#2
zO<lOt+>N^}-6(B)fSS%nc<|y^{7j`M;CuA*ztH&TCsaM6y?y>4RR8#2G!kx2PkvG?
z@5A5zi=TN9{>pg#>wnQq3*6ZK3=LgRP~F;1_)+Dxck_F@X>(ioH>vIz75rbysagem
zWs-A9=vCBIE2FkzZDj#iv9*%MBfyhT61eNxXtc>dR4dD`QyG{N@HG*5l5{SqNnQ8Z
z*bq`(RA+5%ROPJ|=pgXAI%^cQ_lV{7=wS^W^S+SdLzZbz$m9Fnl~n2iJ>CzGo9L)o
z2tgT;B5)<?T>c(`r}Q84@Ag1X6rXC4LqN?kA@oG;^;lfh;D$_BQ!P$#^QNS(T&KNF
zC-hhsWG^4q1#5dpSlT(mz|0P66n6cFYbjIIUVU#YdTC5DhoyrLY+VDW_#82Dk`>aj
zidfl_2|byS@lWVEDsyjTaRI4_kz#Eb(NrEONh!$5nueK$3$S?MDrNfFh7IenXYV$|
zCq%;C-3?ip(-9t$fr5f{gvn=Ez2_8mfA$p)oWF=u7rsK#y!pt?%BIqrj%-@n^jU;l
z5pC=AX^2Xr;wxH+n0c!ZvurE8@|VFYb0M6Avfwi%1x2X~X=}5PKP4Kg5;L$aGZk|Z
zf)N+s2se8@1Wa*8Y-|`#5PY&h%Fd$)kT#<LxpNm|$C1yl?ZD^Qf8tww{KZx5IdTyD
zkFUkaPZ#5(FJ|D-sW{A7Xo7-y`dGfv8M_Y#<H*q%WMpb!<vM4aIT?-9o4xSy`Y>!P
zOu(E%+U=z)5xx5aJhq>M+1gJqZpKcGP2YqG!5c8jVG$;IuEa$5`55IfjkY!gWBuY_
z8JCXWoI)hz6(A-#1u^llw6!wTZz_Qosx0Ub86J(`pfFlm8#p>SDR$P>%0f|i7JO_-
z@EP$qL`N56Ei9o$o2%vQ1P$H{W3A0F$Y3Jg96y-K?j3^fRrDJ523{Te2417edy^{f
zZLMCg3o}DK0oU}9pt#?l8nn32R-RD0Pv|{(+DzcFQao=$H&t5qQ-bR$Ezpnkxc{7B
zd{#}sHR351-?QIZcvK}MKX}oE4!%$N^d3H=B^Gc}4YYSuD$ylYimKv%M-3iOX>_x4
zw2ShP{4rJrS<<ShwU(CXe~#SB+HK8E{EZ6m8ve4kP12!Ht0&NtC8m0yCv$Fl78($^
z$+4uQSb^O?p;z`&?J4R`Ajpt37iEf<q^~r}vQDb0W4G(ul(^F%vntxr@$4r&r?-9a
zC)L}tUr^EUn0EF#s-9Bi5qvfGpQE;m%JAWHs=z=0g5Cql!url9XrT&hWz=;%r0VNJ
z<Nb%I?;`a0f0PsE6;!Yl^^$j2r|Qe&|0>qDj_1nYMMEv4hN@1w1Q>*$L<1WKSroK}
z*R1t^3!eS@6i<JCtgNQe-rdA9X;Zp0TH30We$<B#o2kH?@#Jwcp0crfLV$@Hf6!e;
z%UemfQRQ`%Qvu2#hxFZv%2PcjM^Sta@3TRpVpOR_vAc8-$_TNPBh?;NpEPVT>qg32
zaxVpXqV`mId;+{%FM%gBIBw$3Jp!=guCnjO)$2bXEj?M8VkywGv6Btk?Fc?PSbJB3
z&mN=I4bgi5m0q7w=%r`o32Uby*f<4Zl57^NYl)Qfd_>14(IcyX*R#TkG9!b~3l9rJ
zRCpk*T>zrPf)E!Gg>-^uS|-(1;UX+uv=-~vZNa83+p%uzR)oaG!7U&d!3mkjo4+0l
z*B-{o4Ig0Bu1^%;oV|1lAAk1)a^@{SeC{+>`e{fe_>zk9kyx0II6@?P=0Zd+SdY*Z
z`w+hR2&T^32+z3L2#A=2xUdD77Pc4*BKfx?W?)G~I@af8Vp(o1Qlni^m@fk@zHoK7
zLI{8Fo`XkGM+<T0+nboPb{q1RtVUFB5h8MOku)O{iAAAUv^5IbPsHKSM{xx7R20p!
zL3+9g!ES>MtNpQWvlBi%;DL`0OvTAPG1##o8FQ9oBWL3#WFGwt;m0q)Vatawntu=m
z`TH<2W<7?u%)xNGh0yg~2kpQGQ1eQMnrAGGLsQ|KRe;#SnMmUQo|KW!MkfZ5QL=w9
zuUl{sA|t{PL+Aws1i*z5vl7tB)|^iEl=8H*wzjZxafKlprXjkMpk{50Q8xBab8^LS
z9*0`kV4R~XG@RX`?e2<!CYqG_w7X;9#_JmIFkZ#$1m2qly<in-f*NI|ly<Z}Zb!>w
zT16_bHr}{hyy5RZA-o8zc3RsGD!tBUR0PlE$d;TIxBV&M^-}{L5qc8$f2KwLk+%3b
zRUV<&Lg+o9N_<3sK4!?E#Y2AoBdSUzs4~RXcC}MwurkPKZF37Ng!JVx1bPBHWk7?f
zMHZ`UR#t+M`MFf3o&3!$0zI*}sy@9&QGL?iRY%~l(G$2yl$D9X`)5$uF)C<hD=Mm}
zG-R3@fmbdHO8U!MP|c`mZbL<b%!#d6=E9cNH7m+bmVGL&Z$ZQTCrZv<%d?-+@JL2s
zpU|4hdUh|U`hG<N)m(iyA9p<Bz4#PuRDNwQ{y@{?pHa`(L<KfId4V>5zU##=yr$2o
z{JK!i|D{Cc(Nf7t4xVb|W9hr2m93RQ3zemnrSqi^udbfgiq}Y4i%<r8ns_Z+C4EJ@
z7VhJZ|NMr3)9d{D#|u1pb{~(Z{GLcWX=mBA;Q;~IO-PBwl{A&E4ywME5(2E0jZ!Jg
zi05r3=x8nGxja{n`vSX{#|K?iEK4du#(g$w9W4Z0V<}o1%annQ4w(TW{dx_h4C$Md
zM03^ZEEN@Z*Z|#Rs1ldO=H9t`9k=h^z-<EXPU&6TDY=7NcW)szJq4!bCQ6-<u?I=D
zlfgV2I}rvh&^NZmh_PDeGhifo>09{1$|)4qj#Htd?|_Lq7D%Tei;hhs^rRo}f3C8k
zttCn?jEW-=Q4tI(zF69=w1k(?E1FN}t;G8E8?j^eZmin45&n_U2#QZbbk<B1EZvSJ
z8;@hvy5rcn`%`>$?h4L-dkaTD{St}OX2Lr(mdb86;<BeBCWpTxpQ@9VIc&}f1TEN%
zfF=78vg8l~vQ{E2X)yu<3TbVZU|Prm%nq4@6|~NC!&9(4Gac*Z<sd266T#t*@C$Q=
zji(LF-5e2-nTsVmj$!x5-(laU7cpz?M$B2a0r^Yjp>TNt=5NTt=3|*ye=rhrSG!`_
zdOu|6IUz2_3K7BP*s>}H=Z;6?vtuFn;9w%Q?#xBex;e<%y$6Yeoy+D=V72Bm=*>O}
zjl}I3?YjbFTo*&ncRh@!ZiCj8r5Np*2Az;JIHycU*z`F_nK=iUg@s7TOrtW6rK*$p
z%&Ohyl&%D->(Jm3_;`B2PBve+w}!K;Bc;3@ENyMra8b$=bZSN>7-eb>b!Rt>bMe4f
z*$K_g6+>-YFw)iyBduLAmhe;ebU+^?O8v3D3BNb+ruN%-TmKEbLg<-A8ljeeYw2o7
zGc8cFK#!`gjd1FIB=)um?X;zB1Xnw4Y6qd$DWkUpo8;bg(B?`A>>mBrO8eGIHPeFb
zpQ$EZ(0WouK7LLWNVO;GL7=DVBp2%>3Xh+q-RW%SceRNs;NNPef}_%FP&&LN7mpQ1
zEbdF_H8j@n`8vhYc69J}H~$Si<@gu$2(N!YPc{$wpP_nQpjRb%dsULpM>||qkLo(Y
zjw-IUsU4L9yy{xqB^V{2PiAkF%X)XRC;$Oi-SGgGoexn>ut~pNRp(P>6t=4M2^HIq
zyk@`gTK$HWCqJW>R<@c^+4WSJnpVa0`p1Oc)1UEBMuDjSU(hDEK6<1myfVVAvR)F8
zRm(zE*6_CycFHKN611}~Lw2iERyk%vQ7_l6Np-BH;(PeG8-M)yEB^KG-|+1DLnS?_
zmDgJ}-<g(^O1p!qTWqcL=ZUSAbQMu?52^4Tbd@m#Ze>b5=&IoJ0=X&$c#r4~9?{}H
z;AfQJ=ey;*S}PU!b<o<jHI);BWeNnFW%ZUisytccMfO6G{Jk<+jae2aRjEB?fdYc>
zCe_|;LhrUfulP37SZ_=%OkrU~=W1=KKu@}VELkV4Y#d-=Z4W&oGmKDEM=xWWK-jp1
zE6~%{vxla(IkIvHnwSKBP&H!P13f!F7U+da1(eal$Z#Gbs7PXwoScT7oIK2&Nu{-9
z0oJTtitW2LV&&T92n-29Qd&Opi&kRE@`G5oWFM9+-GL*=&f(*8mvQFoCCp#48Iyfu
zkQh&emso(fL;^3h2;sT&5L&PVQwvwXzhE_jW^P9OoZU#z--4vL#R&7uK`ud;ADxdB
z-)Iy@CSgWg0;UrjGjh}UJrVE<odVk+AK1r+z%?@oUU|6)okdl6@H8$KSL5Svf56h!
z8!>a%9OO;QL~edOt!y;ru8c!Ufj3g}eGrx8its2Wc)IH&JJl2WwxnS9-W<%@T!7?l
z%Mp9{FsAG}3FlR(;J);8Sfzak<M0!h;Jy=@o;zU}ycb$jcq3isV4PP0ETRkHn>P=M
z3sxa(;R<BVn2przEW{_qAzFYJsoGCCEIblX(Qyb3iGa7K4}r%*w6IWZ*(iM%EE_AT
za0}X4O~P)JfeF;vx{bAS#ux`TjCOQafH%U{U5Q~<E*R(PiBZlr=wmzquV@Zryoa~-
z-lNKU6R(@}f<vkeD=HOMcPlL};qj=6pb~Z0jBXyKPp|E9y#l;W!mE=uw+DJMy{!8u
zsx%%2dNT9k!7nPc*ZmW}gKDqiNgY*2BNasx6$MotRbD3*gjgTx^XY1*svz{bI{00k
zge+}Xr_A}4l~!Jk^7ZuTNnf6_dWQl&=_?{w+uQzv-e1ATJxZ^liZ)fO>Pu_;H|YL;
zUaW0pC6yDETy<?DgCMJGrL}ECReh_Ht0!|^1$gcEAK~$jzp#;bq%2AxeQ=ErpP{6s
z6L$&8s;(!fd+-9)ozGCw@)(t^kMZbtD3<p5e}Rsl{)M`SFaCmFwOHSWKcbOpulv_O
z@%-Pwli&YD^Mi+|q>U9@E1UWV^pwF00iN{TN&E#p$$^u;v2tbLLKJc%6|?k-s-~Wm
zqvL)ne*F0fp1pX4`w!Y6`IrJcSy4w8jg`K<cCn;Nl6X&wibUC8&=a+%0$#ZSy$4;i
zyo$oBRG{~;n~;>xwi1pkV*y_mzpq>7b~melC*W)2-;<GDfnFU|UOkU8)26)qwi1%Y
z@|W5p_-+aGWQR3|D89_RELhlBz|7JN#%5G}6aqG6me^d`&|doZ#O~_o8>5$nV<_z0
zB4Fbf1W8{J=;chC{r`rZwypv_nffI2J%gokMiW*kDae+6F=iDIXmhb)(=u$|xegmQ
zuA(B0L{@qMmM_|l6^r&`*7Wt*wc{wxocSE*&R)jS#XB&?D-k~KiByd<sc5DnB6cpq
zk{2R0iwZAy75p+5V@k?gK3<5lf>oH7w;J(b(-G;P0)Lkv1bYS|(9H)SlYJ2uI2HZ@
zfpGTmfhUzxbP=uNg1MNSpAC<^3`{A=gF}2GrczBF`1A{u@g_cc<P=seT8?=Ig(%8P
zM_PI~((=LyJedI!j)dGu_(l4_!`}sQX^~j6W;PaXUx};(yAXch7~FP!2>VTEVL$&f
z7{(mNB+tFjvD*c+$;V+CbOK|XmSKQpE_D26!zE=Y5*Kbl){?czp(@UpF;`hEH9j#x
z0bXQSD581ILV`jN5-N>K9Q^%*VapcF+?2qRjk#qIgYc6@A!G?D9cAFc0-Cmt&~R|Y
zC@Tl5I%h)82}3LiKu2!^&<7(Nyr9GH8sp}KS9R2=^!lQg#`}0vuP@#<>4VqJUxj(B
z2`e3MezyFLw405OsImllgkAR&Dz>Kt3ss&dysjVl+)pipoXWxq@C0%KI|X!vUnf;x
z7ZqSPU%$`yJ>X}a{Me!-Axk`x0gbL2RcHCj85m-3A9NF%5BT~cv01GO^c3JRBr#b=
zY@2D<WB@~KoG88a_EuI_s?&c!PdR7Q(;8M&B?<J({{gsvhV=DSN`i73&#Cfi>zYv4
z(25$F^IF@2^6F-kRW~WKG-P3druHuUL?Hc1fIa!?ceFfs%Ioqo9{l=m!tOC{Q5}}F
zbh8nmoqh1r%T$#6w5`A58P(f!ey-!^KT-ck1-;U?hbU=%fO6ViQHhfD+#)vj36BpS
zqqbFcJ5xbVSqo25crSrR@C)!{A7BCA|9~FVoj|W%`pjg?8P#*AG$xEzs%Qb;1FHJ_
zY~<Rg?s|Zy8nvYjeVLP|3KitKJH*-+{{_DLGHTmV#zsq^ScS)IIPZ57n6$$kt@518
z=619YmI6~kPk^T=K0>dF7Po;4uR$4&SIwywi!0GHO-)gI1m7)LW%(ua?v~v{&h%VZ
zJJ8A5TEfV}1SV8`R&+r&viJtUCt+ph08>jF^s=P2wRKe~Jq;aejMp?)pcfe}(EHoo
z%1Rx!{GdQDj35b#WQB|lL2y(E!eSzkkdlln1$sqTymSFJZd#3P+ty>v%H@a*4#)J&
z8Cbh$8)gw=OJ;7snUf#k>_=y@ap_Kk`((n$E(JkTX2XB#0tCdYhELpDcqgobZ{h|_
zO<s%8M5?;@Lc}CzBPAmnYu9c;c1Aved_ysHN(lc(3?d>D;p7<r6W7VG35h`r6;$++
z6>v?@gm-!d{L<6l9Zh(}M#0w42aW;$$eXhOpPapjTi5Pl$NJ5fJ1q|dnaRk>h($tD
zFcPvNX@?^bm=+DM#8?F8<RfA6S}Z(q4h6^0A@1-e@ZbFj>{fmR)50?_jynxa*WEDo
zH~`0~C!y!D1%pfqG1|TW*5Qj0S-251*6zd1m76eq!BS)u%tU&2E>hTN#K*)SiolBu
z4MkKqVHY071}2IC4}r6bCoC)}*DY+6JuVz<9jN*oVC(1tBO5!Y85(1(xfQgWJ)ll?
zH^I>p;~YFOn(A((jXTwyAI7-{U<^Y)EEYzD>JSGj46-xC5XVV)%XkQ0GkO<q+Pn>m
z1as8NKn9^xPZiccE89v{*71zMd)BIiqVireqw6PCbker=s62t*qu&U<pI!p*8SN^8
zCxaPJe{09{-#YQ*FCBQn`0-~x{*e}#U{zHx+M(`xJb6giu>wAR)QZQC+X+3{;766V
zBG7ATsZ~^7YYP=;TO*&7X=kjw9c_GHivqfTMx#v2`XA7v#Z~(4{tkhigxFe9cr{dW
zwe`&eUJC&ub6cAg=t-1T)Y2vsdaa#!`qOX942(xS-lt_1W%isvYkB@F%G&PZPGbj3
z39>3etxD$H5@<4QO=fVkzxYkb?-Mmx$q?w>YwlK3V2YbNmE62`s>nyb{mJY3D;m1G
zQQjac!2Dn6spcK4pjSiS$xdcsZzZ>}K?Z-Mv7l0xDQ6GlnljqEc<q{HkpWuI2W*fY
zu`Fejm5NMkttdR1b@70T?V%_)h74dldQd^2srvK;fGXuDR#z2IAJwr8MX?i>Z34tf
zKCY+wqh%)aI$FvV=t+p(73ej}0tbX$v(jxrXv^GO*=s`<Aytx=@BRh7JF?2MOiiP;
z6{VM3NFn1+0pMZ>GY3oPTNuNH%FohXoRB@CN0DG-151j9URF*aR0<)muw$jrG{@-i
z`pC+gg@~wlr6aIswOtkTY@wy4t?FlzRg)t_6`LCo9Yv*=O6cXGXx1!3Z!y-cUyU8x
zH)GNKc?j_F#loTmSi5ioW~9x-fi1`J(aF!Sam_AF^-hD2Ll*r1KgRw7I*u!i+J((_
z(1~Mau`Fgr%gnYc$&xM0Y?+yvnVB((!^{rzB$-JjU=%a6OiA|L`?TBRneV&*{g>9e
zs=BJW)m>HRsl5-pr}t_M>a!8UNAAV&q=Oiq%ox1~aY@@THf1xWOkRW3skFY6$7AgH
zc&y#B8k=`*LDINnjGZ(I17i~~cE&tRSh5CFHf%@2$_<!(<RliKIgfE0Hlfeh6ogax
zMaINXH4VXlq5Uu{ZWsm+9)VTMH{ip!F5|`X&tuE#)mSup24)d_lcr6;xY?62e(5}n
zAPmQD+=U4{&m;Bddzkn9KQQ&&H|W3RWrVMwnw$Fq9OF*Fi3%@d#Ea;e@B&;Sx1ej_
zJh=8*K$W@$i*}sH=EKio89!$kfj4i_64P2eEj1OB#`AY2C1OlGfv3*j@uQGJTN|G+
z8d1@G;2#i*5JD~@vL`}AA`luJf#~SIe4<65tGhS4(Ast<2yJL-9SD8LUPB1JL9mS;
zjP6l`Vb^ygYzM}p+n@xvCQX1>@_0DMjzI9_WJVm?_HscZzm{kg{um->Md7NTS4dD4
zU&qbT>&PVJatOV=ifn=`3%OKl51>aymtRZhF{JJUz0%uRDB|k^+F1e5q?O{aB~?l!
znt>vIo-D5F_R6XH%BUU-3E1MIG?bTRp@LD&LRehH&oUM27U;>=O66Jdvt7Rry>u#*
zbSjffqxAmYpm$xJcCY?H=vjdGtLo!4{`P(t*j-T?c`3(q!X%Byr{VXj=>(pFDv$9S
z<M%5!xla&HM_P71atX1#@@kaS-LrH(xeFB1!e&s>{hn89KqjDFrQ*w|yoK!QdIGPG
zz$-I0SaAHKpd42;u9o&zV{~PWf6FaGA%Umq4T@@S;buV*u4m_%hKxUM@H-peJ=8VD
z3cV|86i^55-wpIM<Vz0>P4J|yAsT5@KxL<D19c8oHe{s}%P-{h;&o8}ygZG+rovMT
z?8DttN{X+bv{*LQ(wHIOl@Nqh@Rbz)fr`=_M(L?RL!r98ay~q@l--rZwT7UVMfhcy
z1`hQBq!p(cRV%J6u1PAPjLyHQHP@>4EUmYiRZVJd?vi<k9vFitMo8aCc!v7HBiI+-
zAyRzgSI<b=T}wS*gV2}+1XJY&gbhJAdp~sQ?uJ=ZdWnRdGVVrD9;(BifTzLhf?m?7
z(JYvwF*YHYODZZBqA3$+VD_wqSh92_))O+j_o}<fQEb|{j@Gpw)-GLv%>>b|bvtn3
z*b6wg_j$y}FGWn>#f(jqo}1CH-x2ihxd)>M9mLGI16Yu-1B;W^WAWH!STtom=Fgfz
zRYWzE8jA^wCSdEC-I%s^0g@LHn$sp>#+Hp3y=(~r6XFm>7|z+b4WC{69Y3buz~VKl
zFnH)-#10vZDO9ElrccD&$rF$?G=a+n*t6vTUVQdt?B2Nx^A;|^WWp$MYAQz0pO5L=
zcVp77qZqsUS;TIA11X2TMdGf{5WC?c3|dG_JMLL{4nBl{5hoEf`gwTt--j-N%i&0M
z7e0I;CNJKGEeDTd>;3~+xp6C2Z`g{(OO|2&+_{)ObqXe?jHRuo4JYhI3wmm}NJym3
zABFw{1{>G~1cn&ptu92tA(ZODQ3#-Qb?^>EM^9gL<uP5NdZA-PFWB@J=mw%&#6SXY
z1RVOr!EtaB9Ai^p8<zs-<jHVOOo4rD96VD-BVY>mB}AiTFB{$vkHUXS5U!M6MMiPD
z0bW`e0Z2QWRgr<*${gBQDg=Tmmk`UXp{*t8@&vuQbQIPTXoOzboop2A^+UibR=O#5
zaAsl1rD9O1pI#{~ZaG0(sxA?PT~%c+3m73-Als{M6Pbjb>gv=`am9o(z)0a)IwMQr
zu>j}g7;CF3A2T!6IoT3|-c5Dspe_94s+1m8-Tg)iYY28%uU;oC)Q|5b;c?R_IYIB*
zO=VrlGmRL(-^j-AYS6fzVXUu#AC+KcVJV7fZ%c0diPAejUd>&elY4~QU8I-Sn}!P2
z*WE0uL;kJ5QFP}o+S=RPUP2(|(9-5p>1kv}h3T|xqPUXhzM62Xr-HtXd@8=2@+xGP
zmYL+BSJN~PF+pvG-hHK~FrAR8@&rBAef|e}S4~m@WurD@s&lWKzH&Fy|BZ|+s<M2l
zK((e*d24h`f$5s^hk>1?4<3P6qJDZ7r6*f!g`QRMmGM|9J!^x8mQsBRfl%P9DCNVg
zI1MFKl@{m`x(sz*R#qJ)Cr~;nwc=X4uxPqZ>$u#X3A|r^`IklQ5qfh7y`F=6BW74%
ziklb&^@)IYL;%6(1OM<Kghnf=F6}NE)*zzysQ-dqXB&6Sp1%U4#*8!2Q$ouBh2CgF
zZ)|)r#>J;#EDO=N#0i)>aXRMAnosC0$GVLhsPuN=@X<qNo{T=xF_<=fD(0uo$MM}K
zjJ-`9z6||(FGFbf67=f3jY@JO2E|ZG4LX2@!}nut>~`!Jvjux5t;fcRi?Dd|49uN1
z4b$dN#rOpiku-lC=I&UAH79mp_Kwv^Ubz6V^9etyw6O$V?DT1fr-C|q;i4HcdFbE~
zOddB0@q>qB*0>2+NZ?JU#haE$=)|XD!^)jFcH{!K?>&OKt2SZMl1-So`3UCizlfC0
zCy;vhbxb~d8OeLUMbh>k5WD;n44?ZUqT-)N)Ub=_mv9L&v1j1fdmB3Y&xKvsEDWEr
z9qaa7z}_Qgu!|3Y?K`(%{iY2B-crn)I|p-T&fs~QU{QFZMsb;BZ0#tjyyVfe)BOh$
zc$TJj!A`T6se)GZcwv!!VDCm{?hydnu%74~8AH1ogLbsGt;2euGgV&a$iY;e32+&b
zOjS1?E~BTyK6wfpQc~f<a7~`bNQPTV0zo(cE&Dm4LCB*BpA&;C<tz+@URs%;ca;VA
z2Gt%FRb@7Es|goEFPkt^<OqBOUB0@f(AE|)9#(oL^GaP7%B#6u69uVL^ZgMS>ho7o
zlZ(n~g05VOFAJ3wv|fd@&$${2z`{f5DRMPFG*fkWYEjgFsyMaS@}ZWWpUcQIDla=b
z`~N5Oe!oJ6L^bz^pl0H_v9p5Rb>-6|%*=Kwxf^NO$j&Q49s!g|C|#rNR71u!LhlA)
zXW*Bf!{1qlD%x7r<)y3T^?uiro5eLeXLo3IZxduSxKU1U@t86xH-aytvWDu85GyXn
zwUP>46X?oojWsT#MJ`vXZAA?-xUabG4$5x-iTp|eF*CztP|-A<0^ik}7VurWulOt(
zXJ~0@YfUF^HTp;)XIn-;6jC{Bss4Crs!vlw0IDw8+Pr*1P1lBiD!Sj<*sAc#=m4tt
z@RHKAw%(Q&ss2u_w!fKWW!Viv?fQMdT`>TZ@)Pv5tf9KUU6+S?UUPvjkFd>5{~6h4
ze6Kn}tE0JKt$vulnC?lM`tv`Ni__}<!ua>k_o4R#X48#D4<Yo1_d~>xUWgvvm*5MB
zSGXU%LVQhKUr2Z;85V*Dk$pxXBsvy8LH!9mUvziyHqeVt8f&1Z*1>QRQVkg)At7*a
zab>|AgK?t-y)j6ROT_pwV=*IjD(23afyMmi>o%>zp8Y#;@X&tj+`bFFBKjjGb}H5_
z*o8^43os&TCWc0=MDKv*=pVWaBYG~!sF+2V(r+=A52qrGTaFzGi*aE5a%`Wp5bIJG
zVs0uSv1kRREM83uyBtY#QjxMS74x>N#HN!65I2)HbP126%3Qj2H<oVLjhXY;V%+%o
z*tGs2RxH|#S(6uG%<yqoF=qu9PoIPNlV@RO$~;UPy9je<tR;|+Vk059aK{DA+4~k2
zocJ2Ej(>%jC%(m^^FL$ep&yXA>Qf{w`T}v2KE!|#uVC=VSCKIGUGy4q0nYyG(1X7x
zI&M7{Za#-?ho8gY6VGG+{*&0bbsx5F+=i8_mt)?7S(rmOPZ~cCDPwsaQ&LPzZEWmF
zBv9=s3-t)DE6saAP@u_&rw31PScpmc9zcoi<{OIc9zn2`+Uq?K9eVabtH@qx6Vo3Z
zdMoeVNObKT3&(+@;T|^`Zb_-|m^26O(-**L(kysQnFrq~bI~JZ0$j$Af$ivlXcB3Q
zNBeh%|FlS4;f<}XD(Qq@M&&gYLMlDptXYI!c2zcQTN=0X7)E9l6$8~?E|py#qk!;x
zV08&ZrH8Mm%SO$uJXF?XqfoP?sdF=*_KIS`j<(qZm0n3nrWuKmn?ozcD9})}0@+tu
zSyjO23V7+p-lnm@t244}ty*oR^d5$tb=^{*chx}e$~9U;13D=>LCz9VcM5G6^nUw;
z%8yXX$}J|KtB^zMnMT`rGqZr7orh~wfxl7dT_yCgxWD8!6&ypEP?QS(kK8icD71jj
z1VNZleH(eV|3WFDXTWz4$gQUutRy7Ms&J#U+H_UPskwtH+GMrx7Ss}&rIiMFHG1Ez
zJIEm5Z)Rs0#iyCgt`Yv4g#8*VuNh;Te$6!Qs3WrK^Q83D5F|S*wO6QzNIoAT%2jBB
zpt%pdB8`wKpvudqLL<B?d96!Jr1btkd8t&KU}mwiR+XnHBNS^Y({T4zF6wJCQO)-W
zfLaQ4)fK!Cc<mK;>hrmeP9T>ml2%$n)il=HI!sM{_B6kSIxA~_6m?fp{p7E|5qkgD
z>}o&bm!JQMf0=G8KVZ(X`RGO9#SnHqNAxw&3mXuPsKLDu6delh03U>e1tWqyYY^2p
z(R57q4(v<d`N7WF5AznS#>m8EON)n;1q+($^@4+g4fIAQjz;3hIHag$G%<myBp#C{
zjlqmr6R~LNENtAmhW1ujR*vD&o}(B!U^Ff3RE+AKioQOhG01NUMuaTJ*qAk#(tkO|
zM^D4}@G+P_U@|r)F2L60Mc6iB3ARjGiq%sWV9AVyR7OiFVb@_IK{k5EQd57IIB6>4
z#!bY`B`dM>@F`50vlIy_GqG;V5iD7;3#rpqValYXm^xuGCXAVlnG+Xc@r)IiJ#ikD
z-6pJDvH|mEuEX5fo3U!`5p3N19M&Fw1<Q}WkLBlo!s3g6VCnN|n0M-TOx*SbCanFG
zw)#`VkAE8piSJ_Cv~Mvs<$YT7J#h3`hTvWsFk$X#Y(4xob{~HMhfZF^kt1iZXU8FI
zUb_RUm#@XbMRPH0_7qH}(oCVPRn}=Uqa4ra=&|FF#0N+$Ev~8+0)qn)6cWg55R9PE
zAOuH*AS5arK2+Rx-ofbV6GmGbgD%ni@Jv*1w2JA6_Wg&UeZLXtB<S@Y1^Xdm(Ib8`
zye7^?(7crhT)YNeJWswnhhg(qaXA+rsguzyX)s#!aYw_R?GZY+KmOnino*kd-_WZf
zM5<^Xsq```S$L}M2bEkF!Da<tuGQ*N{S^s(++SLgh3fh|(|92}n@{_cTXX~21ymjd
zgdkOAF(IoF0O}B{(Ew6*N;FVdNCnAIem_M%3$B!&pr-@_8g}--(0kb83Vshm@9K4B
zPPPIs{lB1Rx~P~&jGG2}vbQ%ga*<P5hFq$!tb%f6=9cjP=Fmba?RvUNEc!<_L0DeH
z^Kr}cv9t8cE5@&xMW!QsPSss1JD^zOa0xoK+Ey@%?)=SjTZbDZm9(?8w)`Jz*vR6x
z!uwV^zxtL@dU*t+)LubV4RT79zCOdW<lfXc<NO??Wvj8~8da@YxfRzKR}J(Of}ZO0
zGN_h|O0ph6&*aY|^ki>y)UAZjGijr8e={R5$_Xw(&nP{H6>|R@g5j;&Ea0zVLM;ci
zyf-T86a>Di@|#@Vd;q-~wcM9dF)FzP)&8xz0y92WjaX9nQhI61lAWPeWJ^Cy{T2cJ
zni_93xAAYk;NQRfOegmv<`H^*hW9tniyqzwz2gR=*N6e=J7OTk$smOHv<QdLkYGZu
zPXdA?hr!doH@Xpe4lV(hzjQ5PM~y)sZIxO)Bnv~RGJ=DH;OgpXQc5Kz#j=o6nIsLP
z6&`^pGsa=|!f9BzY#!FEUu{xL9o~NoD;KOoPrm`^=RX2r4*k)y$8d}an}~_MCSl%?
zsaP;@EN1r{frb4Ov3B$<te&_W3nr|@jLB;;Wy)$yC8Va!S;o)SauFs_@g=3qK*Fe*
zh#N5jBZpEkja`A23lC!DqQjUoeH*4vT~C|68z~bOU})@EOr16l<Hk-xV(ciSjO71W
zx(0LRuE2)v$FO0?SuEXn9P1BW!uAXAVbil8VZ#ewWA&w<u<7-iSb5<O%-#Dv=5F~4
zQx?2W$i0RM<KM%ynO|Z^>?NL~oqS-dK>uMIv2x=joVoNdPCfS)PMmuMhmT#vuAN7*
zdBaYuS-BQV7cDXhZ`!me7&k7(Bn?QQ#U4RB8%Om#h5%REsOVn3;TIS{>mNdut;u`>
z`F{w#$T0X4fDXQXu+f;pkY4B-)gSF5`{HTZ-F5_C=b@v}VNg6e4N8Q~@D$jM7!RjW
zQxPze3UuXGM6BM5prz{(vS>Ym=dXs}oW%%VwhaCYrondNU^MF89f7k3Q0b{tGHqf7
zm0aa@0*{ap^s1@y2)(Ro!H+=UwhTfovxfT!zRVh_J}NN+uh4W`$zUO++T-z+v^3Rx
z!dKPiQSDK=@yV_k)ii%@j?zcb(n{57K92l+buzwTqM(4Ep<z-iu&Rs7Q(aUR3oM~0
zdz+m_2<B!R=*iv+blKVbj#;v~>b^pycaxC2e$9aI+6_$%l+Li!%l${?33wK(tFSaw
zq(PaKa|%mQR8fmUsxVpIjPzXGxSomMsRT6|<L8VVT2v}Q+EUfq{hn8bpVJHQuj{$E
zl3j|t>bnm>S3=90TYJ|;Mn#<gouHRhRnKjAOpC5YW2AFi4*zG)eS4cu8=O&8j`V^O
z6jl?kn$I^k7r7<HD6OtUX;nGDTR!bLflmNt<)qWL5`Gzk2+v!VfJb|)DBwdxRyTvH
zO3=%e+M~r)7G$f^Q}+}NB`cv#l{GCdQN0|2dB5&Xpc4cYvb7bozPG991--gzDLa8q
zP_wALhr?odZ*nOhW}u$;L`_wW8KRbz`8z|_Tw|<heeaLR5%e^q?&eK`{RXe&)!*^^
zm0$6%-~NgD%NH2v^&TPc^+VsdK^Qn{7zU3Tj?ok8oZ?3!x@S)W1qPx)M6Wmmh7U1m
z;=9t`I=Tm8-jX#K@euT+_5{5UQc}><{1|bGBM?v6#f=_;#FThUoIU|F=FY&vrHip)
zqvpFkh*L+-AvI+d0-S>p?iz(?w-}5F9f|S%k}+fGXv`cu5~+O#W9Gn-SeZNntEVi(
zyoswYZQ@!?o4y58XR3vBGZrjbM_{Zb2<Kq>_*qCDI|nIovoJnk0g{K$!T8wum>9PR
z%VuoD>iN5{YT-6o?&(;xU=`*sSc+vUs3=osVdBK;NJ^Z5S+uuvR&2wv?I*C|$cxx@
z@dF%u{Y&h6<umMh{VSaM{3;HA^ar-R_#?Jo{3q5P_y#i;zK%I_-oc86Ut-R*k1%NH
zDTMdlfzY0-F+TM$j-L4l&%Z|CJ^La~oqv|VJBz)0k73KkU0AzvJytATiUsre{imm5
z!ldyg`KMAvB_xe8z|%u!M8atF?mq~=0fFXxM^M#;QQZX*ctH`N@a4nD+20RcJbhpr
z-W#?tgVC+mV6=}OfYvbs(Q!~5x{VlvF2lwUezdG{6JRrP0vu><{Rq7<D%;>C>k+hg
zJtCKHLCosy2w%P)q03exY~^C~-m(bp^M)Z}Q5>O{i>%T-WKr>DRk6TSQR(qU&Y(Tb
zsL3(#%OuD$st7)9C-^c5zKr|e%OUg%>a%FgG6_8bkoKmO@2jd4^a@Z?o`d`%Sw~tI
z*;s-ukD!yXQ-g&Roz>2ob%t6V6<OJ8ydW^CWYmyhW^DVvKrcI+zacA!7Ab>(vp~;;
zk`54himMM4o?vH*KN!Eu{&KtKj!nzXqYba1MZZJ1+@eaY<p0g5(o;8<bo_EN3%}+R
z;s%voM$KK^REupcm0o5s_vPacE;CDNQFMz+k19`=_6ilF>in)3R^m#YGA~zB9p6P6
zm0&qT!_U$PxHQ5xlfcU;ChQ2kwEPlW&B`PEO3Vn2tYUt4VIlI%N)7n(ON(%m4}k*0
zFRze_T)FdjzLYIl%bNs9ZUI#}&!_3WVw99dUZ`XD?_B?bHkRtz3_1JVwA`wjit6VC
zDuGX}v(^R-0Z$4~A;3x9N$m-AR^Z9<3WC~a0imUSPoJko4Rv6ypw*Sd)u<dL9Mza>
zsXR08I-LO2Xp)<<>3<mTN#S3=uI@U&;opD!jD;%}qJQi_D!T#b!=++i{7?)X6^n@!
z8PjIYK)?P25EMkE7apTVj6vw(8Hp}-J`bQboCQ$OQ~wr0PxVQ`p}}x-bEC>jLgJV)
z=-H<){DVUf-McRmlgDGm?1fmmbRE`GO>En|7kjoJM8dF92=xg-l)pcQ#zZ1%a33TO
z>5qvcMi5q`kTh~E#wO3g<f&^gangEB9=`$eQ@3N~%&l0Jx*8kjtipzwORzR|G3Lk5
zz@oSXSe(f9aq}>9+)T_N2or}UU_}2Rm_2C<_HW*d?W>k!<?QL0kun;KXU)b+D!ZvO
zmt%0^Y$QzEgvl$8V(Fgeu;JwE*#6uH*!9xKIP}i<xb)pkyz)aDE`ItmE`9nhTzuy{
zoWArW4jlOe+qQg+O=~{Gg6VG~cJL9z^xcBEv3s#`*UNbBwNG&A^|x{1#Y;H->_r}b
z1bg=G!|olsuyy?gtX{GNi{{bR&YX_%Q^q49c@$xnfRS-=h*##6<Z&1tpM+lh2O}&p
z8UcYpP|7HcBb2=jqT=%l4?_>aP#uu%LZS$~f#}wI2s#sb9eNHzhu%ZcX~0Ny<HN)@
zc0BA7Cc$y^G&m(shsWgkM%4u`Sc4!&<ciJcw_z`$S8qcwRlV=RrSP0L8zHOaVDSEh
zm~>(_u2vNgdQ=S+8OSE^vNe3nK#!m^&|`s8-5%|3h7@0og8QlLG8tKn+`25}vjCV5
z%!Fexk1tc@b#)#}D{@gti>2($CQZ43N82hDXK9&LrewlRfMaCbPd1<qw#o-*X~2*w
zpykcU<oj|SKu^|I(96kD!-k-jX@TCg>!vZ|npEEn!j8Zb*d7L+QG1L(c&z5lxW@P1
z%%Z(5E~Q$nML|V1GV_X!%DZwU9lxv7GOeu|BW_ZuWz^ln&C1)jR#;7iSBBrxi*O~A
zcDS(0bVycRT{h!-aTPMkczkIst`d40gR3EF#r1!h+<RrTy;68L3AyV6YHlGii_56)
z%5aUp>ks{|qGDv1mLh}DOV8)^;xaQo&&+;SR$Y$#;yj)&o;ywDnV&`AU8icjiUQ@)
zqsmjmhFZcc)d+gH5Oh>pCB<ok9xblw_o(>PM^CM=)<y}bJ%P_^YXv^3Jd2{cX_f+<
zHLSoB{B&Gtkp-f%x%y1ghmVS{l8UcXLml}YRHv(lpAr$MtCCgeX}F<f$f2dq(h9xB
zs}^I>$iWyGHy8uB>>oP_17e3@cw!vJO__w1Yu1?TP+?>mm0m2B-e5Snhoh5?7o0sq
zjJ+M6FouOK(&T7UV}{k@3VIp~I%Mc@xVd|wW2dfgbaF*RL@%VIOu=$OZylkxd&fb{
znm!L9fmAAfzK9C*!>~b-NQjNb=u!PJE;*hyaT3N(oJ0A!3}Ys2;OlLeGj10x?p|yj
zyA>PbS7O76Mc6oEA=Zyrf)xW7V&%YPSUGGtmXBG2*`uao+>k^}CNO4=PsX&dahNkP
zfzX?ey&G0y-QtB<FmnM_<1$Q~vl&xX9>?rmuVdli53us|M_6|JO>Dh%8D~Bw<i5Cu
zm%q4<H^01rcfYuTw=VyPH{SUcZ@%#ZUV82a?BDST=1seZ#Nqq+JGNli$`^3r?5BA3
z?Jx228*k&K*IvYhXV2isu|qg~<RG?h+e}by#QN1Mv3Nd#H+?$o`ec(iMZi-Pff5O*
zJ}+rZ3I+}xVTQ>B1W{3ks-;}j2^Pht&XsO~!Laf2hkZ~KT>A_~m!1RAF=h}|o*FR*
zqf6gdbm<?5E&~%`A2$(B0^Zmea3}BrX0JfV{51fqKvTbF88mklqE~D|uQfXmwtN$U
zm#s(G>J127z6?Pt=OXs#D$F>y6;~^ZSfGkn@YE-dR#SC$M#*I%U22ceGvHJ3b-G|k
z$Ys<KehMC&O_i4?=y6{G;aEt^TvU;b;tJXr0x!SFGQ3P(C$uamPy+^GNn57K$x+t|
z+UomiB~4>*b@S7%Su!MBpqF>QyNb0=Pp!8PLobbL?b>y!w(C-OH;mFVv!HRC04LB1
zhz5#ghBO{WFkVYbhvv%o{brh(ha#h(2x)l*xSEzl;Bnh^o{QXKV?(dgj^5B*7=&JC
z?Vm_1y=BJP{(7ST*RlwvqH0=H<<Bd})k3Pa(mG_8YwT?uu4(vLRuQh}mm$CUHY%wE
z)z?q;cj*PC_+68YGtvtPxe}^Beh<xtmQJ;oQ(l4m$|_`WU86OWHa>^8x1_QR*#-RV
zJU<z-y15xh*LloU>v_I;h#2L}hln-9NiJ1)LH-Q_Pbs}L4qBa&uhDY;fxK*WPm!{-
zbUl$;vo>fbtSV1{vnsoX)txm2M1fC=v8vo+a}5B?(hT&JT1?3etkBc<vNmMsVW*@L
z0-nI9E;@qVuUCG-(p5_^ByI=>n;42A@xvJ-=vd=1cEWgU*}elaX3jDFTpL979%-Ow
z?-GIz-8|sr5p0y+uy`rG|3J^G^jutAOl~({PY*cQbc3_KEyD)(-MgT7&nPTexB%O?
z5PCazW59qx@bL7eS_wvUOceSIh{3Rt1Cca-6vj@QfU#4jV*JFpm@;+=W+tw|@}%w9
zG-?mF58s15gZ5!huRYimu^R_^AH=@C2e5m<9&G8i85;+0!m@#DF>k;U%o#ijiwM6N
zaig$o>O?H&=T1&e!n|1vXlu74b>23lE;)qMHD@tz&+C}H?FGy@@D`>Ycn52r`x-~y
z`zN0J_&2=u$qiioBm<v(kdDi5U&H&a{|n!KmVpo6x{BimKf}r;FCaN_7p6}=f^%oS
z#=CF-gYh}zQ@r-_+c<afEKVOeic^R7<KXUX*t}siR;^x+`AZgJ+ML;#Fl8#nri{nf
z#AKw<&Z=MDm_#LRNWzE_BN5fJH-Q(5K+11*4N@zUz^B|`euSWdcK~dB!{8J#5RScv
zqZ5JGsm~Cj@;dh#3cCTLU^^fYHiJgPF>Vs<M@@mlm}&5svH$_J^S;xUz>@&<qhb!4
zvl2eDmLYoWcEr$rN3ACWdCi6%*p9wC7h=%*8Ms=(0##a!JoU@t&1;2TW~~5cfnEkx
zn^a#qBcqnfI_@L*GV5u53A;=xz#IZEk2W};(90)43(Fa0+*X)@d_qox+!gr>0#Dsg
z6uG&C9z*p~QYtF2$_!#cH~xL-WoHqXgr3#j3VJy?mij!az5NgLEa0<F{b&s<4ApsA
zaZSrK?j!hq=k{N@U*NlvK~+cPXQqi%mz6aBZbC^D0;N^gBaM*CsQ-(<ol$WY*K#Wf
zBEqM*j?lkrpr<4RzvofCQTe46)l%uzA+5NE$5O%NmLi*Kuk6-esJH{u4Mz2MS2A;{
z7^zls@~QMn2|GfR@Jpxi%Pz0vHL9Wt<Z&51hAKC=m=7!}zD%Bzj9fl|cs?`p)OeJK
zqVhr%mU7AU;u5OuLRAIG+N!nors*mq=*eal=3ga{)Zgxk8CoXj2~dif%5(!TtF_g-
zT5J^-Rd*A$bO$wb3Dp%+fV8<>x4yPmUrWP=z*nhp*y<a||7jVSqn3LM^sGux(6a*X
z2ICr)p1}9p)qi8z>ZKS)*eQmL<dVR%WEoFD^7!%CM%T4&{U$_3_ZIYIZwJEGDFE#{
z_kfdIpaptyNe`fBt<wt*2|<q@JrFl?1l*l%Vc)ea+?=|@)u|iox_3aoKG9gbXg-!x
zHKdFikFMQp;O^-|;PpY@{)0@x9v&NS>et4Nn}%_TGcY}V2^Pm~po-g!P5pLZPhUbK
z`T&lE9>S^6<2V&`3eUwnkJHiTa6IM&c17;Qw&<<cF>oI?4%m*3!#7|~@9Ee$X$2P2
zCNE~8o|U>3Q>LxQ*wk&9wfb4i+W0C`H@t$xW#=$y%gdO)=Pj&0^DXwg{BN9k>leIs
z`6|BrIuBo7&cNl@|G?Mp-o#IzXW^|EzQ=2qe!$_q?_u`zgV?p>O?>`Y7CyQBD=xqP
z89sRXeZ2P4n|R^E%Xs$mvp99|2o7x9ik%zRV&mGCShR#HbM9<Rnl%$ClP4o#)M(R~
zlFV~ET7B_UO^~3J@BI)+C8zoZjeIe%Q<Vaj{sdqTT3YqTa}4hbmp&uWiPx*EhLy?A
z_KQVVsyxRb$#5K&0{h|POv8ms%51nzT7V!bywFAK;X7j)ym)T>W~k+M8QciL9@7@W
zb>?D3uG@r|ts4=tZZ0}Zh(Pr6QMg76np;+gJgTxh!Yh{$%O=1CJt@1a8Ulrpsi-CV
z2)zvLBlt2EJYEeQ*?KRbmnZ1a-U@sLf*w^~excx}t`ix^$x}U(I?EDvgkLsaXE8EW
z8BefDu_$h4C`}Y2P4@PNX~f85;TH6)Dla#e-zn2#aRt9LDmDYQ`=LpJH1(qi+T0sl
z-cVL$f=+Rr>&5~z{vZ_p;C{_RaV1?-K~nh<de@a=N}ZIm@~P^okWPgr;APbRO?!I>
zc{N-wu0?KXJt}U&Kuq@4nswz`fdzWm+>yiM(~D(?HM5#PR*M{^mMSbW(7TyWi%g5H
zxS3BYD}_g!omWM>P0O8Ip>erXezdrm1qB9r*E7<Pp%(2z!Y_~4g7&t&rVKT=t5Hyz
zZ&FF!x?P2;>SEI}uDa9`s_%;O9MdseN>7<@WH$vqSx{>yV=XP+P^9844HH_g=lxN~
z>n%km=+)P%fr4tUTCfxJZlJ1yDzDUHht-$QQnzQZwn{dtZYvs<BLG|KcW>|-{;$eo
z@PVg%f$E~8l=v&wEH@*p6hlXH89NL~$)gcVfi`aZ1Z>^51AF!!Ky=SOXb=`N82%xB
z%t(wj9qeK2=!J!<(~C<a@RWu;(kL~hnNs3PH#ZNA7(Nu;yFLvU=l1B~+zA~!K8@CG
zTfx=C9liPuFjmdg(-*$}gk?xi^x^;LA2SqvBL-nazeG$Jl8ObxmSB1RwOA9g1M9=~
zV{711?Dao|V*zJyG58!_3O<8ZLrybJ;-!!ics`8qi#&s8d!ELH-Y2mqcpLWi+=;z|
z_F_luKCDgHgC(PPW5$?W7{?eh;UK0iyo9OCUd8maZ)4)B*D;!EZQADdux$Sq*mB@Y
z96b6pF1`3~eE4A+KKbY-F2DCXK6(E)eDTrmc;~gRX=^Xz{HeEa@yxq;>$Pw2<)^>n
ztIvPK<qyB5!utTPzxo<3J@*2Y-+3HAdK?G#?xV`xj16noU<s}D!bJ;_I%7J<j2nkB
z>WfEJ7auQcJK6%k(PJ@SAkV*l00II65UIR{5n()MmX4IZJnt@nL9h!5MR%UZ?vVq~
zC3+}2MG<&>SUC?$LbpDIUcY$Q4jl`-5fk8+G8cg}S0Q-rdW0?{%vSG0ueEzHV9P;7
zuG$9I8H?dPe>Hp-u1CPq&4^gN4H0YCp~u|ucqSzjkB#nzs5O0Xt(H&Raw;(mIisq}
zsijpU;Iaw4Y(h-S9D*+Ueq`~r8aA>7JKp>PqzS3LDobQn5@xC+ETfVuW<gU2S@joV
z<S?@NI+F!3qkuuXn@-!CZlI@DNF@$Xa38~(7*y&mFE7`G6rP}`rA6)0ex_?SG|OZ_
z)=>SPMcrATcijYGcRvKf8w8x{^fVo$soSGkyvn$8Q`^<qIh*T(UIFcG4Kk?Y1iSR=
zd&s3XRWgCn+kYF_$*R^c)KH=BEZ3A#g~~3i<i5pa<W$`;b$-gNr+IXL=jYy_ip(yn
zGL0Blb7+%u3yiJ3PK&Mldf8>=w7~VIK_i{8E3B+EExB0*Qh7WV1-ZDcEYh^O1$=-N
zRTdGzl_;(%HWs(Ku8hI|q1N)E4BY**3U#-XaYfyP1fLsJewrVH7F2l@X>-+$MH!LH
zilx-jaqo7aX_Nl@UNJ$JK?Qg7zrbfvcT{?nw8Z6Vuuxw=OG~egt>*nuU6J_!cuFE`
z>6mV5#4`1|nKx)}dA(IX$p@b1Gqxm&<%3DtX0HB@H5=9-cGO4$Z#aQB9B~Pnu#Dn@
zkvw_~c5dH^GiT01iI5wFM)!wLP&7Jpal+GWy1~}LgV0-z!NW$H!MTE7i0l@j7Zel%
zcQ-GLj2!`AA7^y#)Dm`foe<M23IhiXh93*3y`wuE_!*w=EV%wLi1zP?et|>L-**@W
zdX2!Ska3t2HVgCpmtuwYTCDfphMm3#am4>5&IAx>e#h~g&q2K4yBDtn?8U3W`|xte
ze!LiV1TRLO!lfueG5QqF_CA3_y^df{|6|xT{0x>2KZ(hM4r27MgGfv|iP4ju#iaRf
zAbIx77&GT3OkDg1rmuPr^EZ8p`K#Z>`nB)ixzqo`hi_%z<M-0=*+)0<>4(4L-8a9-
zo3DL=mo9yPw_g7YAAj&2zWC$^eDmc$@$Fat#AhFWgZJL~2(P{R23~l9z&m%&^hZ2!
z-~hI7-;Q<b)=}v$$ASe5Fmu){Oqetg$thz^B#tKZMrpiq5|YM_L;u0U2)76X1&5e^
zkYS;;w#o$-5sJ|0NVo+C!q(T{G+ZcoK*#WYXd6BN?W2Ym=(!9T1Bbz*VK;0nY(|Vj
zml5O9WBfb>%@go8!*BLl1k789h~?WbX!}tN+I0-!w6u{Mc45HY;~2Q-1cvN6iGJI5
z!DsnYv`qF!v&5&+duudq-2N5$e0t|s<q~=Y1e+osIRsly6~RTI8Msm1X(=nKFzQbG
z?yswFpEWYo_ph7^q>Kfzn82c<$SLLVj2znDtTKKERbZz22eJ^R6$pH4J+(m3WK3aX
z(&{Nso?2<8>hkjPOju#}{{=lO_$;vd@3=w0T^HmSR@mL3tyNs-|J7^7&CG0ECHSuA
z6q$Ottm;3JUU?5`q+Axkr&d_aj8;;A2bK5!M%mrJky~>cH>vh+lxliM%ao3)+smuD
zhvM7ngnSQK<#h&rSyX$b^|pv=k8vZv*tF!z)@BsbQq#^#-Q|{-Bb(66EiFZ9O*NrM
zRjBDN^K$sWAe5<w(`j+@sP2jgyRy1+l-8Ef>Z&2hl5{}OE8;n?zf+FtTEbCtla!>P
zqB0xhROThLoyvJA@G1SfmI75R_ur{2An;1@=ba)dy=+rYX9;UPo<-qV!B<srmGF}#
zw$$kfdg`QX>hh>AHEWvc^9p(G)d^kwF-;w>enY+<XuRjLdCzD%07CED4a+dRHFQ@a
z6XU7$MqotTNW{g*8{lb#Q_7fR?A^5+2lpR<GQbmh{ow5%LFhRWdR<}f<cY=0)?n!H
zINI89MDU<c709qKqewhGd@w3L3B9BHpjTus#6<Q%cyKh_d-%Yin+x3Ry%FFV3a{?|
zh;X9HbE5LIAC4j2;}B;v7L#0OW48M;Ece)ob)Gx0&2J0#25i7l|Ft;nw*u$=7vqBe
z0-Oz4i1VR*9lil)LN?)S*fw0~xesT19>%fgqc}F;B6jpWhxPr>V)>Bsm^t(;#tb=u
zg!nTUJ@Ey^j=P8<$!9Py=>*11e*rTVy^U$}UZ-_`2dme9ghPA2#&f5B!YA)!;j7Ep
zc>j%G@ZK9g;QhD1$Ez=0#z!A~i*LUA1^@c#D*pN7ANcND0`JqW@&0?4@y6?K<HZ+W
z!i5VLaN@)X1H7F(cVgqljaadQHhbYh%$hwL6DLnHup6B`2BVS^k)YY*wB*H52V@N=
z^QFQIyFc-g6rSo$!w5WYsys(Of7tp5!670BwtP5sj_Qw&_Zuy`_m6|~h_SF|*v1lg
zu_@?0asoW2E<wbyZ3tbw8Sb?CE|V9*Z|)lO+k6ls_MOJSok!4T+dhmqb`sHBb|HG*
ze!}nwh8^9GE;FLhc+}&F+!&0TcYmjK$|3ZMX>Vz{YKjOrf~7|FdxTak0i}Ao`(bIl
z<u-;<d)lto_u*&y_%YO}IfDv8sj8&-vT19wD|r*E(=%12B9jG8qd_uC2shsRX}tMw
z@=2dg)nZY5761xrf?a-ozJZ?=c3QU%MYB!@^nZlZoB;Pfz%wDxF|OYSUYhC!83LV_
z8990UKALkjk02^DDlMn>F9W>mw6iw|GSf<1sXlfV8>)46J~l}O3M+AwYVQV@8B~6n
zBSV>4it6s6fbXlg2a{kRud0r=mylDJ6-E|SUv6a$uSpGIS7BOiWoa`C3aON--YP3l
zR#S!S!hEW{JOe#-TF$4+E2}LxDzBub)I@1jk(udEQ9zYlR+)!NKHQXPC9gm#>k29>
zb5KjDRS|C0RXMb(`LwFJgjOc1_<*XZ$VMF>iY9oksKc?s0x~sP{0Ds1mDf$k-d2_=
zZty)>xK*3W{h7u_8*5u=saGs5$tL&+x?DbZ^D<FTm_r4sYpv@{@DX~MwD335uH!1f
zw|3Jyj7*Bh@c3Akt&xaN&@{D5zMF_~$z!p1_a3ZXxr$0JlJ+*LFFbui(W#pgTDR{G
zCl_xlS+?5Ly@e2TDv*MnD%WMfe5pJ}#gD=0xN+zeHURDp-mvZD0NW0>aI@)w04E;=
zJNUt;y*na1hoE1l7{qp=zwSa?+jRowI?lyn=fzm#z6NVOmSVlvbZqoW#x}2b?D2`m
zLBAv%@=L)U?@2fqG#`h<SL0B`MjYw69mo3}z~O#Ju%p*8tnYme%lcfz>|SS)5`7G@
z(T6d(_Yn*t_y&zUjUfqVkvRSmQYODh_*}x23C|*R+y%_#`tsSY;NaFzaOTjLICty|
zJb&g(+SwoR?i=6XgLl5gcVGX8U;cFiKmYS8e)z{P`0C5=@$tu>;r;hN#G7xvg_mD`
z85b{J#Oc$gap=$??Ao;pn>TO9nl)>%NF9*p%)yk@sTeyh<pJ<w`SdaK1yk+CCB!4T
zXN+m34G9Z1=U8>7p*q)upD)j~orfnJ1B2ld9t}G_T)Oh%(lurfZ2HH-c5nimMy9}R
z%v3lfO@>|KB-kZSgX@$92w%Pxeb(<sIKk&nmFGQc6_;xfwQ4)n-eL6LMw`2CD}vUo
zLFk4}w6=Q@LfhMS?`GJ}AB4vybU@VR2;8`Ph0x<oR;iA)6)e;x270-*RA{wS5`tcR
z4l?U3q53@4_bIaYI;WmDCnKBdCS-$E7swku(^y?uUaCeZM=3_BMFU0J78aTe!Z4$h
zDz7LFY56yClU7HK80nce2{Q}$1h@j)x`&~s<^Mm>yO~A>$any{`*D+?OOujR5O`U%
zk_9DYw1XA2<0a;GMsB`o%~k!H0TDr^=^d|?)Z+JoN|P5)x%6&Op=D~8J%-lRfT0=F
zlzbq)q#8F1rQ|Bih>Ox&e<H7fs*h2A$1+h+Va+XLZxxzzLs?cT?*2vP_!p0>q)Mhm
z&LiN8XlV-z3A+N+-o1_6fAh173#pJ*jll0;Rzww8W>j8waSl}<RURWdKg}q;N<Pr)
zZ<o{B5{9bNQ&9Qcy<2Xgl5ne_60hO?P^pF}TF{yb0<n_7D$ho#Kt+{SMsOJgSe4Dl
zVhDn2)JP}r2*C<9SO|F6xs7)ERssIHTSA4X&&@QStz;9G6-qdf4dqZQ$XAO#qad51
zhZ?U5ftO|O0Sol5rCq}sLN7Kk&NN`e(%M?)=1w$ITTB=?9^1EW!^8;_jCgAh92NsN
z&p>qP?u@n_x}%4yHx@2lj-kUwATZdXJT%dU=E(>O3L^CUFnHiF_<IDSM|W2^cD92{
zcU!o2?+UlBo#4Y|h>Zgxy7WNrPTm;UJ_t!&`(d)p2&8t2!vfo6EO8uzrEalU?lk~w
z{CZ+TNI16i45M=EgY7*>VN1v)Y!6s~J)!HcCvr2kM{mc5n7vpXa{x;s_hEX_Zj22&
zgoNN@7#>W}g|IM39>w5(r!jok1;oWYi;=XvaYK${eB2q#9s4{MCO?bWvB$AA=`8jx
zeis`TUc#aohp}<RS)4omAzpv+OWNCi;+LOp;*Vd`@$<i~5O_Zmc;69tpW=fLKg8Sb
zzKxe&c>x!nJC9SRPU6UsBiOxrw*lVTwQI3-=~4r{Y15`*JWueLvB^l%JQq}W0-e@%
zG5YrF$8#BI8Y?upk6;%NLg;Zlkm}FT+ZQ%oesBm1ha-WfbWvTS`k{O8!LS`bl{Y93
zc5!3TBV`&K#-_r4+;q66E<n)I4d}UkH$s+eLD1rjh+ef5F>7|y-fkrfH>1ynJ?Ot>
zKS8$z{;OBQpOB8+xDUOzA42TuBXC}xgvTd4AY^?{T)%sRg)yJ1q7=DREG#uF@O;we
z)~Vz14P@05Qn&KBv_xiIu8C}}D{^laAh(`&xQ^<OVO`JWlU?<Hrf$%{PRdXUk&vsV
zs$=1}NrjM3E1XuEMx{gO72KrS)98(xCd_DqEUEyiyh3%%y$`*ILn_cx_ovB$EcRAQ
zYe*@krPHR~2cKYP1zx)BEVrdo+2zwpR@0)^-T4z`R9M*r-3=)|!KSPng|)X$c4H+7
zxLQ<cpr%w(X7caSD$9PtOY7GOIQ7p<qq@^fXE)V~OM9GFP>L*dc&43I<X6-pm!Qij
ztE8g)&$8^+T~j}($maKw!qfLCuBxV*D?>&efl*(JyP%z}t|X)jP+UP}#|Kw=T?Hzs
z^zzE`QCLyHeFREz7J*l6>h-E?i)m$f9`mX4a<3CO8K|i#q}r3Z(lE3P+`d(Udw0sH
z_zG!fD{#B61Qmp1evZYKYRXPc-KnUnq57*K7^|~sZK?kFK&z^x?PXNb)=ClHyIq7o
z@0Rj73rv-0qPi-d$M8E+fvRP>u#lf$kcC`=FPjiIz>~El@X|BX=yV-QxUh2L8Vnm1
zi=lDDF(QEigJmv(z!UhCt8&$<RTwg4h^eY;5Ev2#XICFW&k1eYcY}+YHx?P_4L86u
zgLI>4b0fp3^!(AIhYNhXe5vki;M~m)9`?5Ibm#(a$My(u?}9Lw&Ist*8j*xw&yKd}
z-NpfNUHp*JJronVgkYv!6z25^#{w4Ybv=WzZ+LGUjO&YCBYI)mkb&6LKM9-qOvk3E
zC0G}}2FrriV2=Mfr222cq=2265U?9b{(CTa;PV*O>pVt8pT>xoGZ@<Q1o}rD!k{R^
zu-^$x8h#FQ6Q9R|g!7o%cQ0o3Co~gJV{OtYtQd0yizgkxf$gv1%#rtS;pDsc{PNGp
zxKWH7S99>gcR%C%?|#BJUw?-$zW53sUA~NW-g^hHyhcm=+*zDDbDY3CjD7p|Ve8hd
z26)zbyy?@YW6~sxt(B@1@Dx&fia|pLBOs8LmgiO}-!)0M>P?mMJ%GS-3kpSNH!nEy
zx^xSQK$nnc*v1Ti<AC9?A2<T`gGRz(coMn{OM=~)DR3M=6E0Kd!E5evgs$9z@YUPl
zI&&GEr!GPmVHdN87IxJRM6Tre+8tDThcI9#)!vrv2-&z9{r4Wj=nF4k%G<A_=b<&Q
zS>7K#w~xZLyBUNYZ}_T0<W?6O=w;V&y`Ji!K8;|bY7^jY5oQc4?6j^B<Q|5eH7wA}
zF^bRFUSo+Zz*D4G@n)`I2z;dss)uyitqf%Zq0O~G&oU2&RGy$G_z85@y1oA!^sJ%R
z3PH~bzK27=v&KyVFgusvD6cfYlj0Nfr1T0a38C9}Q1uu7Q(sqVNe89T6&YMtLxvPv
z#a+;r(yCJ3X%xndLfTv<EXd5m6%Aigdigv;vY>>jwip?OrMM>RoL@vWM;lur7?z@t
zHo4-?J(LrEIs7hZxwP3ixhSC3&7;*-$K=BDQdHE{(7slnq@oPfw`y_g&)WpyJrfo6
zw78W;sJT^+JAc*?kKFfHEvk8aH6f5Xifee<E$;vO?>Z{IDpXa`;u1!+grJ~QM$m!#
zZc+W05{^Yw;#K9jxO=-4x9bT}+G1mYE3)oGP)oh0?Jd{H3nd~@pT3+2z|)+#%AcsD
zvC5RK@znxeVG*x$p;FoNTI6Jy`aG#U0Z*;?*VC0n`VTB$PpHKYr^_3P;e=kC>dsm2
zl17il?Afz1bLLFM^y+0?6`>ai2WJm->SB*)+H``eyAPHuU16Xng=ZcA8LnYazCN(E
zwS`UBu5h;L1}~@X2={Y9OemFIKu3glw?eqrGl=qRiwLJSh_Y{ws4i{Mzk4Ud5{#o=
zx?*~OBj$xWVo@(UY>o@SzHx)Fi(hg>p9HK9V?>Oiq8o=fktvuQLW}6P5XnAkG2VAO
zCiow~#NbmHA8`(&B2Qpg_#q5u#77@Re8fS-h3vuD$OD)?@D%0^IEJ}>4`A`2gIE=N
z1e?a5#?Gl1uxIA8IKJi$yngN*Tsm_Zm*4&o*M7~%%`3V1`KLed<M%(~pFjRg;C)Nr
zeM#VbOojIjUVc@;JBO2Jj^gO?Lj>O5|7y4}^?0*pVe;h3NJ&XCz>}?&veQy3Z`g?8
zh>VVcUw|Kh7h*!THY6g#C_MMzFxdJ8qMKg`?81AYOK4AY3hRaLy#~Q<zzEn4jD^F{
zcsRu+!!~{#oW{+7+tm5+nX?=|^H(8o=|+UF*#VygYvDFy8A7S}!j^7E<ce+R$?cJ=
zw$k1nz|j3C(R=4%3_o!Jqc1*>!AFmwH*Nahb4M}g;&x1bV?VCnqb(w&3TRjJ2r7%c
zEuqq*g{-4OqjeSZtbnsZEw^3@uK;;k)^VS<Jrp_otZYKh692Kg);hmT+MY}nj&!QL
zj1mS5LuLUZk3r}Ocv)0>N+TuUS)r$}T3u@+hU~4t_J4z&H3UYblDC9Zo<-?pWaT`l
z+tYexcCJ|pdWE#QRfJFN-<B*W<y3+B)%8$X_l%M%f|}OT^tbyPRri3Bx<Ae9Osc$_
zw6@oBiVf(lWn|;3R3$$<Th_U-7&i&jn`)(Hq~*~*6LvX-URH6L0bqgZ^r-ss`5iLu
zcTvgUZ_eQFm%YuQy2~#vGOf3TrNvZy6{x#gPuywFv%9FhQ;Px(FXTf+b$J!lg(xoJ
zHRbs=`So&ca7mk5U4%P#Dsk^#wOQV#{jIO3Dx)I2#pT^w6;x}5RC>8)`6rJ7;dzJq
zDoV3W-JFzP6|HeCAD*fU6aY2UO!aq~&|NLIme;h|rbDzEFt}bsg;$`+z7M?|V{dcw
zc;Dq(Lcmk^C1scS?ZzKix^@Ky5_kh+hfwj2H2w1gKDn(0ESoBS;gJ!>RW<Mn3`ci6
z7qn^D6;HHm3uhNk1HA!*1{tLn6cWIKsbNup@b~wHgM&S6x^+emyDkXuutj8$JtBg-
zAk@D#qJrC?S7>`g`?o=#fX)~gV1vFM9nsgd4F-F(!sy_3m@~)@OJkj}G07jB62q}(
zL~ktV(;rL1<FGtD87rd3U~zOZ=Eh7!YSc`mMy|uGp8GM2_H|0&VT|!UfOzjc81A<P
zabepqI%+q@MC`;^u1}5Lg(U-zVExEbSloLjmiF6$?PE^hkU+QeC7fLIDlY7N4<En$
zGd_Fw7yR-)t<7(I+WtEozy9kF!tP)A$9Mn0H(z~?&p!JMAAIlu-gxs(TH2S*e7R@N
zpEkfdaA+Taw+$OMY%mQMQg{O1)TvVq@YGbU)+51A@KdNxFFK|t{7oW(P}*C9FY<n>
zsIV}2g@(hyFNnYkhJ8eDbPA3}x99=1v_sK>%Qg{x(UJDnhDxvd&{41-H3{}(rlH5g
zIdGe{2o96yp~v(k2&Rn<S+NzpHy)ssJ%k9Z_guRReK+kx=!(sVTDJoO_Z-K#OYdUf
zp|j|B;2;JbJ%Y%6htTW92@HSn6xM(L9<KjMWkIMF)~D0H@#)H&J*T#m03^_4V+BG%
z%nGzxf`|53`}3&?t)XrCgr6eM9K&O2eRBDPS7cWaP%I3Y6$Bq6i%<P*My4hsDyEg8
z&B<l~$j;+>j%;nZsnaVc$Y<dwGSIWCJgp0Ss?W35=?QEP1JBy!<>B@JH}J{sX3)xJ
z<<PzpOtQPVR9~f3ebsmWMD;yFjlk2y-Z`ZfdzxNQh8wwpS25LB4XxxY<d)PRr??uq
zrL{b+hKjBX>A9+N%QX!m*#-O^gjg=Yr^)99!Ax$uA+<;F<<Z^>ba@Oxub`re=cUNJ
zued^)UkFHoKChUH+;mFLK@QK4lwLt;kr|R!d#4V!srddR{wiygDTZ((V2Y`#G#W-3
zSOh(F7%wX?ptUWf+N)$#pqAIYii*FYA`f-6v@+P$REvuG+A>;HD#kwvJRVz7#_OQE
z!h&?82y3bf2+3k&jcaT2P2He|xe17Zoi&OGNsR_6D&+Rk926HbigF1(Dp><Pf;nH;
zB9jjvKCm*C{pKdF5_<o>{yP@0S#E}%DFnS(EA;5N7B5|b!9#~2NTZy>Xm3@Y*Qu)`
zo_eMeo@&(*UOqvD-bxG_Jj7Bs??W##Dhz)9-ssk?3!EJ7;AqnYZjN2x;@Acuf!)wM
z+75kVYzV;i7|_!meL`%}Kg6D@uOoVScfiQV?wA?xiPcksu_VzCb9)70end3pg~nh}
zRDa9}>xm@;;;^KD3g$;m!h)b#SQ)Vf3j)?+PQVu0+g(WZ*o+kKjTqy#3bT4|!<5K1
zm=?7T3kPh+%AvclZp2<}9!Vf2?#J4pn{jaJahzOy5qlRL$F@aBaAMEPc=z>x;JdG{
z<Jxbu75`4d@Bg}rUw-;G{`td?`2O2(@YR=}<Fijc#)lt%fVba%8!x}|DxQ7*63(1E
zk7Fl}W8Z;&*s*JesmEKnawQfoUX1zk=OZ;W6=TPa#c0ZDK~MH}#K>4<Y4wy*swe?Z
zPa8c<G=-ziYXH}kD#|4&1h)Pm#?p3==wn)HyZ0TAu6;+KQ=egI7c&^`dJRGMA@PJ>
zBAk+@!gc&ycuZLY_teGkoV5Zz^VT3>(FTOC+K!&<_9Ef%Ma1qugQ&GT5y|Mac^~?0
zKZwBxP9y&GtLVS~EPC%fh~a0?W8@2OqW}5l(d*0+to!~WT>q;Ag}3uibn7O;M-@<8
zNa&RzpK8wnG%7)<IRZ=|E37Acs%VFKlV|ZszyI_Bj2Ih>1}z$(K~qbNpEC|iHZ4Xz
zZ~lB*r7JntuwnNmj7*6~gXTP@NdxpA+M74<BK&;iUsOL-T)9d)RR~RX-n>~;G;Y)o
zCwU#Lz*AV2-E+@9hXxHApi!enMsaIWZ>c<8mxc`+qG{8nX1_I>H*b#N!-tz{3PDkz
zRHIbmCOWPm_U=1i)SaNG<=*`VxxEn@G;E9$=bpu(Q|I`-A47vi&A5%xsJSH?-j62S
z*YFYSI&cElGpX9B@}7VBt^50QOk+g!>_w}2mw>9mho63qhRq(~XEZ^B#{3RiDw^oC
zxomFPH)+N!1Kx>KQ~4Q<(5P8+eEj(r$mO{xE-%Biw42ztm7;+5u0dn|)<z8w-76X!
zw`@d4b_U8SXrZ&N8{kcun!?ZEIpc9nc#Yow;4L#9WL0%3!IO(=sS|n58}R%!z&GD~
zY}B4CZz-WvUYd`7eWK8W`<wFpjd)I9df_}_rN)Pw1YDj0<eb@)(1`DE%Io|6cORRE
zii+}Vf=?<hoe<<TrnMFDr1I2&A?OwH{}&bW9xUelQpksFo&|cD3}bE8a!cs_o^};W
z)~>*y_+c2N&dCX}X84(=u~4FcX)~sq@~4qy{y~9g;LyVt?K?RTdR@@6RVP9(m}Fdq
zK|_ZbDX2Vd0$->KXrPpy9b8?U(X~?tbm`a{ojN=YC;O-2>qSc&(go3B-O)S39??{J
zy#gIDD9jCs{Q{8G*Art0xMSK#FHGp`im6c{gqR=3diY~e??G5JU@+kohuKtlb3&(K
zLEvI64P1+9E{ie2X)#h<mh;%<m=>}YGb2`FVgJ>b-+vkQOgW4rvrb~gu+`WwY7;K4
zx`a2kzJW7KPT}I(3%IoNMZA3U4ZQQ>r}*+SWm-+cFTZEt-~YOTe+zd1_z6FJ_Z`0d
z`U`yV`N#P9^85JU{de%zTW{i(S6;!h&t1alGv{#h*h%c;O}~A|PHfz?8LKsy#-c^0
zzCgemrw+(t#u(@+A*hs|)#COW(BE|MR>P3BrB=xV!lP(w`MQUHfJr1^uX?<&7_<qD
zLfeRb=nylMpi?4&ICSVg67BoN!e;nr*vF5D%h*})n6wB!)0e}4?ivIy-i)9nn-REJ
z<7u}dX2X7rJaitTPriWR`%k0q_Ctu-vLAhSAHneB7ck(!Sp==$1>ZHBXl;)p{*||p
z{NaZf_3ran`@_40UO5VBU5js}8R+E^a(Q*7gkC8vZ!xVe6&+z$bgK|Ws_U!FrJYmd
z{7v3;LoJ1kH+jQHElVr-Cd?j(tg<Zp`{r+O^Y<ha36cB9nx!>ZkY0Z6Wj@{2Vwa6c
z6DG1yH8jz;ci;a4UU_-BQFtRqjx;b6^rX18Pm!FQY@nxojT<*M>(-@p+qP}n4EPQn
z6xjIx4fOUS(+WMRJY9b)^bQ<5jr~VYb6vm_pa`)Bf+d678XDlW;OiFHd+-!)WYC7@
zlwjtZ#rKbCYzab4EyB0|1U~%aGYbq25bp!889zhIrrdUaIc@Gd9{(4n&X~z<`kY4i
z_J<#+;((uj`xQOh-O#u>zgJ^@SN;|Or-lBfW&QmR-$ivz87ixaFnNjvdWz;P8e_%E
zrKqc~Mon!c&ucMe&X|fOx{loc)mI;x`n}4^0)i?RfBgP48uDBzn(&%wIe*>^0*scH
z_sQM6)dr9=W=!BYwtU8S-+fB;XQ>;kEXy$=MQ8<HDeZDeNv;_pSFB-kgkFIac(k<w
zp1>`ocO(4<u4UZ7Z#S=C@!I9+KhmP~H0E|>l9GBPV#)GlrW~pVij<y#UN?I$wCU)C
zR&DL@OglUH21H`%^0kECa6?9o3=s5!8G*E9PEHQ6x3@#9r&{6Zr&^+Y+m`6k=?OU6
zJp~V!)(G(KMBCdD!+ZG?b`cm65rh$8ffyMYfWGd8pqC@Y_X@xWKU>;TS4`;}f?56g
zVoJ~eO!JS!OuzAr=@{)i8F3D&h_#)L@m@<XCwwJl1TVyl@Wq(jYY`65Ka5LTpT*&&
z2eEeI8f>1i3G1e=#g;jn@yg+s@bcjo@#g8*@%ihY;g_#}!IfX={r`OpKmF@B{O}{;
z_m3Y9?7sW@OMLz1r&M<z;G+-U#=GyljyK<U1uwt+5}udByAQnm2aXVUd$4}v7OYyW
zmfB^erB;@9{P^((cGid|@Kl#)ZMd)kPs7TzuJanmbpg*aScwFD(A75(cHuGT7TFii
z1ocGghyiHba|l}Z9&S=absst2g!9;GaHpm9okrNrT!Wx_8xXd18=}|jLFDS)2wS-m
z(d+kNz>Z^t-UTF|ehDKFUqH;(1L#SWH-H6jz~NKyU%Lr`8@3{3(;kFu-;bWBP9gEb
zmoVz`Q7ruO6t4YEuu;7g*JqnB_Oz~uz@zQ0q1vm?N9k=IL)BJXOPf_wgc5GQ_|gRn
zw4~e^V<#tL>g>swFk>8?yzH_4&=zF#CSSF2^#izh`g!Bvu|qg?>>&CM>|-f}+}^8K
z4AoZ_@^kYrb<z|9uMrwIXl#~0{qU24p46F^Kd{gXVpfn%oH)@aJ=s{b-U)<KddrqA
zGs<r7-n~YtTLGx|2zvYX?>FlTK`$ed1x@vNgr1b21$s@e@5pI<^W$&Wf8->NpE{3&
zhmKMq3Vcn_kO17Z_b|5ZJb-<NPT|upe?VSQ71A;b;O)ag$732aB7~&o2)=_yPoS#q
zwt?SXe$KA_JQv4LAvCI|1#~=a<%W$obn-NI9yo+cue^r(yZ10<`V0&7_}l*V>#ry(
zE5)X5Tlt#jfalP~!yWq$9>Afahgsl*&9)}Z8l(S!KE&f){Q1`{D!uV&*8Dyk`I{ek
zq&a^7{a4!BI;y%FOrODXr|ZQ1pMLr-!DN}MMB}5+pF3qhCk3Zv^JYAkyl$n0RTb6Z
zpZDr$lS^1e#-arkzo7TaFW=!FL0GP?GG*%DSA-f`=Q6&os3@QXCX8vDWwFaEicnfs
zXrN~vevE9w?|-59>y1CKaP?C39We;~M+`R5lf4~1HW{netwpasy-m3bU>OYLvVk(I
zJk!A$&$M?!+m6og3yL8f*PGD->YS`PCpBgm=mq)_c(!QYz8z1_qj-wY>(sFgI(BFY
zo32lxbGsIB?A8(<c5M;jVv9k+ei#`Zfqov|h_H3UAa@@m21j7HuQx_TxntY_AEfjP
zL_$a)MtSza6#vnf<TV~~Hi;N!HwN(@Q!vJV7AA$wp^cq^>Cw}$aOg5@m_TbeaT#_}
z*=%371$)=+#&gG?!%OF0!o?F8@$!Y2@byPu;`>j(!N0!!5x@QT3x59LpZNZpZ}A=D
z$AA1l*nN$!zxW(qe0G`Y?mc`+S^v)4ui~{=U%*Q*K1ZwcEY6-ihhs+xynTnUW9MEf
zylq&yYAx-*T51=Xt|=;9)_OcMIX5FAA;I*~8y-8tC_ANmx7gYc_yzbAdf}$E)=jCR
zyu4v6l@}TXTlK}Gt?fiB+p%vfIuA-j$04I(lQ0PmW2VD#>`Zv1E`rzerDn86=n}$i
z^<MPdd=&k*9!JckL+G{T2nO#xi<I+knCU4~&b@}ARCUpW-q2&`5r6JQ^xk&_z4q?G
zz$1sz=ipI9?%#*#vwP9!<t^|zF%APx(;6Zdg@jy5EuZQHUSYlJ<qA+JmB$;^z>ZK<
zLINxFN_m4%o1KbA%~|Mp^B*|69~HF3s+X&vEy~w8-imCL))m9e+uanThD{pdw<~|3
zu%yUTQ0EusAu>AB6hMLMyKlcmZgvjFC8rqJwP^APmyNM|*B)bWrSPQYq|~giQs}x^
z>-4m4-L86C9dE76i;Igh+nY3Lf)A;5)ycd?ixvikf}@}(YpXiGhoQIs=vidsSD?6}
z4n-xEw1hR5ef(~MTA^BTGxAYXUT0KZdUg>N7L^?LHEP@pGiJ>*pVPScqb8qXDc_%4
zREqMNdcLO;=~S2Fr_jC%jEuM5|IoA=tKLube|bg4n4CJz0IvbR>o@=Sp7yo?&Tg(2
z=rMF&t7>bISCEg2stWWU*q{I3avpyF;}`t(_Z`fbIn{C<7>%V2xlcV4?%lgXF#bj8
zQRz{k3w&RE`JtJaPO1CL38Se~CK%8Ocx%@zH%rCkkKRRHt<t65=I7M1oQ&rK$MQe^
z^{;P@QY<abLUlzx?%l4y-P@H^e?_RPE5q&ERoq`}EOup83BkwvttgMbAs2bPHv~S_
z>D?6YZeBB!BmHviH_TtL7=4EiG)m8EZ)eY&i@EdXner#mA?TTgjJ939@N|0@Mh^o&
zZ~vZ{zj!SM436c~LC{ko%TR>Sjs*nx!Opfj+P7<i#~y2er&~RNwp4npTRn=d9iK*<
zC!3<nGmpW`t~1;^wnczVcLa5_MNdapf-VGooxRY@&K=27y)e3WIEIJ&V4%MjhWG^`
zKBPb5{6}DzYdi+p$78s2BE|(xz{IFANDLc>(J>=2EiM&{#xKLlnQO6W#U|{f@_O_2
zxA67X-{O<cXivZX2H)|CfBEB&@sIDm#}D6rhkty<ZC`weZ$JMW-+cC&iLV)-UZ$1(
z@FQB;cL}>U@W$&e<K>s0$Mesf$NBSTaQgHq96NR#`}Q8hj_td#Nwb(y;VsdmKJ(_A
zCh!RpCK&4^TWc&W3y_vldulFM!-Y<`q_q`#M)ie;!Y3pQwtN_LcJ+WmP#9cd`oSS)
zfa!qTrSAxI8=M5Y_=)JI7&R5P$+O@*X(2pku7oQst=r7i2wAxs{dSxp)SkuQ0~gV6
z&shvQZ~>!Ezk+G6evI+Yy^S%1-jD<5(0k`m#GZN{qb|Hc=p9Dzrgezfy9Lqvwxah*
z#tW<9c_I;QH+iD}xp-VdCW;8EQYyQWnqsc!)6x=zgrI;|MBu6OuwIuCV%aPVmArY!
zj2~^Ws)XKmKM`0Ze1g}Pp{Sa6*d!*+Aoy|#M%gk8^a_fL`1Gd5A^37~vsp-FsboJJ
z;G>T|GGO!a@Iv#Zgb|^lh>D6f4H%`RrKUcwU%!4v$q8O6xK2(^50s#;hwAUN@BR1R
zH^=AFGH0ZxnYuqgPVc#V`LY>ZqxJtU=<Pdx4n;M8BB!X51+j#Rq>}%?IseZiJf<;i
z82`8W-xZW0PxEUOmf^sW6XyL5soq|E^$iRf9*2g_9_6`ehF|`mdMYk6x%;y63z1(^
zhDp<?<ctk&Wa{h+OUqDD!q3d(1BlSmcy0^yXjw0RWLj;lDo@r{4H{Q(C@oVK0a8Ur
zG7c?hdj+O{{`5T+Spmk4OD623b}hhDsF#NN!u|dCpR~Ud%;yPupMCy5uTcgqry4s7
zA1J{71A5_|cV9=-W>kHQZQC~zsJC(FZY}N-dedi2vV14T4?li|GAg_(0#99D{*(p2
zQ)LvPEU_9;N=ozj`9%hJC1rfD713o7dbtd1eV&2d4NZu275~2a8x{z9!z|EKLq`0V
zB&^@G5wRmj!q4CTfzngfq6Y13{P1)KcRbbJ6|FkB&}zhB-lDY_I5Yu4VV252lUtth
zSgp6tw70G<PH5fw3A7^Yo^JUF+C2RPx_4-gj!!*}PES4s=MJ6W%JAyk6(ROc2<T#i
zpsqF;LTMit9Em|b{^-LeeP16>4Dk=cF#gY>z5~$Hz88Ag55h3_cns??1o7Sj3CR9b
zgo7|8Aqlf5O~=9+i?L?e2An;44zIrY8s6j6`8^hlcX`p@z5F3Q{On_V`OQ~+qQ6hD
zUB;K6eu6JP`IvDTpMCrhKKkHYy#3ajc<-I}@%EeV;*Hl{$E&Zrj2B;c9?xDpN7$Xf
zi4(_g_^|rn?Z@_QJF#(tMq5}qAJ5~{W5x{S#+zb*XL@}zr0x_dV1i)Jn3xBaR=^Yd
ztn0pkL2&S-rFHW{XE$$j_YZ+%R3A9?9fGdCXlbJcqC@{U+Sv){5;qB*2|b&XIfUH`
z_$}NF4_aFH1)C7E{t)`?J&!?$pU05nFJkD4mk@XEb&PxQ10+BDHWJUgY5*8}?0F2N
z+8cQAEQTDrh(SkBW8~SBNV;?!gU@V1<Z;^NQ*r3L%NtLuYl{KT4yMw(X`rX!OQpOK
zi>USr>T*z2EAZuUJs+ANsF)U4R=1$40L9hCn4}w=z-!#1F)n{`+4TM|zg3RR;tUkm
z6{7Ob3gnb!n`4?k(t>uYDQ{#8@Cu5kB#R4>!h)y*BIsS_byde=)v*Z}&8dtU5gLtX
z>jW@?@6%5|eNbSnz*0!*33|F7)_w21^A1dM0zx+J<_(NX8EX`!_DShk>+~LmUKW++
zf7j_X#g=`?crN}%L3usZ*=-b;R&bxbqvd<1=kS~;<vW3xS60R2s?d+_siC0E&;Ie3
z-^|>{mO8;^xb*6qxREK~6&d(t<`rVxWT`MqL&dj0{D_L0TI3WI;$}uBatOY$DdP;}
z^x(M6`}{^mI*)IFW{(oQT$WW;qO7ulkSaFkN%!o3)Y$v9ltq{^J=L6N{buzqwKn!#
zym*0jvkVg_@qcJP<HO5uLUZHh72L%8AH4Zs``UFYpsdE$*Mi{fJ2j>;M2bz&Ysl*@
zND6vVbW(Qpgj^k?vQnL(3sF@ig(piZYg^1E9R$Cl?g@d<3O&nk#57!&(j)i;y+x~*
z8sH69rWHbOV(JvE-LM{EbX6Ku5vWjkRO@Y<?*4eHgD1h~f!3Wo%mhITmTtm;A%b2M
z53)c{qX&FxOFi6O(W_?!?Q9#gZ~Z9R5_nzPwMXYx9bwbP2KH_3Vc*6U-Jj|Rht?h8
z*|j@-sQ4n>oY2qT6Foh9ppTC$qFo&@(AyV-J)_afCK`k6hhac>erdab80^>!BfWYe
zF*F8Kh7QH_QE`~fr_^#PtesnS;Kk=&#`|v(TCcs0ci(!?0<gE<#k=pmN987%y@ik7
zdmkUu%6!Dv@4WLCUVrO#y!Or;c<p_zzyA(idgV>p*f((L`IiX0=S<>&lP6CYX|JdK
zu3fvab;~wvSho?YR;<F(#Y-?>z~fCaW%5Lf9h+<b_pq(CD!##kE%kU-m8YR*TGvwH
z>P^rQcwIgG(9zu&?L7R^B`69GeTJZW-(d!N?KH8^$noehY6^ij8;(<!z>}~GS+y5I
ztM?&b-9bd`JcC{bpT$7I?%8)R`lTnYPxt@;|MW>jK~xWsc<DVP5_ls{zJw%N+@!Ow
zA^zlx7=G+I1HQpWE+FaRD+J?9=(%?-Y?lm1`(-}pwA}%n_jg8{9j(ywOeB@w4ML9!
zrn(Geym^aiX>kR;Is&SmHkGQc=xzxLZWST7rT}GkXt^p1Fmc9YG-yUJ@FxHCo6k^v
zw}!yuduV^v;E`6GhFmH=Q=fOgkO+FE<)!9no}HVC$x|nyQ4@<|`QW1uP{zV%6bykf
zGA_YDMeCb3Z8BC>4N8wc{<x`|)A<*$v@Tc*da8F-K~{m5y)}LOX!&m5xP~#KlMDp3
zPfE{Pr>D=*x>R9y4hvh>edrnG)fD?qK8NheyQcqLUP%SvQuW~X2vX`tcQdcZq<2@B
zmVaHnfrc&A`q}_dy?W8A7UJbM-{<japlFUItJfn_9g|I0mOKN$<Oxzp{H;xz;)j3z
zjG9}wspd+kEb~p4m}%3e^E_D;^JiauNd?IBdjC7<eAV8nM`cYl3aEJ1f6+AP@V7PQ
zZ<E7OO0LP1EDb~r8E?Gy2Hf2}%(>M){QK{}VQT6m1FObV@1K3~9`Xxsnqg`iHm<@W
zj|$)o@Y2iA8MyU~2{%el+caHcU45mgmz%D8;{Nyg@~e+ceO*;m5o+lmN|f}Vh*p<Y
zwz^u1Ip5gYipmlyzCr`Mq7ute)>hCnEx4KK560c9bMmrvD`{&-V0hw4j7&<Pa9hax
zXbJ)YS?)qY%rYq0QYITb-N_eEv~@?zcJ73pCwzkkV9CmDd@7AJN>608%+(MIZ*Na{
zd3j>Q@F8&PVS`R>pF+oHTA_RUE@;`bCG6Wf!rwUvUbbFv>FkIuE!)7QOBeV$*wFHJ
zf}dk&c-ysyzkO%)@o>iwuMqTijz&=HAjEX;kG|aoV2E=+40ek`Y)~W;d-uZFp#w2$
zaDU97J{89gAENc7{XBOOmo8qyOV7Q8S6+A(ufF&yUVHg9y!z5Bc>R@E@z(2a;LTTF
z<F;3fihK3-SMl;2ui_Fd?D<zN;Vf0%MZWjK`4@2Z%tf3!bq2?epD^QSRl%)#olToG
zv}_$#EL(v^3l?GSoY@9=lO~SG*ksz*gm{xS-&k2HJQX(GRKtc1Gfx&hVGQi}bWy93
zstH0EUVgr?>EUAf<aMXz@8B7LHiTX$LeH-E5I78?!W$G1o8ifDNSp$P<e6}tx&&Tx
z))<u+v1UIaHylCureo-}=K=;EeHjVQet@a(eUE7$e2-Djzl**H&trgEZZEu!QD<Jp
zz<p;iVDB09+j|Owxqa;OFJsE<&%tqV93Giyk520y(e+>#v_IGi9S^p~@Ye<tde;ay
zK5c0`N|bYr(38C_Ch!Ekf?Irjo9c|vE4f=vwO5D=+LQ^?Sx^Nu7LqT2_!>DSxm1CL
zw7|*%$M?|==T+pH!r!EsrO=d=(H<)iC~xYloD58|S}KC?BdYUk7ETNFs75AF!K&43
zj8#=`$KQVY4Ss%pW}mtbNzG_oA$23Qr<Qf=HVsG8(81licVW}U4F&=V?Ncl9!%9!k
z6EyWXQhWOk9OCz~KyT;a)5xp7hl0v_WET<GMJ3$M?<m#CNY5)kw&_q@Y5LY3KYx+u
zh41C>UcG+3slWUA4=UIu1YzSAaB%gcRW2p;cz*c4lFDjJou0bS5PaYN^b?g^6*5#G
z$?uc^>Od90mtTMh*1Afbs}HD%^9l=$t!>nl=auK{?q7crP-V2{8p&XRzxDTC{^&z0
zy<$vDwQOrl<^1WVpV8%<<hiw|y*J-{gQ}Y<nV&D{ef9NaK7evjTUQECFBh}DX|sj|
z+^<x^H?V%gYO_tM?)-&QsG&;`^irqsdvd$tn{Piy`Tc%?b=+TEl*0#5mZ<}*;R9Bk
zk@X%yPYDKePe|n{tbO!!uViT6-<yKoHT-eo3Rdxd$Bl`ntz}tEPQ;4UE77-qKLm#;
zy`3d*fQF#?(ycalthGCy>F9^YTDhQQ8&~uhFb2z4Z^wY4BjHanhDC<amRb7hX}$(G
zcfxAO5DXtO6rlkuxShJ9?Ne>gtzB2xceaC57khX)xS;DZ?a-Na*QRZ2ICN+Ym#*#M
zN$`2wcS4A>BYHXcBe<Ohg4%l{qI(D;?ZOe`5{)5*?#P(ISUi0;=A@=#*0iZux@3{*
z41Dz1F`PJk8pltarVTxVb7#)s{MqxkaP9&wp1*)+FI>cP&pwOipMA~}&#6J;IXwI9
zMO=9H0?u7Lk2B}b;`F(*I7!f*IB|-~?kJVr!3VI@uretKbxC=6q=luGb{xSrhUzX6
zqedqpJ~09DqvA0#e&hp-8x}%k7epHy9$`+nfEN-Oj==Cxcn12x*~<;K9&T_BiG*En
z6k5Ca;b}%UP4v@iFl_n^gVV6l=n+2wE~8W7Gi4#d=B-2YvTcZ7z5_j1?ndvm2QhT#
zIm8`&8KaK9gM<_BVf6XSnDO>MkU(`e^!T$FdHQ*bx%euQFTRer<1d&<pyG==`8>ux
zcMj1zr=d-%3)(F1hK}31qVtKa=yawt+8^$K!7t0+UNzlOO6zDh7=>IfyhRIJZwYl-
zDZW#R(mT9SYpEnE^H5w{jHz=89ktvNd|&_c74piJw1CH|0fZJ<{rHM&2?!R9MgkcN
zLSbP(pZMx`N6Va@Nw8U<r+Ow;QwTa%3##kz?z`_=3OhqB=UNI<vbzrhuB4>oK|ejK
zZBtl3tC1iiP;q_Bmd!>1?%uW2^aT_M4(#8D0@}!Iwcs+29kbNQsm^ZSk&`H>s5KTg
zr>M+;$}~*0c$CY=25O~MwJ0pBL~%tmrp=y<hE#z1f6q~6msi)(u9YCB&j6k)THGeh
z@czf2nnsKg!Yz;AeVhQ^*kXr2`;tm7HwQP;(~zA<B}fRTPMgl}qI1{~pM3rqx91W<
z{49R=hWw2srGzmRo*HumJ?jHOcJ}LUzA_-0NY~P^vE@2^@#Pmtqixh{#YA2QtGyM-
zKL6sfv7Z0_^`GYasD7|_uNVW*;-Uh){L-ZdT~{WIPsZIlb+~u8-mJH1-q`fj`~0(y
zP*j+MTlG~Y)xR1<D$0wwEJ9@kodfq(R+XZ-B;WMUE8^!C6y(7&9e}c8tGXeJ=OO)m
zve)bQ<C^m6t;EPt!x5hpi`1D@F?Yc{L`C<6KN%Is@~HG}K_Nl#@Nz|iC);`9iPj!?
zoX~6A$qPecCt>T(Beb_8jLHj((xgCwp6ppL+&w)J5fOz^aicJ7;7~Z(I-)%R=g`d-
zww=15OWStn)an^@XxS3&pL_z{+dKpN4s8j&j_|VYPQ}+10rpOCf7$_lZ9Eau)gK|Y
z{_wZ)MwC|=5(bULv<Xu&aohyVm@xxOmo39a-jF-^bl$aZKX&gwfIWNyAEYW%z1z{F
zM@>Gv<HwJiG*2f_o}_X+^?!u+=@=caL;-rQY^)V_YW-7cD2=04SB51^mOL0~p_@g8
zOG?hzS=Hl>N#Y46@Dk#XoRVymo+d<()Vv~rO8FjUInjh(Frnug8UzOq7j$;AL3<}V
zbfLBH9vFsBRDA6jUBfiwtS35z_kvBop>T~K3(w>!@S8Xbev{|IZ|Zym&R9YKu156Y
zO-R^%7D@YGz{uS%V&wkUG3LyNnEA$c1l?O0e*7YmpL-RPUV0m;uY82bFTRh-FT9Hh
z&%KUBD%*rJd*QyUFIr9Qh|X)fqU-*yXm_F`S{-YPHU~Rk*lQ!G^R61WmC}-y)E83G
zF>b31i(b>JG71Dc({-g3<+saFb*~Z$<KvCWQ(XS)BUIn3GD=UNtGH8%jG|1_wM7NV
z8f9fA1Y0&rS$G7#2@}Sdf~f+kh74Iut7X--P~(LjyxMPVY3<#+w^4O!N!M}Orq)@F
z)KDwCj@9;N_n{zAG$d40r%%J#bLUvF@=d|ovwNp0w2wXd2zKq<VY<r*dPPOWd?0Rr
z&`@#g^f@!CqUO$D$Sp1>^ve00TkvxUsV2=)fA=q{Kb{xDEj?Ryp3qhI5~(PyHy|LJ
zs_xCQE>P~|{asjIem_lACC1B^%ic0Rp!F8iGO0#oZwq+LIN4u9u@Qm(;m03SA*o)`
za^4iOv}y!WE4M=MR|8RFD(Fu?|HRnEDXA9g+2YaW`0CrQP*GKmCCe6@Z3+)hLgN1V
z<&WQ|daGdhIf$leXyX5{#v?5(4I~PIPhD5aXisaZ%Q0(a>Vth>efbHkY$?^9GBg($
z*i}`QavRkxAy;2tjVdZWYo}!04@xIxnGPV!JQM}J3_@SYUZwOj4dn7wi;<8tg5@#}
zbLY=Cvkgjg`1#A$s@t9F^!(uI(}VW5qYoZ=${CM6?ToH=ffzSwA@&|TjediMnU;=F
zGjc${3o;`#ynTI*?;kp7Fj7X3fv=|*q1PJSJ9a|1_8ri*T|0E3!t26q3P*15(YXuU
zx)FG`ws580b?amU*S1dZ>gWm2P9E^=>VYtiPz<5cNf;K7gpmoDJaIDS&Yepuvl<&V
zZ^HVGo3L)fCT!Zg72CG&FymeYEA^n?vuBTimnxoRNd&Y14Q<ykI#%{($BrEabQ)Qq
z;bck+r6FaqNP?X@9b1zJSVvk|8!ZIAgrrd>g^f~Dt3Terfdgs(sagp>O^O~GW|>7!
z{Tu`Mu#m!Q)2#y@Yugf!v~PvRZJ)stb~b3`(gW>%ywN>83|(o7yF^j}_Ui}xL4)8!
z$cFK09-KN2KD@#Gr%pr6!lf9zW-~@^J&2UU&mv*pbBNvZJcjMPgi$A6NAiWYFz&gx
zG3BNAFzby^u<*TaG5_r^G2z))G4kkf3_7q2{SPdM^P)(!nb{fLH`}7y!S3jAvJ;*m
z^jhz4hXF4P!gc-$a8h|C>Yqm&t0*SyEYPF%rOGRk5+w8rje;w}{FU?2;1L!Qs=Xs;
z4)gsbD6KCu;LD}ztNpu<3aJ2|z8(+i?bHfaR#9r|Zd6G<a4^*m3y9)}AO2zb+gV==
zU;>%0y|txQFw-?wYpt%AfG0Kg?|=WB1tsT!8dRuGPsim_!R2b~Dj_2Kr)`<EOuGra
z7A@4@u7PQ|()$GuDJxBFYPzHlRL4%AM=_OL5v^-^?QK&ZX8>kKXEa6it$V1y2V+-X
zed`@G7A%c5R_J>Rbe2X8D=0^dN;GMr^2^GRSx|^E6D?3vU%mG}BJ}t@H0nZ;Nf@W9
zbMk#Ce(}}U<~O-}dhj)ECso+5wEk(zKgfrH&Y2zzf}ZyO^sgUHwZgawDTKcTYF~cy
zCBd3y(y%wugNxgw@T|*EsnRs?KPf5UA;9rr#C`vH{h#&EKf8?Dnu-TXPsjc6k8cd*
zr0%TX)9bogDq@o_Pl-zjeL~NgKhH81fEuL;Jmt~TG@raKn!xGGbxq*@8<wtEXn;4C
zc6`C2`4}*0AOa{pjLK7wiC};D1(G$s4ruU9CvQCZv=bhC$^p)9kyyHNJGSgNh+h2$
zn(QBew6vk&N-q^+D&;kCjk9wPcvJBu#E!tAetqHI!x`N=c0h+`o-vIYHtne(o@s-w
zt=qzh_Sd;nH#m3f2A59VVb{6?y0_{G$94{|Z|jI2U0f039ftlr2cUni0T?!9IB(p^
zrsMCt`SY=G(IPA&h!!tdie<}IVAaYsShse)QDhqIFJQ@z3SL{bY%%s!F#F%ocJ0@(
zQgnK+Kqplv8*5c|f}Iqmx-zIsihw6|X9b=WdQx}-o>JIoxS0pfN6(&=)BgSs>hlCV
zP15b^?T+@{JED23$I$Re7L2EOqd(OgjoJ}_-P@y$ha+sm{bAoT1l^;9&^;y;w*8{u
z8aohf2}96zKu<g!8Hf(t7C0pZBi5}(pQX!@wC4!M9y*UnXQ=AVy@d(q-@^2lsrX*K
zOwfIS@#kMh((xC$q|H5Z0mBaM#=rw>&~x`RI4lfA$NAl0v&9CT_I5&>qwVm_(Kcwi
zzdiatHvl*IS9+@$WrAD18Z2l}8O409E-EDiUD;iwR-*E|RbrH$p!e*n7mUg?^?HoS
zvnL}ZH3d`WrNT4V3+s1oFiLOL`c-InA9_CiKG=6~9~Jx_-nfaDLdN$-@}VosX>FaA
zN|Vx)ZB<8NEAX@w7-ivfy{z_CfV1|~({X~NKFie8$+qTbY-h2tfKrnJ-}m2tXWlDt
zKJmm8rkZ2N&Rtl$ZUeXLe^~(a{l9)ie!0feRuWQG$S<wp_WR0>QAgkvSJoQn?cApZ
z5J9d9rp=m*)90VXNh<E6C(q&F(NhoJBe-euAxqk*A|$Kz)+ofL_~^4QpyZ`ejW;qf
zO@qd?S+n^*3-sP66w9a%7cX0iW{+qTM?<){yJ0)+`tg$|O#QR<ybKyL&{Bt5kc&xE
z{!`!f`B$G&{i*YB8kVnIVz#LfLnAe`{@Lf34UDw@7?q*GcI@au6Gsm3$C=Z|F=P4^
zV`CLtxA0n1t!bMTh?_U9!-?aEaqz$%96fRXr%oQlci(=Ae8Se)T!u!3Sb(Seh!)^k
zQ%ae%QjD8?P+cMP{<wC<G-j+>w-O17BdOe{V#chQi0;|TC_GceM1}?M-t!4?hEJd!
z%U*joJpK%!_p}`g=s;S?Q)Z5gsGfaH%WXiAGW#mqilyZ-G%NyMv}U$8-4GfafTZ}5
zh#fi@Ap!nyw6R5dTH8*9UB{=}pwm-rVArk-?AvvN1H-Ys4QyL=hF#n4=-Q?`f_!NQ
z!+ImYF9eYhJ<Yt7<N2hXFnJQDOq+_S(^E|gl`Q7mx%04K-a;&zPYBNEk}wpg^fVT<
zq|yX1RaEOnwT8f^rM6oEr%=y)E9mCUoA*H3Jq$aw#{Mtxtk6pwGnz^--Yh-5Jb8}7
z%@ajkQgoiw13rik3lBd}bh7J$M_WIRCWKw%r<$VSQ@n|uX0&3oZi*(IAIGB(t?`Is
zYc#ff8VzlqLIe9&Xyn`uP24)7v3o}}VmuYfhs~G)7_wp(hEUGOZ{33U?YpS>PGaiW
zOGr6!5h<shN7B&?7=7${4BvAaqYhuhyw~4H>Wj~z*Y+iFSvVZ+C;OoFWE-@ZLuJ0X
zE86dDi>LNIjVJd%jgAL9BlgvixGCrnaAj)2Wfb1FfUl&k)P$6tz$fTg?QQw3QryVB
zhWM0`55U(@V{f&7e;C@h>Elh;mS3;`h90y|4b@*yYC^%|n>26oKzV%p>Bm%B*(Uje
zY^)0HWZoZngq4)bk3as1&p-bhKmGI*ifQ+x%B)sa$A0q3CuX}WueK?)&2(s{QpzV-
zq~KIvCp9M+3Vyn0tjDX)ta*z^X(3e~X93r$^_y`so7TVf4u8un6qMJRsUVGNlhSM4
z%ydza?UntF={Jb`EMWWg$DeWQPoT7_9yx{OD6PJYIg3`9_cnaw2^>3jk>{|$q?H;!
zjY>}SeT3e-AAgGLnK^uqIwupfgkI{*Sv<G;+(x+k+2^K_<I44G=%EhEJbwyxXR)rE
zAPir5?G@w@GMPCUm^gX7Ip2!QpL~eYa&;3az-P3fR<)-_3@N=&KK%f%zV^a{7TWO>
z#+a@tnx?b<R+VXW{p#ya4Jg&2xp(hqRMXy0oM;*9rfphUm7rB23W5S&DebW8^Kx=C
zd5v_9E%kU7&{4784^17Q8SAbxe!KREf!?OA8xfZ<67v?!NAlQlgk1y$8p|PFS74AY
zIp#{GX9xe_ZfNlMGY)vXl^t4l?18}}QgGn#S)`1gO2|baI8<FNELlOM^7J}Pbygu^
z@bq*?x2~NK78-=O*kKqIHxm8(^npWnTXbyQ4sD-og*J~rP4KBfqXRm(v_#jZJHfHD
z10q6tA}p*YqN4jyiVh<{Mj?(ciB}q?lrb2UoP;qc$)<%$PxsU*shB=>I%cNM#H{Iz
znKKQjqzGk$1uhNz7sS-~pb!|Xq1W24<8-`2Fcs)jIWAlDaJ}6Fz>TsbB78WMoKOmH
z)aWErXQ%!@0-o%wX}I9|3XTYYZ?GTiUG4E?`<7_jvI&9L1Wli5ibky(p=tZ3XxxsH
zze5u=>C_C3I`ii4%tF<bH?d7~G<JBL;A3HPeiD!SbwKMd8?=jZfXx6;xF<wm`06<r
zyK4hdj~~Dk-ZaTnN&`1;KrF3$^3ju+`P^%me(^P=UU&s_UVatB_HRP>sr~TGSa&=<
z-Ud%i?to|J^5L|h1KRCqjkZU4%^2N|*dX@hSfmkq0^WZ^uY|xWGjl8yQSBA;wUnQr
zS5TEtD_e}pJLP5s#<8<UFmiOP0htLFoWzMqSiN<PC6Q%iF%?Z7Htg7pgk-9_rc`|_
z1QF2@v?@#S&G+9Ri#NRT=BZv!08#;&Ov_hORc&A<NR^e98RaM7slnsnhK%>#d(Xg3
z4=;62mf90QO-7cY5;Ly0fZ!`BDWxLR@G8q#UB#tKmsqGLvH<gXC}>NQCrrj`Z@-V6
zqH+{f)bV$gqnwhixat=2N~`((3C8;g2&4cDs09D{+cnE}Ld`cgf>!trD(QRkdAy>i
zvL5GOc#Y>ros^qn_L3FIF0DXDQ7Ohxr;-yyn?8cq-@j}=v#5fA%_%@--7SoxBQo}v
zp?-YnJRf;QMJD0E=Iz@|!U5BgOVC6_M`0cBkI%pQl1h%siC`(OETPh)_2>Sk%^UF{
z^PvG=dgcw<+)M&-+ylS~SRa4#K9((CXrQKe_PH}=jBLFo8{)FEN{L7d;p*o2;61<m
z`zOqr%?A-*D+Ev}$7a0VR`>~eMO1dy#*5<Og8#C$nc4q=o+Z1Ar2*qw`VIVg^>>p*
zbmNvyraaDDun0qjjv(v=Ju*vWGuRvcK~C@u>_X_ZM}tS7>Q0r{9geO+n3OsnYd7p5
zgod*qhnaNd)_!JMhHCf}ffp7YVL)i-U<(IF($n7;0|ySk@S(#HLLcqk!wsF<c0}8!
zTBBntOCP=tE!(60lkL#GO;>n2d!eW1*yz(ALr07x=td)sUpQ_wfkJzuh)aq$Eu^D)
z!|J9@9-WMF$>T6Fg%F%L**x*3$^=9~ON0IudeRGIvZxP-wrQV^)qC__*&Y=VsXD>V
zT5qRTSS<y-{~PqQZTN^-^c^q&-oAe3yh!0`T%Q47xTbClgp-F8p6SvW%~}y^EgPXR
z)m>w%yN0cJ1GKh8!}h!pI`IbR#2cwAVQAM3&7E4Hh3jKz?A#p9Tpz{L0j<#~npU*G
z1DuAqz$Mlf4nu<wF+C9}yH_J==PLADH48q|Mk8YBbc{Z@19LAui}^3TgehmwVdB|i
z7_eg@?50Pd<J2B#KfSB5wjGyuM&~UZ(P>|MbUV`xolkUu&0$*%eQp?di&43iJp?_1
zjY_WM{=_n++^70HHDm~UQhGWzzdR2$cdJl+r^<A7QPlofi;~(B7UmoS&7x|q6L`hy
zqQHWe!<$_x&C{qnN-9fHT2YGPG7UeXnkprD2{tt-RF+qm*3>dWO50>dm25y@Q!B0D
zC-7M-@3j=*6goyr0j;pG$UrYGJqwkU)l?GdD^|><hFED72ft@&ndy+6m7{tz0@kE)
zsz!Dp)ve~arLq&S1Uu`pgsM#-ETW3LnNwu87g7Zll+~bwU@X)87lb4i++K%@`oB;>
zcouSBh9&?nP~B&}nFJ`0)>vuc)z9xrCLzu3*QqYEiz<wItgOF9C0R_x$?t5kCg+(}
z-{Q&&<dqbg(HV7jlu8O_OzvM?re|eR4Q5l}(Lz({-TUh{Zr!OxUSW=z3_Mpk>lh_v
zJO_Ea{(3M`#VOl~mKtH9L<Q<grxaA>73xG>fLdPDTek`M!Ym$}V_JBn`gE+ip4|KM
z7XSM_9#_fpN~mUBH>yriOvqL8nk!#k1(#BIYRR`kPr$Rr!((o*Gyb5$`~CVK27W8o
zuEC_KEdMj+(AxGTr=kcu@+w$^0=(cKXv^|RkrvfX(CdoFp6W~j#$W+$?TpzA3Ard!
z2`IQlMnxNmsEYZBNNrcI<}gad06sk;;qB*x?zV)clM@2DPqU^)MMT5X-3w0kPH=JP
z0XHW%xY)6<G6FmUFsSb!#Ky*9_(-L7CPWCE5%F<mHX|v#xWqUlj7~6JLZqq$nWRxk
z7@bG}ni!2_fsmjQuw-8p0+>RIO&0ax&^GPUasLZBYdoy*wEn-=_Tkq91`dL+pZfl&
z{?0O7Phk=X5PELD?r7VsEfpRi*Rmm+JVQm-n$T<A2u<2GLbJ}?-hofUE)CGEdt)@|
z#_cwI0y@y{I`UZOCV0fX1zH8raz=K9P2cWt7-(n0exMVahxx%}WH`d64TsmbzHmv7
zfn8z*d}j>B@NM%EziSmz&mP3sBfBts_e%ILjYpeg7qp*BxXtT=&P%$&b`w8uzb$Nz
zbVavQw7G{nz~-PG;$Dm=@_5tN)3Ta|h!PYLdPTR&2`@pf9EH^uTWeHa4R3xzuZY`f
z|EytQFEp^LRDQMkaw?QuGn}osMsTb|UU?w_S%u6ZS{T_W!Y@w)%L%?hMyAH#7Euir
zD?w<k0bEr@r3H43VnR!RliCyb6l&2`J)b})@L84L!{F043k=ne!S~Q6Y9jEwyaIk+
z8Evlm<f*l=khYhFj!OePE;THyfX7$V%hFmPr>Lm^lMt*k7FWxh;z|Q1y%qos1PQ-<
zg0JfK-+UN==Tz^hH+l97DsR!w@*HzpzTQuDSbB>}P&xXzo?f6a%e91}&hIUh)ZFI#
zDj22wKSflIvevm&fP)ZV>Ixbl0|)}Xq+4)u5M5I29LlrStH3~O%H6M`j#;9mq@
z9cq<Pg)oxBQ_86VUK6RjtlV_ICVUFA2s0{bUNg(^utLH~qb8L7M8nbY&D^<qUB_#!
zj>pQ5qR|-YoGiOq$7@}GtD5`tup!j=zZ6n-YP2xJ%?QBqay{Ur+BM-*j;VLF!p;&J
zjgf9iG>~?UD(^Ql_BM_7cj?NNn7eQ>#!Z+?!KRrTBgratRq-bP-Kg}sBC=-}3>e&%
zE}(fUw58n~G9n4nXDz_U_@oD|9-%DI>Tf2HX&K7GZ6>o(MLi=(_E)`}8JgA0)5iy%
zUOw>lVS#iZ=oHQ_aCdcwAGbwCL}BRQp-4<1=!}{o%w%U72?CrViQA?2xEz(F{TBEn
zQ4J;$cuAwUBmk`{%o<iGNfjytG=+8D`Z@m#u>XC(HMGt8KCAN6XK7u%)%^o3P2qx`
z9s&k>RD1rk`yT#YXxp_N9&f99yvAtsbR#r+hW725`|7S8msEICca5p;nslcMRM--J
zj!n_5hZJ5jJnr)t+J?18*Iu0oy>77W*A0$??BFuomEi-I5dr8iG62pA{%}h4Lyxfm
zaGw+cuheLS%pZZsC1Vh>a5P+}4Mdl5{%AkJ37uyWaEm&_VS@u4b~wOpe|Oj(?TRi`
zd7ZV~(+$Hf#)77T1)5K4jlaDQy<$SItggb$w@`k&9NFc{z(N&RQ)JRgmDJMmRu{2w
z6HtVlfTy`u73K96W(q>7#9S(Y0xEza!Y+d<LuxOd1wzov<C9;ZG(wftl}70aShBPg
zW#vZcX(<p|0cTZvdMyAxtn93z*H$~6$Ir;qa4$w7p(f~+mX@2Ppr>U4x0~!LIaFEP
zr$PSZw3N601}X?8!A&qLrEC-Iw4Pg1Wq>EZX<fjR_0=}PuJ#^4amVumZokd#0)Hi;
zR*&qGYNPlx0Z=)WTyfnUvo7G}lvWww34p3YR0E2#EUTrrnCebRMDq##`al1K(gX;E
zB^rXJ(HI0>UJ)VAheuUit$|>5eJyU?y@fyjzH6YTdOQKIlGj5~K~Ty1R@YV-(B)F~
zN$u70x@##LE7&RHvVd2?hn%3NZMAd)0;80kIwwC2H_MD>mZYH8ZR(yPg=Ynxx-Av+
zVP~?c+|Qq91)gcZNKeD{G^#u*ysK#zrFS*$IyP+Cg2l^MqEG)J2nf=!&j?dCgMxi1
zPVA`m1Ux%}?+G+${zN<Up!!Ogm`d2B5M~iZ;e`=!mLTj@u!BP^5fWmtyXu<3{nmIG
zfa;&-?dNC0*Mu)Y=nW4q54gCwz}eXee!kua4+%#9KE2K4;iD4b2$lpSjV6E?RtTC#
z0~3OjV5G1B$O<+qyaXhrb<)89hoNRtJu$Rj+x6auL8sT&&$Aw{eb(33&(ksLnxYW^
z`i<rRp#FIL?PkP9WEedBywKILJDzCwByDYTqw*THVzi-!ZA;iO8g=CB&I|&tF;!k;
z8!9{drfBL&*!5^Z;I+V`UXS9bz^Bk5iq^9aRat)<Dm+^_4YeomdJuSCw7EWTjrWDi
zC~x!_?E%*m4|q)Sg!eRmcuWa`*R&|Or&0lq4@L*7ymk}1qwDO>uwC8-PFtPexW@^0
z2fCsA;m+uEq?2jH=)AKVVlRvYpRu%17U-49)>6rp5OkVjp@Pbw{5BzQ*AkVqvu4x<
zEo>ESttOt)NDhIooQ0r-c2-eH_!VoNwnvbYx+|v&(6Xee3^}UTsn9ShDzefdQ>Rx<
zn^s-Tf=OjpTFio2S%n;9Lkmz}Pw23)3ux+-r)>%~MkuVxPO!5oLa9SpS=rm7Vtx;*
zI6+R9SKBM7DD(6A8ML?pU!jgErkW-Asx*h~t-G}Ef73=%l@Vse0v~N{)h$|8!Y-2*
z)e1h@T|rL(lp>^<GtjIw$I|xF;>zCU$pZ6pGWk9MuY#XnQcuNpn~IROR<J9pzlX}(
zf8lzTQo@(hKIaj%JQw^uIsA=Pw{98ON%3iU>n~a0I;tQlHG;2@4-X|675K`lt59A=
z6<$RsOJNZ@s^hERH7F`6FiSyD(35Jrb-Vh(kTQ#96|9s2h3b%?tI>7ly0v~!$Lcsi
zQmLT?x~%Lpv$W*L6a4bHPa`k*aI8>ug#~!(j4a@p(K%9j26~DtsyqwyRG)X{#uZ$r
zO;&y0m7CYF|L|d?&YVs6*#|+iwwi-m4mB*ym#WVmF}>_CWSA|X_Xrxa>*9c3{Rtfc
zuV<gWRBZl)8Vj^$=VE~t_@weI5l+>|*9<jk1cq2bO2?#2A`Gn|>#MX>YQ<Eym%vae
zqcAN4C_e)b<nISRFHh6m#nH|djt+KaTyJnt0HUL#Fk-}TjEIfJNCHQ5SPFhgqY0)l
zw51Fy!~`V)$qGPgSfM3Q$;w)xW(A?%V+Gv*0#V27IO{gqVIAA2PaiX!Mg8#ve`|fG
zwei9~Bmg!&?5OabG%d9apKOE%EeScQyhd#sQQ;AMRCtY4hezNwP!M{JYzaLByhqU7
z?NPLFe-tfgYuiN#c&f{@f!!c`T3RRc7~#z50oPbUZlo9761?G-<O$a?u5cUI1D=z6
zQ0Z}f5>@sjZ#Yf#gUxtPbR6e^_LI7x+uTmDTh$eg+Z|!Q#~$7HQQ__HfDQ*cpwoWt
z-(ib{XA{BaEER!f(4xhajlFvxdIVmj><|H0LKteKg`g+<D&R@oDavX}QGKh51wNN5
zr<#Rds)SGb3O>cD9#oGf;Hhpem&#6&&jKglmGG%A=*ixy%Ssj1jO=VRmr{BHpVi(9
zXo8;gwLm8|C)nxr!xq<C=ci-x3oP||7HdnWGX&5ILPN(GyPTdur9u^X-{O{4*O~rz
z>X(;A5UEi?%1Yf+)Id>KUPI8|rW(G(-&sS**BanS2?~HxgIZS8<_eS*#^%=X_<OjP
zna^{0hj6>g^L-C_71ex*Pz@GS-SV??XlXMGOYb+L&?*zYN;PlI$fBsIBOv+REBP>}
zy?w_dC{T;8z^6Wb1^oT8x0)p_w;+#Dt1^uP8kwPi{pu^J^i2X^86Q4UeAaq6DKdo=
z+O6A`WCF^da_`SNV|S~0y#+m~JHbwMc(?D=n)m4a0-pN#<>Y1%lu~p29|T=xWjP;e
zmAuA;bvYj{{4TP!)}*0U;ALdV+WrT6%C2%fgElvv>r{GYE?ht?9i)Gd*c3&j$MP6R
zmW6oHwc4QnKnKLe*<s+ImT2JO5r94ehZ^7sRsx>-wS-WG349v2rcj4lO<Nd5C@IX0
z{IYdw(dAMJ2^7J6pL#zB5NrX;!AIByhx4-{7?J!u%?lA8f-oKv!DAx`3@N@~es%ys
z=u1WD<LzZq1NCrmG2KCwgHC`^s-`}D`=MX|0Y-g23_d~X;m~%46>8RFwa*Gag`j2)
z>-+TCR?z8v%E;TRS1)*Zd704f74(!fMbHc8IrHZ^bM|mS>&|V_=!vETUK0Y3pld~I
z+lJAOfRoa@Z)*+oXlWbT5PEh^(Zs1an!7%NM?Igw<GwA?I;<T!_oBiZXlKHHumhDH
zVHW3Rpr@r<ygS?zJ>f=`=Qh?A?&F=Q^c>+i)d}9yxqqq~oTqreezF_7PPRv<sf5}*
zLT|ODe_mIrx-NS<8t8R8*a=+^bSDU%Fy_)&5JfB?ycwzRDm2{eF5^x)3lz5zR&}VS
zBB?UKlfqM4DN}z(%cDk&az5#+j4Gr>qAC#ZWOFU+RD`!^0SUTdKIx_IidYbIldA#4
zI>0<PU-fwc5>*8)td=DNS+VM}G;)CjS}M?LWgk{`+Gd5JRG(m`W3<1rvXa(T1(E8L
z>P~fh0;7)AQlDe3CoG@>%#jk*I9fGkl+$Xe-mR(*HB@3+l6hubFe<H9{T&|)RkWo9
zmaM4MpX%busn#@fO{u2x8F{qGN<Afj*4+8q0JM_8t6-E9Y8eH^e4pj>l)ylBiY1jb
zRGk$jAwfm$Ei*<}3N%xeoPaLoZ!47juc{{SWX}m3{syI_(%9V`Jpc;%8>y<4$f2f=
zaOT5B>&kQ^7?xL-5r~z>{{F=P_&ce}iZz0Q*Gjfjb#>M{K7me+98!39@9}*6S&u*e
zQfjF>6V{<+8qHxDV#d!Q;H<!_sZnYv-ZT8ay6#F8s!)T=1K=^z2t5tMvvgOnwB8DQ
zn&0>2*|Ui0+aEpw>WFM{pZ<ZqET`@W2<(a>L!FQ?%8uo^9UAzn!Y@PzNwui^1pyN#
z;Qa@BQhcE-;7TtQO0^ai7J$%TKZG$td3+e*W`clI7nTq~lk28av-<ckg87+BJs-db
zB=GcEVGlx|t^cK^{-5f~_#SRkcM#1j;X{k;MId?-i0TNePQca-3Q<u}MuiO;G{_A2
zkEKGisyQjQhgF^xgjUE|!KeM!u<q062zI_^6pFi93jUV!Vu8DvJc;4z<%YIhJD~Z~
zkD}odP0;9Rg74|ZX!s0)*Op7FJOjKARC=<u-Dqv?nxLseGc<K-M(8y&D(^}Er_m~;
z4LTBd-3JhMLmXjCm8Yf4NH@5~xuZucp_jlI<wBK5@Qoq(Xl>mmxEZA<=y|3(!)1yS
zocP*)Y7caqYKP9#3B3ihw`*u=x7wiF9)fRwXLLKnIBEmiBaU#|>w@GL#{D<+2)wd;
zjJvdWx2ueTlbtOh>@>8jkn4h;U{`mq2H*elTP#|>5TQ{aXsmSbjR^b34Ujq~6<c<1
zF_~L(X>SS%w+}!55Q~;Bf{VKgnms~w%;S{cLL&t}`|MM~s@$}`Tf3or@x>PgT55f_
zc1%{N<FR#ot@`L`n4!R^rS%wvh9C-p`2<}tL2&TUVKYuu9iKI7M_pvfX#>?7tl^DA
zhYm%v<}L8ZqmT3bjWK@WRIJ~))g=AgfB2Y50&4owDb$azvAU<Ii%V0aOrAzq5t0O>
zl$unV*2@UMfBt$Ei<YfMWK2IxPCdP+Q4>s?w*Wg29_I1NEW^)x&_7W98SgI@rW=i3
zH{nC03bl9sBmlC|uqn?WLw)y}w0HzR|Mojo9wA2%UZ--+C(tMG+*z};C^Q_;x~{oh
z4j(;?`rCE5efJiBJO2mITQenxFz42CoT9P18Qou(6;^%VM<2hB=`*Lo-O~k4nl&`x
z?dxH>zr6bTOH^^1qeEF#cwZ5;3c*jnliF0K9X<TAa@9%N0zAP^aWg~XZEw=r5_nqD
z)?UxJi5#laZF_bjI3g1M!Blub6eJWQ0fFA+l{<OTom_Ba*|%Y|)fAA?gix5VwU)Us
zR6+00!Ywd`8e6M|45>KHk)iAgZf^E)bM6k$9yaiHbAYdhGXlI_5fb1{<raVl4O!D?
z5pV*iWmYvyJ}ZkA)DSJHr2m8fC-r3MOlZk_rO@$a-~EuXQpjcse5zj)jC_229)w<N
zzrKG=OpK|A(*!VronZE`(zAwD)d_SuR#sks)8`3zvb4&cqR+IP8~!fYb}Cv=A5V0#
z>xM_3ehdvBrFx^v6Z9HA!(|&PIa=FB?HL_;6LjXyXzKC=y~b$f)RfR;I5)#19*?3`
zaBFmkrWzaU2*+VP7|sTGT3Vs!64!$Y&xycugliHNU9uP4Q@r6Z!3&<WwVqR`kf%Ds
zmGE=s(mvG@-BNAPWmZ?%EMgFPj(eS8f0!2cs4eV|+rsv^1MH7D6MC*lzLWy$Bw8c_
z?!TZ%1t7Q)daBc_!!1<*S!43$mCMr7;-=@^q?!+6L1tl40a1rkb#86J0z#looIa6C
zhYF91?%ns_F_5zsoCm@8sgQmB^;ZT$>S|;iP4LY(-xw99E=X4BS%;%3tl$&eqy*2L
zIb&X1->-+5vA@MsoP^%tqesmD)c!+<4)U|}af5|QjchtzBQvx#1N$5B8WL{fCr&{r
zL385FIllKkgdV_2@Dc=@V;rF;<);>34Oh#fLes2fzg<s<tEVqNPv7-Eyp%p(uxiv8
zDO0BMT-KR&69#YsQ$sY?*kQBYm=6@q!_mZKQ?9JLjh9|mjyruH?YG3)i_e-Ie2St{
zs#Pk&DP}^UhIstRC%Ckpa|^(%0Nk}_7om6i{%^O0l8suI)+7Z@1w}5aYAW!LAHVy*
zUrz<E>F&KdjAhQJ@)Pu=_N<{Hak93m-VpGtDo>Wy3O(fuG_bQmkINewX;jPwShQlf
znN=__G#q|`;!H4kMQa=ANEg|eqNo!_4DW!%QG{MVa1_EKqv7MPUdz5_T0(!qD_n31
z3MN4O2sw8%FGi=%?eR?OC-GFvW_afDMri+3Q*?T!1>+HPYW*0xb$A-KUE0CEdnfpM
zQ&#$U8{2Ayr&@o530PAX7*22zc7BARZ*U;(AyuGkuKq8fDA-x!e}QM}(u~^Tc4NB<
zRH{COV5iW!fau}j0T<e2g(nrF6^0szsBF6eq7<UmHEveGbaQhvpmTL~HFj42N0!$5
zKlA{wKret#F;CbybjOpeTcYt}P0;W$syafiaVrb-8WMU9X>A*IXiVTSx~hAM8ZUVB
z*z=}wq{?`xdrFs>E^rv+0FQX8H>2uY;55P+&Qy0UBQ5sUHQt%PvxDnsg)3Z>y$L;k
zc#QX_(({4WR4>|FPk7Cw?VaHUhiQ)JK7(MJZwseY_Hf!{j~)m4xyPL0c$_Ly&^yNE
zAwqArE5<%Q4h4i>oq=8{p;toaF$lkEs=pdRkD!yi)wo)%SKn3ljdHyI@q6Y5S9tk*
z;`o_kIC$&;b{*V>$ulQo&E_?<Uo3!Bc^;l_ra-8-f4_cx2)C)2I(0H1K9&i7(rIOG
z-MVE|onAlk$Rqgj%P)=Xw5EU;@H8Y*DWQ}aOrdp+)R0XT{H#zEz_k9t3op>tQdQ9A
z9XdiK$nVg!X%hoIWed4@;hgC-+k^nna)ueHGX;|-P2oM{joo|pp`7;d#F?{vFCo#S
z8OCwn;p3-p<iu(0-hT)Oj-J4aue}LnU@?9BXmbmw`f_NKUwh{RzOM;Y(4%nb;f|xn
zPvhvx)7W$HkV!JIdebK4(rO<$bqc!=95hKbX3W-*GYdc_PSO85X_V%^Lr1APtBv)Z
zw`d8~9aW>5J<S5YDbuD?MO8zS8!57L^D&X%)dKN`*sy6cjvhPC?{LhFakl`<0yH%w
zDS?ObOzPZBo@_bi%8oN*)(jjwaSTU}9l@zHr>OEOuy*~L2M|n78O;aE0W&fFu07kS
z+>>$U{3+xWWmD<N9{)G=r1CT@PPR5DUx^hSs>dVj6c0o1x@<1j(|AlaE%}UjbKp-#
z_y^M+vb>shgkUe0XB!0gx1$Jq5`zb|Mp7c}t#42SeK3`aS|81fXFf)$g$4P;$K90>
z>x8yXJ&sn7H$%Ipnxk9$C*atnCA{t1Ai%L5{2kiBv)fbX(eZIOb$A@kom-+u*H-A>
z<}q|`^$0pVMIg3%0(PC+!QIgwfnJ^nrCn9Mp`~$ys*W~T_E(`VtNK7GLB)UIC)FqQ
zCzYrSCOrOu)%-u<r_eURN~%quQ=Mc06|EnwvcRT#LmL|^pf0q1-MYcS!NI^z>Q0|!
zZLtm_{59rS<M=GWZRVu<^NH<(HXU1|*%Qss_;CX5Ny6?)E~)eyx1!Q(&7i$)Ea-J1
z3<<qP-5R2)eWM4!dz8R?%C99lM|DJxq0aCg<qiKa{_sllqOx<NvLm$bhie?cM{Dao
z+TK9VWi+>sA>2}^!YA@QQ~ap(d}wh!;XQ*2kl=Hi&fhu94vx#5;IYLGUVEt)54ysM
z%Fh0<9qbO<!sZ~Mx8E5(cDp0x(s<h2JY#RA^eXT0#w746Xm15QDZLsgJgPm@h*4*0
z#8BV7qo<CTtjSV%M@}9=c}=-#_^7{EZxo)aZ7HAjzx?|1g97C3?L|9z165U(P`^BN
zQ>m;}#Ws&6;AC&Dic8RwQj^ltYlTKxSXyN*b#Ypn`agk#(9<ywTVYjkD4Vdrr))ji
zrmQXp4(vDod*0kR26oCMa+L2^Yi@aY1wZ$G!$)a39|(7F@W@dEmWG7P5kjc2q#Pxx
z^DC>M`cR8+1*+=r(tfHQuF`}O6YN*oc-i3wO$d%N$jr_|MNJ)w%PUPM=mJ_%bw|-;
z-m<Q$qua;tY0bKP^u$R6I_2=w)RF2^qlOmCP`SqF*{d(&MkVriDMaPFyJnPa0Y6ho
z76i>k8p%QE{qW;YsH(1^>g9f(J56XMXi4#X`q`(ZPF3eb9fpq|Kgx%L<va>>bxKlN
zT}740hhuFuZLMYGg%U}8_T{JC&(AF_q-D+FaphcQo9;2{{H(e`HDs75FL_XxCu^IX
zr<{TRsmHSd?>bf8)wCP5wHAe^kr~P@sF5hs=gcy~$=g4`G^(2Vyikgq!0r@V9a)Ya
z$AJE=5Es`P4gA8Q;7j!-Ldt@M5O%?SENr%&(e0VX(4}QlIJRy9xAraI)u{#ix;%oQ
zZjZvhV>1MHYL1Yuk08SKaYWiZjxd`?5Xge<->ES|Y#v1*j|+Eb1^=#3!n4C;aD1u>
zoLe<VcPd4j_N@ri4sfO|bnoE+Z;u}E^KnO@KP4%ZrYyEnS!v{l$Q>Yc7#4wmP^DW}
z*BE|RLQ%uyOe+o5mtdq0(yE6Ps1*MJqn5JW_k$o62o+|3kR?KcLrqRREklC|STlZ)
zDo?@f0VedFsDgQ7?R(gxRfkq+{!}wGd4fQDydj!V0k?Rj`G5N8wWgIN^cr^&@c7!6
zuL-?ow6x88G{<9}kD*g!C-fNX43Ah3c*fJFKBV+|jHJq=rFBc7s$;l~a)o;m!8fJ{
zJjS|F;d#Jgq9;|JH-qnGc%}Nle@+0rX8RcE+0G>-*Sf-c7x(Sw|2X7{9tUWj4>-b^
z3eRC5V>h9<!<En*hg?EWAgiI;lg+KUU2R%nRi{_UP{PnswbXK{*UFGwR$GSC&z^pu
z^mgsv&G*%sMvBT>D#ZF4(<)nDRgTX-|LlPk71Rm}@=XCNr20~nloT7d)z#HKP;~;=
zS6^w4g)%d;M%xsEp6dCm{qWR)QL6bQXz7$MQK{t{-!J412`R1XJys=XwZ2L{mYbJ{
zsj1VbK2^9I;+t>3Gsy{x2(b!+rj$S{E+LEww}Vt#jm!*ajj3u*(7M$TEQAF2X?C-s
zvP#0KmI_9#qGbdWEpBl+Rp3d&f!~Mk-MM!^@~H-ksRGNYYPr9RU@Wrq!{h(VBG}YF
z@DQPDv93*U{Nx#<?8>TY2(dycq8z;a?)&^6`W=n1W$SkA;pgi6YYyMH-+jlZCMnyT
z{5(Q$vZ=?^Z&ni4ni|4M4MYqL$Fx2mlzc&5hE(S!I31&^GteX0RPU!o5RIDAvX<9Y
z`S-N1annY){K-eAo=#CjpeZVO-PKh^-BgUyBly%Q*)p%KWzwJj)aTvrrjnI~o0*ve
zUIwm7=?Q@9pqz2d(%6xmj~i*(m^O2!DU+rVgJm=*#2@Zn-4N5)5m7N6;O)~4A)(C)
z^$uvD32OX9E$S(RAn~K+aJTOYr;bm<fv|IF(;VI%n!~SsWB9jkfKXbraDpwMO#=kA
zYk=@hjnK1uGxV{4gwX;~T^k~}Ljw~5+#kwggF7+=%(hJt*x?a)@-tmKJV|BP5)SRC
z0;vjZ+dU3DzIN=~3QpbHn2E%D*mr@elMQ^Rru=-pC{syqLN0<J3?iJ>SR#vQ02wM}
zW>I<;g=PVtASl%qWa{B$Q!OC~s_xLAc2~!#u_in$lE9<lCFm3aUT~09r3HEqfX8F~
z_+DRr2RlNqWxJ=)RP}fSUgIYkq50Fz4EP$iB;cN6A=5|<La$L5zP4f5H#UtIEnFT!
z3)e^Sg!hx^5Z(bceQe<{Kts)(;TlVY7wc-EXB1w%i-Db6BEdJxh0t?_#~3$wjwAFY
z@S06zOyc@vcPc&g)$@hV3_rL{_kaVT=eWojo?B>p_tFaQ<#GF|_VyBbdz|2~*9mqz
z9pSix|989W?O5axds1<Nou1f=>RZ(hfLHTZ9jfke+n=?j@0~gy*ART~f6BtAf$D@_
z!)A@Je#-{DfBAjd74-unBpGTc{O-GNO~FzKYJ!ytos^x{1thi1YMWHuqmR<I@z{?)
z{@4J{s`NCb);d>)j#mq<rHc!d7Qt}v;9-JJg}4z?Q>Wk0hu6@I+1BxA&(fmub7XsE
z<<y!xN!d;A1LV`szCblqnELEV;R$|%UU@||ZQ&^%(*(_1JdOhgkD3<VBC0*jmm&34
z!1XGPwXL~Lu$8k=mzd=nvbTbzfmmbg*s~vRy!`=wznVr>S4Fi~LVH_|biyxJ;H<rc
zy+@8&>h=iOBgamf|5L#4A<J6C<)$r~4Nh~2Jc8F=djl`O{3==~Jw1Qx@>MGd)l&X%
zjW8)N2|cXm{+C~VG3VTRuB}xGO5pm{S86okHRJVCmy+Wrj+&|nf$jLoW9H$dsJ*43
zY<bwbe~$s4_Ivqwn%Ang)y!&21gaG8bp)hTqrj&T8tMXLGOFB%o<;2uda}1l`^5)Y
zT2?lpmr39We2O$e?<$W|hh^2lQt1(T8vm}LXBsr%AK*!6=!D3aZWJevA~L!)A|ju}
zprNhQkih~@wG%E7_<O_4sXLrIwSqGt<kk5}1ax~6ft??LZ`%g&X-$xEDWHWCc9C71
zBF0v*d(1#D#-=GEyAXPWV|bSa2oxlx`Ut)7u8$&wpA*i{3+T`SKJA*qr+qW{b|g@_
zJ)qkYCj7fU3BT?w;o0>GxOILUuALbC9xk%YU7ms~t+TsrJ9xR+!pq$e{=V)6l&@(;
z4k!5ZJ3^`4EVh)&PvBE<o7Ey)$H-cSjzt(bZ0Z?#yZ{$MHE9wLJ`|RE0)2)heAS{H
z=m%S8J3QX{30hmijL>V^l7)cKYy2c_&l9{s)Hzv=8B%(*wi<QO*n!9N(D>R%X>A`z
zOW&u^Hnbf&M07x>=uW0@o-=K((=cZ^kC4hE@a`)-LC@u3=#6(b&~qJ6FixQQBlz42
zKA-7?-Yh?O%=U)!9D;MXE4;Q5di$v2_we<e9_Ybv+Urd4QRVG&;&xi!-JVGKe}$e^
z<y9KsX&i0s-}R`uS8JeG%Vm0AI(&kCX>Vyq?<+nNEC@<YDfpGMAbs`Kmk$({l%61`
zLZ_vmrIuGIklNbX2L(~btMyjfHMCIMWP7cNK_AYJ{JCZiBdCrZJ<it_fGJb<xN(|z
zx{1l!^3Q+%)3n%HKSv2>OKByo&};F?WBBZguZ)_K$`kAazN+e49#@HD$4^s9X(ly3
z@ETLuH+$5CQE{}d2aX&kYzShiyzD$GDXy1R)Z*&R4D|5uLZb&c_M}2Bb$qktF2WB#
z|7Pm{vZNYo>#3mYvHQSbG}gr5f*w_63ID%FizxC6iV+>t+kEH71ml$}*Kqafb@N)#
zi|*CSs65qwsiU%#UgP^I<v;l7L;jv>qi|(!HFc@3mEs>iesB8n=|QDByJLi&b>Hz*
z$B{)q)l$i;QA+Up^ovg)?6-y>=;q<dhmYnPQQpFQUQ-Q8)6g}wpj)74o$c&FLxusL
z1$s9#vkdI6rl*?_^loNeH>xl_BNx}NXJH!ep}^okQ-;-G8W`k-ut--#^y~^x-{!RZ
z9S|Ay6#Djm3Jpw`5h|7-Z#VcjcZWMwmTLzBryZ?ox0VR$_BbKc1U}CYdaWA3kKhZW
zg7bd50sN`<!m0dv+B7HZ8Y7t7LkPM^Dw^o-O%41)Iy5xL1+&lxb!dt(!Y_!Qr_V8J
zk;jB}ZbAh~71*&ce7Mb%|I4pqQ`+dq5WwR-+BJbkd%lOS{kuO7kIs)0fGyFj-DBw5
z_Azv8|0JU&oNYS7$IS`eZav`d?TJ7?U!z`)wI!&G1s1gQMBwlCr)pHIwXf;`3B%xU
z!Y(wN)>)9V#(mi7J+#HD%L^u4^<M7)Uv#zah89mff<})uM3X0};0V2@wDAod<3(3I
z$-+wLHST1o&okXrsPZ)OqJ`Tdc+}%DwDfx#Z9>|jb!Z#34R4Dsy}H0bEw)1);WXUo
zfweWzyAM7$L66qfYXYt6M0Wy@ARAAmHjzLi^t@;Iz;~`cychVwWq~UkmpY=y27b;i
zzIUGsoC!I%{RHG;PZJ&oJ>h>e0D*@BG5*qci@m*9ZJ<|sKeTQ-Am4{x&7XBBsiEz=
zRf*gDQ(wILzv@lesLeaJz|qwSO&%eL9%|9+-M80&DLsKqN>55dedz={fzR46Pbx&J
zPD;!QJ#ABR3BBJ6y#H;<{rF?0l`5rur9!$7gTsf9n(0EVb3iBuqb#i&Nc39WTcq@+
zQV|*$@qJ(Nc&nwA$}6v^Vgc3I(@JOpc)`xJ*b-VM1hD&Y;>>xJRo2ncR~g`GDYd7*
zen0<y1?x6%feYa$kXeH7Yt$UQ2Mj?L)t>t3mDk@!X-zFH@o`I?9lz7jV<#*&njlu4
z-Z%gF(Y(JIAsQYLMSE`<Wzwf_e*<pS>wWW&f0*HDipi<U(`Qj+o3?Jjo_%{w&b>uT
z{*y8Ju_qog7E?n31w6H`9yw-d^wC4?;L(G;A8Jj4i1L~W11r_begDIEn73f=g9eR<
zW5b4ZsIMm^i`56R=z+EU-^si!N>9McG99axKk$E{m!6$wpqG}OgKO6_sPtx-vT49a
z0pu6pN_*Q8L1FFTAN(YI{2pbweF6gpwMByns-$2q7c+N-YsZ$fvn}A(rWxD_wLn@@
zE1(3rFc#bpg2|%Z2r@=wx271-qb2$}K8A?SjS$|kA^K4Lg%NfEgjXOT=1W)#e2UO6
z%?Q9ow84!L(TT6A8pHTGS_V=@3WQRVeq4rhX@Ssggk)#c9nv~GwnVfOL2UOpyt_OC
zcYZ$~o0bT4Yy-dlhqb?ex3XHh{_)-2-QC^Y9c;S0yVDlC3oIlR!9qo_JC3bKJ)W`=
z#ZD9fYy8HTYfs$raL)6-@BhF2T;4VBSZlMsYm6D!ObbUzY6JlFKnlN=U?>v9d=MM#
zjg;t6WT(cXfEKi*Aji}f)>S*Y)F{W6e1A39Yw8I$!mxsHlYK2`RM5f-gxV)%s1MXJ
zcv(R>5P$`=+|h|KaPxCF(37PV@SOdra(wuQ7WAm}Tq9iJMzFgnu|S$TJhB`eku~zd
zo6rj=4Mb>7xPe||LnIP96A8U^LXTE=IAJxCFr&TAS6v=~CzY2!As2;H@=>I^yqN{a
zn?VIgrDvcwhZc815sH@3$}TTJ*2-+8ot%cub25-~8Ns)Xz+BHCx$BBizOkH9fr=X{
zQNN`D^&9Fi=g!&SKSc-hXju;(b%cT5&-@aYIEbHqKWu=f_>Q3=XHt7V{QM)n{qZ~e
zc=QlnfAe)*bm>J@F;u(#NkjkfuYchA=bz&jm7fnoT~Y)(*;duZ34V&7fBxCPPk^&3
z?&X(XHv6plQz$>8rUOvFI%Ut+^6j_ZF<*Q^PxH|V7E*dre=RMo29|2sSAto49RW}k
z{&VeovpnbVC!R9OP92#AKiS+L4;{vL-~ULvr|vNPax)e#T8eA0--Ii#z7Csj+-_9f
zQ_sFYW&0UE{o-4r_GE28`BIG>Uz<jVufF>pKmYbS-g)l>TzdIc$F3z{zVhZj4Ddcv
zJtCpEahtlwI27x31fbNN6r&nHZrX7x6_rB;D%t7R+;@QBJN2|POnEm?FI-MV`yEyD
zN1uLz=~Q|Gp{=*nx&@P_<<xTR*=JGVX{g$_=0{4<yY~8Pj&(`dv~`miK_lpWN_c%w
zMW_~7flKO80DSP_2eA62RgQBA${dOjsXWs~g%($#VQ0#yq7fJl=&2Dyb$R=!=G0B)
z1Jl3o58!c|Hd(m<1-*CQqY8Zcebbs;UF!gzNf=tLq{yNC4oA}BMiYP@{GJD)Z%90x
zstPhtnh}rU#8A`NP!!=owdIDwpni2m*p(A{4RIc5N%BDhp(SM|i(4wF@pxOR7rF?&
zCPJ^4+f9UGGZmm&=dzAv1;kpyttrWa*AN^DI>OBa%PFb_x=3dpbEjgIa&%bZCf>Ix
z$(z9BK7z6)!3AyE0jN*$LNzUR1yy1>pRZgEEVReP@%|``=Q24IMakjFjtfRQfu0^4
zjEuNYq{oG!ASW4x`ROPv%|%5;5h|+6_+eENa<sy-!1Z;gpfWUBQ~DKNRdpGvSx!qw
zhva;O#e|!#Dr&?K^o{{f;YTGC?8u$x9`6pfWOYV%Boy$Z%JVMrgHN#^{L2VEsl0|L
zM7P8svC9Fy%%K^`8j(rp(Rz-S+RH%!p;tiQ<xk>rDwW<0Dm_9^wl;4j_Y-*ea|yl0
zMaWy6kL=|RYg=$0E$@}ZD7lt#eK9IGRiJ7!-_sVpx6O<js?of)87-TdvFM(KMBZnn
z51t@)h!Fcx&?5jf=f(GoZ`EIq+p@cdsQA>tp|S$scRy)F#<v9JA!B_GAN}drTnvi8
zJolGB)#(XfQgwo!EUr|a8ZcBpC-_-aC#5Io36RQ|V?n4EUG0+s)H$jXRGps2)5_jD
z>hW9%wM{(sE%xrc_t?GFQg7D<J+)*`)7%p*uUvXhJ@YI9_k&S*3V}~h`HV{Iy8b#n
z#f{r{^2>h2C@+CXVH+|&r&SgBR9~l-Tr;;V73G(FZ?et;s_F=5&Zffb@8taWv(HgE
zAEd>l4J7oo-n@hR9d`8^D%LOf9N+zT*kpm3IzwGh_+zWSetd2lo?bq*-@2ZQY0dpy
z^``v6n4v}>DPvua;TIUlr2~Af9_~2fthM;zr$e-?{6RbTJrydSkMO&8!*x8*QEuIJ
z>&-|S{kSPJj_UN38An411;DSr)A(J-urm91m-_ok$9XNOJqtalJPkb)@HF|60iJEK
z73kh0@E8VoRDEVxUB4oHhv0wvo%gVC@d83kVx)vFEDyzH$tWm^q{<6NZDRyVD*RE~
z5Q^4zby-P|HNY#03qWbKH_9SuGb3G4LC`f4CN+eS6k9{A>gzlSE;j?c8Ywe@iuSZ6
z$qij;-snm9!Ju?M49N;aUxq)rXnT87z0ppq+mY;!Rw|v&bWgM;yK<i^+7eyRnc{&i
zLbEN=1N8)64Z&AO;MLIzS8=_TO0$j1u04%PlCZ3c6F8mGl1SJl@!kneXlAtVdQF5%
zLn0&D3w4BKeX2idlKoH~?}M@gUsR?9qB<=E6{%E!vd0nLh*4{DOfU*^Q&C-6LKRnu
zvdVG?>}>QyPro7qm0nhU4uT^?jM8)Sr+p>(4D6IGSSrt-b}Q5ko-wque1C2!j<iu8
zIUaD&^EANoE%Aq6SpdT7A`sIWi@1(>Bz7kuwJ#OvL-}h(=;e%((#xXK%R%AzJQPmM
zqqQZ-rg5FtwrCdB-fY5e4r5*c3Knp=v<UetxPD3@O3o`m)m4?KA=s(~xh<8b-A22=
zg|W2;wOeXYzquZ*TU*e+sTIrbTSnx4VHzubB+vvk13sy^Uk^IKcl02B=6@!G3PJY^
z|5NMjho63E8a$-(zB}~2X|dHmezoj*dV82Uo)=$!(ac{V(8!jmKc&*T3s^^v95G7)
z@aWN_27tEFLXfj9wNi%`R@$d}K!H!t6!bKl>>vSl_0{T<LV&rr8GHLJp`=z_byBv_
zRAY#iQiy`y>NO`D;ORVBNu!)-X$3zEKq<ZT*U&n<c);1s6C1Z|GcC49e#J59?I+x%
z?9?!!p=(-KXd>`0zg6~Qe#SrjWYnV|Dz#`J%Im%I#+&$zWt2in9hEn1p{g`sa>2FN
zU2j@%HS|n%dafRXx;jKt>ACf*TB}qQQhMsXqLCQ~j~vFlC5w-Z+Iado+IK3>wdbB|
zy2L1Q^7GAbuP?v-hQ}N!sg$JPx*Pa@`vG?C#_RF@PX`V71U<!m!b?APYN6MUogL4s
z@>2R*3VPZnC8)ldrb99TrfELavaHFEjtx08;BzQFg>0_s_f*ekCrJ|U-g@U9ELtMy
zN#zyt`<sQFf=Hxg`y#i{m+KzfAA<7AK#G+hLXS$QC^i6v5uPZZ4K1RrEoIbDfwd%f
z7zI{F)hGDXQTbI9h|Nh}gqPHvCpyzS40IX<KdQIh6i*Dz@WY6l0IqwWn=oqS{&tp8
z^zuvJl}ZatXmt`^-2_!{swX-U+|bB-)bqa0YWU!?fe>saVB2V++XyoOu8~Twk=N_w
z`F&}+pJTmK0F383JZ6AMz_jxIO*~hv%?(MOXeKzDQhiaK=uP+qp*kx96&bR`K}Zbu
zK{DZ2n44jS)2ZH0U^C6&{o$z7bA-CgWaMTOdZBRlC(r_@@@R3*^oN2TRh}}exKlNI
z#!}hw2S5ozHTuFM*AuQe?(iz|g-?l}fnG$T!`2FTNxex(?n^=1P+HOvw4_vdx$2vz
zdOX?MNrYa1t8GC)^or*ep?DrGEp2VVq5@<uqRm}hfYNhHQGIz8YOf{mHW1uYc-32J
z=>@#a4&XIzYc#;?y0H`8n>(=bz7?SPz>l=Ahkj%H`jhErcc>qF3RzmIyTdFa1*kE#
z>eBMmb5A0#I1e-C&Ljvnnz-Vs%Z<IU_Ey~-)CW(aTr9XcJ3Eghch!g)?Q?WJ`O56K
zb#>1^`|PoOwjo6Go~S-f-A@EAbx^i-f11zOe)+A^`;K4o-Me?2bJc)hms_@Mp_2Os
zx81(Wly@d56{_R2kzY`R%dWVJU)+PHQ?aeTTej*XY`b|UZXo<@C*`~L+-LgcX?(7_
zsz~W+x%a_GkY8Lvn>ia-u3wK0e6EWwzKr+e{Y)L9W0ZxI-j{^%H-~=0mDi{~&Cvzr
zy6ZO>MX56P@>=E@cy2>etLa9g`FR!B5!fwl9X#$(bh~!nW71E}Uf4gOkgFS>eeore
zdj7)$pJ2#{k^TE!`F>Uz&?!5Ml8_33*IMYgIOF=wwAcjK*9X5f!@$1!{u^9;#igik
zX~e2i))-ZI-3`}c$+9JUPmU2B<0gzVcKM61H2Om0nLndKR2ng<Hr44lp!bKpb%bDN
zp{JHyB~j2cm>*E_33zY4^Ok{Lb)5!S<e{`8jj*GLATP4$s4{cOmX<g|F#>I!NpQ-e
zr7ei`qOEmDX}GIVd4gMgv<q5kWgDpIS_q9cg03ae3msHXs(<Ur^v0kp0xr)NoyoMD
zgi$N^H&PW2<(GSOmJh=l!)fOR6HFs%>H4(H^}yHyZwyIuMjJudK}d~axk*K#=;AfB
zt!N?i>ImN!f^Z1c;qWXk^bl+UXbbPt&GlhfZWx~Jj-F(KlKTY8COt2&)0r$_5`x@r
zCJbAsDy0UcO4aaEM_a7^g)P}(Xf23EWqJg%Vgif>&dW$dy*fx&QPI)zIyy!7x9~Em
zYU?PW3BBwrLNAO;kH4Be{PIZMN$F`eFxp&~U{|>FbM=U&(xa_+=MRJ%q36cWz>}YW
zM}Zf-i+vGT6@<v<Xv7nGNxjKPG|)>%>d+Kqj7UfJXsW$2S;!lgZB$<26o<kqm`UKx
zCg>D%icE*({CWAvnV*aF1sO)=RbIsBzk+c!mEOi`)KS^hY^^rHtKU+Ox=nRx+0u-T
zt?d}Jts6tO^<vdt2lRgU^)Qtlp-12edO!Ul)kn+A5a1m3cR$d6ePtRjw0-${et8Kv
z*ZyC2h3&?mUwc(@s}r)6*`7Un3~+1+VishI=bwMxD7}HI?u8fBcg)d^Mcabaz>dcQ
zRh!h_h7FF^TPZ!wTPx_Pqp^Uh5jVCWL{L?|;5SlNw45VHjWNJ8O3A7xhV0_>nX^sn
z>GhkWz8ugqA$zH1zeSxje-V|>VFNxXJy}~VFTP@(0gh2_TI%`wJ1a{yu2^%5ft^xE
zeahz%{I1_D!0H*Cal^*Vgw&@5%?~(j?b(idGS*#v9Tk&OJgGn2muAS>)|<9F&SRW^
z@g*h$%fjW$xvjsCE1r7xc>_?5&3*07zau-p;MlWmzvUK!?;tJoXC@)(l~=FlF<y(%
zyJmw%f_y_@erDjf|C5g~Vd`Y_w>N)xJGPf)-DsJ$zJ2G-{M^(LSqjeqJuR(jQwV+z
zi|gp0XMrb+t92z%(2zRKhW0Mq!P`nc@b<sVw2x}Bt*p-FcQk|Gi=!}!MP5+=QZk$g
zK`*+tFtoH2diexiNepdmv?t0VTuuF5ITc?Gm(^SfBArx8-6=j)Xg&sjs)uXj{x-s-
zhaeM-hETl?Pjkgks;pt$9!o_xCfx<&38o?3H$0X5a=4x6iW%kpm`vF@g2xAu#v?P_
zF}WxRW3&BCj7;}NBk$e8dkjr=CpbOON3abgC`V+vVJOd&!W+W%Vc8xSn&m-T?P}Cx
zD?z7a6X7WJSugdd&^kjPG*&x75Ok<OHIg)BgrY7h3dJd5$W91FVP-O_%8DF)0;{Xd
zcs?a&sH$@$XecSGKw4&|fu6fRzZBm52P}mr;1PUEE2R+`ZiJpYq31@$X`q*7q37j*
zUWt!^UPMzAp_gD(USe+|QU<3WbyzAgM$+OEdf8(LzHtQKglyzZ&OzSvTt^+=tbB&*
z^zt3)q-N(Jb8Z$g7G)y;)B;poOgp}g>hv0_JKEct%~X7ws!+GN4ozE|(734)?OWT>
zOW+N=X)s3KI25Pee-h2@SA^RUg6s&Pclb}xQ<%g91fO6h@M-(|!{6i8zrBL1uDuGg
z=FT#9*8Ia~ELpagDr7y?*B4O7V<|n=;|Xl1o_eYoB52{Ip@LF!Qgwo(t<zHodTPi}
zou03+@3H+lS79xyp!Wq8%r-Ru^aD%vdj}8xNQ?WOvB2t=XU|tiEgt=uHl5Z~A-Iei
zKhd0P>c+SnF>*A{J^w<}M^1{)+n;vH0?+ila{x%-n?7egt?Z{}REFRuYpZD?-`jt{
z3?rL8UkxUDHob2*%wM?FG=wNE{bvM`IwpTkunB;dt-G4{a5b&5S8-jSQr%!nN4L4J
z3oY);e|>`>QJQ<33-BugG7G(el2RP`{dY`KXKw+W&#9JQHL84}RQ$BkckS8R4?P!x
zq6ja)_PWW=BJge9ev<=gjH{{SWP7Co)%Q<IaPI^6<D}D0#e^w>pMdUyyut$0TD*Pd
z4tztatWg}FeZikpsYk}=yoMB>g`OkSn4#8NrN!EBU}sgH6sfYtyuWY1f!;gsy-lS@
z=q+1h`sS5YWTLb@iNYiaxdlN;NO49QzjMVEfhegAGOf8z#ROhO94%j*4{B(`>SEk!
zA>GidmRf?RiwdPTg$gg;1)T|0N(4~{Vb_;NTS^f15^$q=(MkDUm`3$7v&aWivRyDa
z(+OjEY-+X>#?gk3X4%QPRCR>alma(QDe}X(TptW(nc=+F5URyNf+JPk*j#_K6HrZ5
zjXi|o2&%lnsjgItgdWx9uuLkyTwf|pFSJw9b@BXR`TiK3Lojo_FWbkV?s%>&ZxfZD
zZ78YZbz9<{9cnO%R-F(u4J@gCXvz#hRa%H?3C<z-%JVa+_{vSUmvW5)A?zw@n^0C=
zkHV5NB&VezFg(NoJpoUH>}79hZ8ZsyTPQ&nNrl%BJ-2@7x##smuh8=t^uik>5Z4)p
z#GV8P^oINaz05Hgju@MP?D1L1o|uKKshP-{P8iO}M$Ysc<j%-r<n}|4$Cl9!pPG--
z3rkRWWjU1|mFsnmdcC?$wWz&;(A(UMrj3p0+|q%*t-S=^FpRlz1Wwy~GIgF>Vt*#^
zjxvt$%i0e<LGPD?rk+nNwgR33-;sl+AD<dLOef|ac<j(Y(<5K4d8*4%$5cleA(o|r
z5WLi2;i!M(Ip2L}K&W~>+j^^dI+f8FTw8CaGU~sl^Hfd%)6xz>ldaXzvG1vn)NMtn
z;D7jmz&>=C$CS>9kmj{?O+AC|t?TI;vbU<UlTuPEt>C9VdJ1(}IrP&Jep$cemz|37
z;34e$_!HBh@Z*tROg)}*?kPU!`d8ooXkaIWCyT52?$A+wR=+U#*%GjVmsI1YpMQ-{
zloFm6`0Il|nc37n=lP!!fCqT517Ce@8ZD&Wl;2M+z$&Mi(KK{Tb$lOxs^d!Qu8hau
z^7B7pV5az-p*b{^W%<xA1UupX5wBzN@To5#x3#WRRl43+{7jUM`7?e#I`2~gN}-I*
zUs2f|`0{hAMn`L~EU=Q33W5qHE7el_zt+$~LQ#z(AG54f+}8x;w^XrT>)8m}j|j4T
znw5_60m1hEJ|!EV+Utj&>hTm(d8(Hb^xl^8d+%L>?`^8Re_@$Xdc_1^CW=erDN<tT
z&Js{s8A<i(i`*h_lvV|!rXiNli*!eMv<E5~H3XhoY1@*#2?Q@w$JWCy>);fEFTokZ
z_@x~|yE=-{8An??gdpo81jl4JVPY1cM$j#(^uvm3Kb%}2gvBN9SW)GTQyPP?tkMfh
zD!ee6WoHz)Vs1qUrc&MY5<Wu+KB>K#72%j(7D`3vjXv&cA*2RzIX2&qcGwMr2)N<A
z_Yf+-vBe=6n&V3gPIbo^n&)T0*URIBXqQ_F&^Cg!J;j|%cS6wBv@WYPxjBy3ny~9m
zayI8S^S!ka>~)Ers7@j<^J7p>yIYo>fXbq5)K)3omb$Ri6L<~BpqH4E0{_q;xcc%}
zUa(U}5iVV*@-+6=J&d+C$`c-Oo^a*os6KkG{m}Es7x)N0{xJe70}$2_j@XXae&|v0
z4NgJ|?QQCaRHThcL;4uTSORc-CNd^wAZ;@DPfbJSG%CP;=o!T~uK>A=sQ6aoBKOo>
z6rIOkzspNev927I*HobHhFa4~+qj9Ac3USp1iTv^z#D(_XiVHb25av-17G8NQ~z_Q
zAACO%e1{3W!?du1-av)-13@PhXu7og^c?~41HL`@t!cGYsGpZ$=TLd-d`yt=AHLuw
zKnXY&f&!7?r%)$k?GvC>&O%QalWm%Kfl8rc+P3=yyzi-!1UZG)f8>{2OV#PwYwA80
ztOlMurV$wGrt$;J3w%e8{&H-ez^4%qwy{FcI{4F3{QNuc)6c&V_&*zX$a)HTvbL(r
zldV-K@qm($9z05*Q*jDB2R{D_-+uoSt?B24Bti9^GAjRIW<S%23}t5di1$?;-sgNT
zpM3VEQF&i}=NPso@aY<YutsXAzVSQd^<z2J*D1H&*ZeF6GUeM-W@U}(Rc4k?SYFCf
zQ%efeN{cT5YO*1dn~$HT;HUEiKTSp?=zadJGF=luw5S$x>H{dnC(sFa+GnAt5g}4~
zQg?!%=IH&3_xqIZNl6eilujW9_nsgpWyj^fbwN)`Z$IO$ciwYA?}N7py?<i)ilru%
zRB=ffa`U6m)S8LLmNeAVC!)AK3^^1S*##abua1CIX`~B*=WhDr)kLfBoEi)~P)Fc2
zFq#>C1jy)gH%!j+#;g*5T2@z#lGT*uBs8St##6yfBlJ#f2*BBGAvm`^1ZOt*;j~&W
zoX7Q3>wIu>tq+!#xL|Gpqcjlnt0FOqws;W1H<WOjQx%5Ul_ACo59c*U(2Gwhq~*=^
zMmHffB*g`jih?mNKL8`My)Zh@2g7M?htS#%&+)_vmK&Vujy?jkld4cEP>mV}YRMkx
zO!q~5syE-C8@g$m+j*>>&{p@EHiEr1-3JY6KB&(OLQ_Eu>hcp%TbzM<{({uksY7!u
zfmes}ng*0sR2ik`7ZL~;AKC#yPfCyWR+9m_hY)Px1YWc!+y%WPH)!~oEA75puA2d#
z1A1NvsPIQ<T?nFEqY>8?hxp!jB=!+}Lz9p^JQ*n?laZ_#O$d%nMe2l9Bu`F3%9IqO
zPt7pU%bTgz+aeS#EJnf7LIb`0(;Z5$dL0!W?QQLKHE7z@gpQj!(S7qE483(2hTk#_
z<8B*+skcwW+&gDr-maOr<nfEB^9~YlzZ>xV^y@D;LilN3g~NoKz-JYnT6L|$QwVxe
zdMc~FU24HQN?@vQmO7>?luSU-6J!*Ep1>pU2}ar$5C^J0?GvB`MLXooju{oaRMsj!
zT}$0nRDUPPeIqy#L|O`bKm72cDX(kk8d};`-@`|a8nCIAR!c!o>dMh#%lAhhd`|$X
zW3bfTw?F)3p!Wk|r<PmIX?uw9`-c0a_@wlt@C3Xsxl}0AiXiqCuVbt%p{PE5Uw`)#
zw|^v5)IX193CGU~ul)qmex9QQ22zEe(<aL@>zcC6ADfQKKbW-k8vgbL%X~yAe$QtY
zl+BVbS5{>UJ|!dFci?lQLO&-|wJlH!dP+v>P^wClpt*VZIrE(TRMO^LLQT;7OxaPs
z`^KogeN=E3da}G$^?gj0r}{wKNbxDls0+;J{F}*6OSSF$_yFFQ!s`z~?w$9QXh2<8
z93dFmOfiDvd+&4q`|q;cKdJiufu&2Apr|Mx**S4YNee`IRXigWl{NehHzuN~Jr2!X
z@u+W2fRo0@sxz=U4p*tJF3yd>@HX8|#ubK8b@^a2<>AaCU(BW|Sy2~?CA4Fs2)yaE
zvvW(hpLTU-juRFYIpMT=Z=6x@fir8}aCU<y&Zu$4>D8`SSxg{Sx#O(H0IVqU#JoZ;
z%qjGxI&{Xwbcg*NLvT)|1)fEPIJ4Lt6M4-^xy~3zbvT&N8$}pSFABhn;s8^3IDzn(
zRUSaa=#4S7)MIj7F}cJK(+JLS#r_z{=kB3Gl<HG=nT}LXs#p&LxmH?RMFX#^kt7YY
z!reLk=*$g3O^PQPb3)Nko`TNWJTz7npsGSCrU<?|THG2XAE-q}b~b_{LjQoC>hol6
z2|cUy+y%Y(0np3lv3}@z@t4P&zd(MaJ_xB%y<P-jI${vl6NiMsRD44i!xE6dh#$@v
z!R=8=NF0}hq$z~qv=n4ar{yK|^3`o+PO<5foWF!>Z)F}z&MraSRkdi@*kt<OwQgyl
zy6ZsC&AsTmWhh47ITqt~jl+!HQ?YRGT&#L<8P+_w9M`{i9lpaMf=*xJ-}z<!jqAS<
zUPl=Ywf9qhR~8F9DLpNv_KyB`#B^1W!aH=BfTYdU_)RH2_0bxr@~pMA@R8!Os!!_z
zm|&+e0+qV6XuqtiwslO(Pr%Z)&K39$9n$<6QhsVQai}~^_oz@Z0()(hw|xWk4Bt@|
z32y3-Crc}PYd|+3z7_y^Ju_htLj(T>KPkO$`?DE4vZGM3(#{HcN-w4Ok|6w;@KRl!
z>3B@2z5C$-!bNIOkdYcB_%xp`Ro}-1;zv@8gpwu?-%n6|Mc92zJ1kf#r1&(-L(lLf
zA*%XD!A^EqYE*#!`13FDE#LFLPiV(~I&7X(_SjnP{j|mcup=BT!+IYoClDUsXDeVT
zG{vN#C)g>zR0rmRf2cbtzfZpS+=O5#wI`(~g(vXI?y8PZeHAVAtc6urb*Cvf74Ltb
zS<%#i`BR4~H2Vm~_ugT-e=`1Mpr`33GP7clo*9BT3Y(NPFJ$HfBDXLQRgK{oG$IpS
zgR|kJQ4nRJPACrIGQ^>LI%)N~X$dD#RZT7oz~o#{M?D?kG=nyFNtqAE5>hkry|An*
z1j{S^v6A+6O_du~l{(?1GAEqd;EMAaU2#=U5H4!*!D$svSX<+POFP4HX;%~h=!f}K
zhO-E|m8EVtz0McQdG4w8o;bfd6l-eTF~5N7j}Tl~=!ALs&X|?$gxNVRn3dy<3B3O_
zKFiWdZ!E3$#q>M^GvA5o(GLr2gE6}*5YsCHF|NP|eN?4gRDInkZd7*!9p7Ue)nx<M
zTWQ%F5}nQHl8!7d4B~rk%JfDp6<>R49C{n_&{kJ~x~f9dsGpwcu<U5f%_}TKWPFUH
zPEXMD>9@8)w6TnV&~xMGX`$yzI16|V=y|}m#2Z0XvbbT0YKuZlR}5l%V-Y(j7BPJ>
zh#3-#Xh!TX!f-U9H=fX&O6W~bMaGOw<j%@N!ECkOI-sXc$%Q8sq57g~wBOK%PJ&L2
z6#`!G_CaP|+ab3M$GE#EV(Ol$Sn%LHob=dAocrvVxbXS2ar;}hO6TEcs+Oa_!9edQ
z72nUl{%oLkl(0MU+hIrREg>hhclcL_>Qk=0gNGCcS^lW0&(olFr3JUW@qheL>Ps1O
zWplqh1}Z^Iee(n#Z3~RQ^1f1k*1lSGCkP6f*3t@kTB@ARRST`PwHAI-eyZ1#(o=a|
z-`?BWXW3gRJi*O&MpkHB^?b(WG6Xu=N!90RU8+d1aKP?cV;!ZSq{hBhy&l2zHOqd%
za}E%O`>4*O-jrhg+aIYe3B1n<yHCF2w&ug7>ieAgHUI5L1e>Ov6d={v`5Rhcflk+!
zk~Hw*_NVF=L!g?8j0iqGoB9q4dPc?ZGZDPL;Cqx3)iq3V0|HdaQC3%~Q3_S>L(mia
z^nT1bt+K4H=GOg$KVV<+Z>puWxl(@L{2<s0YL3PYsXh%UlM4Kl5ESsVRN8u3Z`)nQ
zHdqLHQhV<abk_1Z<{v&_TGa(T&0X@I`tQB_76JKB9{)EMFIt4mjC7=>MI$#i3T5SS
zNT$e%Pjp9Co<EAK{88T)iNPZa;M7V5(j4z%`o5`kvqg1X{DO8<9Suu$rIqy{@Z7MJ
z5Ls64jp?+V3u#XUoe2ciq7nkFF%%beN8;QTe*?Vpn>=uKtrJczal&;&LvU4hAkL_8
z#R`JyG%l}T*)>JZSWTFnRP2nC3A?p5?l`&J3FkC=Pz}1{^jc?})8dT_JA!atYarIr
zDxXqC$QIHX6P9xbz?odGuJXj0Eg@J|Mo0>dMXs1?6e1x=>o}Pza3t-xT6~B1H-hvM
zbgJi514c`#2ih~dP*0`Tp6P+X#X;!E@kJd!uqLX#uF7=uHkP2hp~56JP!|@Hk*2l|
z#U&+(Pf3Jlptl2hzJwlCo=dPBoK1EWw|;v|_)%e+I=xKV+Z;D|3wQ<Y4(L(osa`LH
z(2JnOjp~d=WJeSt`Qs4T!|07d^k9N;7;Wzu+T4k0NS&I7^cm^Mo|T2X*|{j1OKUs7
z7)6UpkiWDDrKeY*@rq`2Z|O$YRzh!6E0<jux_uZ%-ZC1aZy$&0_szuo2N&R^C)ePD
z7tX`Ae_M}@|GW;5ef9`wFhSNC=pFs-H~jqDuY}#t24p|~dW3+Z(v#{_mlm$e(jGcW
z1$gw3QFc;zN-UsUYd`;dlwc9Cr0N7Thl&$$1U@YVEHzw6wGC8n0+B5<a9c~O6zzV&
zOY2&y-cH~X;07+$h~d~K1gJIyPnFmCUkjjW+2wInunT}vernv%Tp2$~J#zgELR3o1
zLd7U6Lc!MM$r>u3o-Cu(p1@?_MjI@pr{&i?Pl`|Ak=pu_cJ{!Rd|!uuGuBp9KYpTp
zyvC<2qw+ug@;lY^Z^zW*CscrfsVuJQ4ppxwn=9L`=QA+obLsg6E_G&>&DAsN*#t{#
zzXd<7>v>gH;M3C4(4lTO-<bMGsYHSAV=B7O_;);@`a0TOHFA7Rn0>>)@c}AmDZc%*
zv<7xu7x=Uk^gbhamDlex0#DhTKmPQyKS9qnUJSInAJUqCOi+IK(Z^JH?-6?HuJUgK
zy!WZ}=FFan*cdvcm;hvCgdrm{2(fXlh)?i9dX^t@=q$^sgV5MUdpjiCA3X%XAgUns
zk5dck&=ePp&-FIIn?a>CGuIsp_~l$)>WRfvQ!7h6v8>b|6Ipgj5<x=%oz>)nvm3pP
znme<~1y}b4QssH$x?ushsy7H1G=*Yio-0<<0-r?4tt6lp^So0l+;M($AWkWF#@bpB
zoI*u-S{0A8oFI65wToF^*cM2I7-q^Yr2<@4>5e6(&REFjSxCSwAy60RIb#mh-&BHm
zMy@LsR0bORJ29V1mG*o>o)<<Eek0Y`p-~|5?r4rvV}}P?Q(VzTB{sOwk2cpAO|-ae
zSz&0(j>X{ma<nRyRRy6Zi%X?fS6#!XMp{M+f+B+){qqPslT{_m-6R@NcNMQ#FL+Yv
zc_w+mgCKP!aLq6@T3q*B7g}3i_*VqLy~qdtHKB-Ti$_31IKrv;B0HiT;Ny=(WN$R>
zZ4y$(Wgu;0CekO-)=tYp&WsG?&(1{g!h8a+5P1uWkh7#1m1j4i`KnHIZR|qNmQM7k
z58l>34Bs{iqqmR8q&p^K{sZ%I@{_0Fl9w;TwtwG*+upq$ckbJTCqH=tKM{Il9e#%9
zQvDf6erNo~pv^V)dxW1_bPxUf6TgTA;1Swef^N$;^(S*Ool!N2{l}jU;c&mAQhqku
z4^Q1tv`=BXnJ5PKxl>tfD@D8jr)(^??7(vBt2cc3aMMji*Ao;4Q4LSC$L)RX`7S!2
z&-oo;_a(2brJ$#lMOC(|(p~E|v~Bl&CAbnC1H%sf7jSHtNrVIu!Hu?2>hC9l>!_xI
zlmh$SVM}#PkolG{J9zY0{78kZbW*aknkz%Cw;G3g1XSIcH<!><mzGZnR~rsk5rVvy
z0BqHowYPdU13$t`+YTG<fU-T$hR(ODRRA9t280Cv0m4d4g+`4iKG8lsFJX3oVEcqX
zlokGnKQ0bCtKoN=XvvZ9@GJc=@H{E&k3apKu=@z_P}OOcxAzFQw+TKCJ2Rb;Y0G7e
z-~Ui5?>+2O=jHd_!aMK&1N%RE8#883B=CX|746ILLUM8d5|eysb%T+e6N2nqe|~qP
z(9xX%r@k~QAKJX$cqjBy<*AR|aH_b;c|Mq38i)nu!B|AV&ChqoLPBMAsRvH04#1KE
zsy-^c`Gm|V6&|>#Jpfm91>rOTZ8ZUO{g6-ty$hNMtY$A<+8Tok>Z5R4nVV67D+nva
z1%%i+^*&fz?S@mS63?u1rb?tWZt%eQjkLOqvua#$Sw{dt8o^}{&TjC+x$Qwzc>c7-
zKIZ<b3Z1cx@LQVif+a=nm`R{6s_?_&+91s5^Ukjh#O!iEOrg4(nCFQRnZD@auYmd?
zw$k1<Qt2tWGTjM3H=_a@ll;(-7LGx*w%zqrs8>g2GdiONwUwI9tp=rK#R%67Xn`IC
zo*SGrH%6p4++%#;7Uc<-2&%hicep2b!#ROAI7w4I`ofzamdbO_bBAZKFZ`-Q;aeRB
zpPDd)w#Fi;F$}@{p$KaWLv&9xVh79K#?#)?){aj{D&dzkB@5Zp(~vzg1^M#{KaIjz
zN{hRu0+r`BplMwPIyZKpiz=^Yb36Jrbz{WVQJAn}3TECp3u_)(O*?xLHomz9cfPwD
z_q=y6_U^kE&mMS|p!)^j7wi`T?<l`+M}Fu32|d}|BflvAfS$(K3VMeQ9~uBX7i`+H
z8Q)R`{6IA&EBWG!FB(`0KuT6AKnZ4cRK&oJ$qEa$S6_YAD6)ZxLv2|*QbVbtw7m4v
zOAYi?UVZuo!qJ8~m(S=Q7+@;a9oc{RWhd;uqV3i*4}_duw;|{m%XnM}EDC|f0X;_x
zuT@^Eb5nzdy0^$0+p<!OYW&cgxS9<7$6tTPm$cC8nEbIoLgo22ZTTVY|M8dK@cj`D
z|D%m&sEmfIDFh_}OG-}A)4F{odoP8a)z&=<VhRg6g@xYnz*7u_-T?uROA9@zK^@mp
zt-J!CmLCyxs=HGio<h+3n&rPH90fi>PhD*+^xoP35#E)e<MN&T`>~ITQT2ivxnZIA
z9@U=aGI{6Se-VIhW9sy=h=~hATznuB5`zf5AS5L#0+E{+iK^;klvl)}qAD6rBeQ&|
ze7rE2s%;dN&iHIEjLA?Vf)}P2_|YnQVs4=i=JCt9fJ*JOnn0Xd6@Z1@KRcD+DsaX5
z4gR>2FxoUE8fTR`<20(cOPf7$S(_(qEkRb{g0rb+E~p8|>9n(JE1Yp^u@laqy1TSB
z0O!{caCPoDx7rz(&>FAn^vC+H09@JbYqrm=p=GXjWm#{WPB5NP>4b|q{ctX~&uQ|;
zS@oV&jILNkb-1#GcDdRca|z!0++ITayR_CDi)*~GpxO(w%3Lvla2(C|sUdReWZh24
zw-VrjUR$aQnv-2o7we3MByTjN2BEVw6+>F9&{XTNx5}?q$Ec}PUcNfSrX?dJITpT&
z5d>TS!AAgw(9TBt!zIcW&e1;b%Zfroc`AZR6X1~(#Gvh^+H=jN^3xdHs!;e;g~6*T
z1irPQRC>{f=!ir_XE-fxIHC#sm|;<f9~Fn>@u`Ge0V5Atw79u*(@?T32PLcWQLwTA
zd20$$ab7hVu4+W<4UOp7(ny=zOz^c~*tVgVv~w~R-n9VdK6wsqczq-8c;`+$xbI;+
z_#Wef2l4#J&l7h1_EB;C&i_%{8T1{c+B^E|FD3-N!^Y+ggx()YPqFo;8*%vP5d%Ge
zNQ%uuYT?3#228S*17Wv)`*x$)HgDc+5&&pun8xL*+(77QUW}V=y2+H$y4|mkZPkbi
zb#B(R6`B)8lLYB{y1rEC`fIMnPe%?L$XU<{dRh(ypWU{h?iPZ`@z5I>f{QIP5H?mB
z>X>Z^5%i?sGzFxfckt(5@da(TfF|I5t}Z8p-gk$O8sI5R%P-65m;k6z8y0pJcm`sE
zRX=ofU%^(-VWDSXW+7*xHxP1m-EIqdpYpl%YzBM;m>{TQ1EFVQAoR49wf%}f{F2*W
ze6BS00^uLfdt3IF&{MpB-~g@nr=}4^^Jcui?|ox!KTw??L;p73d;e{WpD+p$k%5F>
z7*$>*;^KmlloUn)1|vBo1o`<f$jgsJPHqUCG}>WIwl{jIY{sO!ViIlb1lq5$1mAcC
z?cnqr56sGT!+e5oHPzCpQXeWRFU(J;6(vM2qAFY8=7X($p}3gfx|D!AhhRFdR$!x@
zr5#;LK%PR_oK2N?0oB~4RBM;f5?@5vT`E;4kP&VdRy*P94lisO8iGxOgK&M1AFgb5
zXIWQVNr;|CkeyBYdu~0i&85PqLc;Hi8mdSF^<>)X)9Ssjy4n>d6NslY`(h>Ux3bY2
z%eb6T;(`eUUKpD0hE4)+NS+6V5$v5Au4t#qYoXn5PxC=rh96q80?=9*h2iaGXsOfO
z8OqeGbn<nmtRVF28<CQii}?H;B$pQ=A}<5Jsc~?M2!vO14E!?_5L%Fq?9O`BO&X4h
z@qNe~R1M#vFnHwo(b`79yDS_|**@^9jHI$lg;#Yr;TME}#!!T`(du@EBdU+U8>u?I
zM8uCtLBhCfq)#g(@Cs0{C?CZuXm1I<qLWKdadtH-E~rM`RrP4Tp#h!SnlX4sCx&dN
z(%Uf{Q*NDtRrjpGmCsy(9e>}6z3<+ON8W!754}a`z55_u`0xe%_B&;VCUpJ%SK?2)
zPkqq{KH6H<?H&0=gX`6Ij|xP<V`$pIE!#F5;JJHx7_bN!zfc{jf#Q`{l=sa6CM5wC
z^b}T&X-JrqpUSG9?H5{U*;JJgGzCgqRsdDKpRJG6G@tewf}ND5l%Bnx?6j_J>;D8|
zb)%5N`+_!1Fw?q}orT+fgWgvJk_DU%3p5M0f$J8010f{4Y3I3+-4)EfRsKA+3=?F6
zik8||%6NfJ2}K1zZQFesv7u1ogzYLL_*m#!OKZE<NO1~!w*JwSV_2}+bqg_tJ!YWB
z=MrQETmwF-K89eYP<^IAH!!TqGteW%)TKzF<PS=N_2EZq%;;Bo{os4=;{&EMwXCrj
zwLwcO=)M2J2l#N`$Jqbj0fKKI72jJJJ!UvULj7oMgAp4SYOHRUQq08$5qe=%eu2o!
zi9l(20-VNB@k}Z3$8g%($y9f9%R;cAGJ@;gn3_w4R~n4j1-_V(<%ZeWu9%zUf~B-&
z%c$^{<`7g(E?C#*j4i!h*xc!c^$kwAuH6~ebh+Y!Iwzb%kgcJOT}2x#;9b||jdcw!
zxN%50w)BPJTEcPzRoj(>*d+wmMTFlqEl#*)R48s89flnvg0XE#05;MpZyOSbE1R5g
zb(=dbB2dp|+4H%)qQjTgIuPg6MxRdTo~r$=emIjbJg?0U=XC~Pb)74g*ScUe;kc-l
zDz?N2V+p&Ve6F#j-WXRAh(Wnt=%C$KN9C?ee{^K|p`BoADGVa?^3c^(g4(J|R9Drb
zs<IZ<8kSbygq*T+#1VYi^|i=qtVd>3J#yMwklxgQl!jVlwznX!uLqgEok;6xNA}PT
zq;^%nw>TC~St0N!i9<wNAxdZTp?=XQWRGt^SVua7TN4n{9*5{bNr)YqLZz3C*pVrS
zADfNr*$qgWUW25m1xTBnkNo8&C^@+TrKi`R{G57JU($@abuDPR-qB5E=xu{B`p%J<
zaMuLP**ynmK5!;By|58?{p)T#@XmvH<gG{X;6EP3ga3X2FMjZXv9^?VT>eV6ca(aM
zYVY?W1fGIcSI|?3<)07Zr=wJR$3aiAaq}hv7j;EZ_YkFp60`)S#fujkdn>3}Fey}4
zP!iB|Tn!EuZUg7ac<H5=jH1(im9t3+WR*1&np%PdJ*hm^<5}n(YvCohG$iVWL+W>T
z&{$jnPs5_L&%(~y+do53`&Ir#A#ge%C{^{Jpr@svr<PW;%`#GKf3mGy%H~?g2{ck+
z0+xlIASmDoU>15>*Y%9*(>m{^XE}8ASJSOVjV~5@7I+qP7J37LcOvNNo+pBy>h#p$
zA<(H`pTa^<z%$cPQt5p}pgDrJRekU5=nJd%_7QL&6KpC^@a_NTBYa5c?c4VOLH7aP
ze~;@Qd}NfKpf_stFa!nr5PXa<Z(7}8_y@WnJkl4b>5)iGq4iDlMP^PIoMdUoWx8Q_
zyrZw&RDSVi7YATYNf2fi1!8__2xjK_U~aw-78UtoIu+Z39A_*fEKZ_Dy>5^@ZW``^
zJ4gFtdyg}=b<x)JI$_tOP+Z&NK_L0yY(a<$?wU518xw&Yeg4?e<%caj0oXn`1UFOB
z-PG%gb%f-m4g#^)1G`2CWB2%Q+%_^0H}-j9V}~oR>rJ?N<JwjiT+{4KfV$&`9)D~c
z5{mUbfmDBP#`>N~c&dKzG^)%CTD%EBUo0hX=Muh)xNmW_C#DkG<8ob0rj`-hKdOXk
zH(PUkP~oNeqAAe>EmXnn`GjDR4~8_;;&zmwxlXpWfzYc%b#0^Rs9aLlfTY4=WYg+q
zH#8!tss?eizKON<NNH+8YHJ7Ln_3Y{OB})D>3xGxGIkh}x*FhDnUC1+MzpM$hvDa*
zjA7@l!I+CrLBsM%NEuU)ptekeb!8!ZP!1vn=OANh4eC!GjgjlupzFegs62TD3YN8@
zXk{}>PHI8f=^dy!e-PTP9*xe8W6`x`6gsyK!;o7>V$7Z6G4;L~Sp48pT=>L=*#6g>
zaL>Q*!ToPPK$Z6h9{9%tc<`<J@xuGh{TX_M-EWLvem{((zcC0t0Z*NdH6gSed2u}S
zZlYDS&=SlvsgH%58YKiDDKcB9XQ8K-T`i^Vw61aj+_GiMjwKY(m|Y#y`2v|#puL7*
zCv~VgL~ZN(N?IV5r}FCZ@(I640@%SH`Q;_(1U^}D?Xwp5KS9rI6G-aPqOfCi9YJNr
z5cs}Pw-<&0DDVh=7Jvesg`bWKW~!GHP%Q9dUj;S`JwZz4G-Bc#!SE;kKpf`#=eh%W
z>J05zKeo-Xx~J};PRa`Ha|}J}ulHkE&{^nNz?}$svbQ!F!twU@M9?$!dxV~Vr|v0=
zeYCFXd#A9#GsDif?EoL`tDq?0$>OSRk0AU|Do@aR{{yqFc{kqw;4O?9JCu-fXSgz4
z2tH3lM*ATmifS*xA1Ub}h)eQ8RGb@}CT4jMbe@===ZQ%fgdT0|%zSUmEh6{|{V|bH
zoS5p2#ic=5Ru+iaRC;Q>SeEC6i(6c=Ww<-;80U$5#`xea!f)?5AKX6L1NTgi!1ghr
zxU$2KYRnZ^w76i~V3rvbg1bkC;2J8s4J~fiPT1W%$Okv~dSY{%6Ydz{i=BhraoaF2
z+&v}$w+y9%>>ym53Da&ub8G~zZ*gW>S99*p;bDYs1g`E1z<E@2YYDuwr1l8pGYP(n
zXlXBM@yF^iXDlGxmXx_+S+$=T3O9u@j23-pHo-^r+D$v(mh6kx<N(yhxS%Q34ITMz
zgq{aFt0FP7yBwVj4QQxpMol%rC*V=>RW!69n~+N_uS7c4UvPFlg7b<HU(<y2_FklS
z^dY&u53$WXh;8jfO3zS~Oqh)78M9D2bq303%|+w#Rj6FF1Pv=!Vam1ZG35NUD491I
z38T7@I=&ZqvqqqN@kDf=w+vIaUWQpaug8L2TQK*wt1<fe)6sa=bX1%&1|65q#kAYb
z!K7Qy#H@SI$Bg^eV$2<jF>L2_jJ{(UrrbLlOCMc{i~n*7ZhGS;-1YWdxaaMAaPM39
zVeh~8V$Z*K<LP&wB<M&zL66q-_ur`WexuU+)d9UDT>fv+bMMbyqOh4vY+80HFV)Wp
zY66<wr=_j)vmlhJlbZYOx8MGhEm=xWBQtaj0aDXC+PXm<(>a2ly|=<1*F9vvm110t
z7e{_RdaNNskW-7Zg_{j+OARW_F@E6(0?#HFHS7JM`DQKjEO-V&&w|jXK^{BSAffuX
zeoHI6YgApoinHob(E3qbeE9b{NRX@Ju`;N9e~7C12STkszL5%)V&ru-9L;W9=vj~s
zRCfbapWSD}*jED1wAym{QGX+bwsqV>PoO&<dKx#Yj>u*N1_7sGWeSJFBlP(9RA}8X
z&h`^yg&ivIL+mFA9YOegNcesDzG=kxVBfzmc6=WKf}9Z+=|M#ofbb|!BqRqSI^G9i
z(QZgg^GAAaFw$}Y2)ztv%%II%T1jP>=Z@)wnqpce;a2RAF$qo>pGbAY^#xQt^K)Ej
zZC!9yttYmPj>OHQys>Mn7j7TyhI>c*;jtOvxOZ|8c2AAM&T-K=r_vRt6F^tB5^%!;
zu&vV_w+`~7D)b@Pg0QoXAf%ex-tB_BM)+Xw*Z}P4al-Yq$v06oZfPg*8W~O89^{3`
z=Vsu>Zi0{yyt&K6T;t|pA-JB3?~*2WoKLl=hL3ZpEYGfx{dJ)Q_raMpURYi3fmId0
zSWy*#rHn;2{+L|kgfRt-Vk*5%FSI21p)t-M^|794PNnk8cSQ@|Z&Q&UMt7B>x3LK=
zH7%&AszYsEBPwefP}$Io+=^<%<`kf?xg7~*)d(*vM|f!s;_KUyGiWr5M^8iUut~JG
z6Hzp7Cd#JHN8W^4NF6x^DWj(&b27JQFGt0))6jAD#Tb3nwHR{oWf*+%MHs#AN=)6f
z0kdx0f|=X5Vmf2;wk?=_>kfkNX3V*D6DDoB0^>JcjQP86z}jbaW6k5Y;=I@H!`ZLj
zg+-5DiSfHu5Poy1^yXpd<129fa~ELK-#20ByE`3SRo=N1cf7R=xBvS#JVfQE1fdps
zzy0<zq37tIXMp#AqVxzCJA;|xSXYsL@F+EuRcgnVu`RI{fC8f|tSqV^sB4^c)>)<@
zLk$xvR;>6_d5zkT(zD7^0JZnEb%J`vnX_jZiz|>x-3ffMwgc<+Y*^sgP+vZU`t8|p
z>~p|S5VX(}qy#(*Jps?c&xQevfG0o^OoE-Q<Fjy*CAFQQzo*Jm#uWXD4)X`$2X*Q`
zauh!@4oU$Mes*c$r_lO$KN9K#pr<$<dM8qQ3JX1fPQcs$#h3VqF%Wu6Sn$6=Pf)YV
z6G87o{tXZ4T#e!o^gh^UfcG(v33~hYQ{{d5uIZ!q-Uojt^adH=rDcYryeb_f<tenZ
zfryOrL`1ARLL!|ImEelxOkX(7F7(0ze&N^DhG1%n6DB7)VLH{%!a}OOTzAYO09BW_
zP+(-)m9&AUlzHLoY7bn~9f0*MRA(doaWi4|09DxIGvctV+ZCI7sNhB=;OufQoI+K2
zc_XbZfpmR6?J(8fHMLwP%=V58!vm8du#IrLt<M<`6PR}nb-`9%_x8cgxOaR2wo>VB
z>2#(gcBAEuqQdjSZM4<9sS0<Epsl8Y+)Q|0(dvX7hWOz+ZeL6&UQAG4Kt-uKzf0PK
zu)4$*OQ_tI6#HOafg9$PdSFh4J0_R7VPaVThVoaSA=(So5#FedbVsc+%jCPGvzYKJ
z3qn_QD#rG7qqV9LRizbZY~)gt2+_utP~oK%lpwvl8o5nvh%Ku@bVWVl8#<BFJ`{Nb
zUe)9UsGG4I4RcqcV&-yWPn?Iuku#7ub}q7Kao?)*(X@6Q+Ap{ceV1=S&n4Gk*t*Ra
ze8mPXH(}_yjhMQ97v|n^FDBo3D<<Ex6H{;8jwxHWV$!CKm~qoKEZTKDmfU?8&Uo@M
zEPLQ?ocfo&xZutEF=Ov}n0VJx%y?h{7C*iir$2WZ&VS{6T=Lo_SpWB{am^do;HsCe
z!ue00j~idU5r^;-e*5KjGVd4sLg*=MBZfkCc)$Msi-Db5Z3REgeJbc_0&jsuc2upf
z%2TIS*|X0++W{Pg04C_k`VO?WYK;}ll=eyMYH_u7d#VRi8G*~5Gw@u6(nCr8$;!U=
z+H1$kbai!^=TPXn>Q1wJ&pifmQg~8#vby?`X`jIMpMa-*rX%xlVL@mWpE@xsPo0$2
zK=|3ERe##H&{4yLx}_*&YmWhsDo=HBN+j^jcRv`QX{5}rgr3@)4*&A20p1U^yNW|c
z)iuVkZQ*BN*AKw&q#_42W+(<)T?;(hIB`7m1Uf-ap>3o72t3n>L8YhRZnhCajS&Le
zF>7nU_X&abXZTt5=jfB?7^^Gb$=ZHI*a>(xRKI7`-u`z@KfMv7x{Z}h&x%BITM34Z
zY({lmHj+|<XlbeXQhbq?6^IlAy*xK8D)z)VZP7TjHV_MPoG^p%TT$wb`MCsMrju#7
zP&bv6xNl9dJ1wLa&M0xhra{5jJ~V)8D-aJ(jK?kAK7^MG&L!losB^~Ubw0ST${$xX
zhhtN37`FC?U|UBp*3*Js!t>WtjqT`l#ht^wa3i57^|pJ23!a-Bi5KR_;Muv+cxZAk
z?i}fjTZVYy657?PsQ_;v+-~W0!o6d>4HR$Z@f(M@V>8v}?wL_|U~v*Qjq<>C!)U+9
z1>pJ-fw;Kc$I;=LN^dbiwnlvoE4(nL)D`3Sp2rsYpgYwQ_0gKt$R9O?pC&+R;txP`
zHkDqU51I<1F`}grg9x?y%1Sgg)H@~$@1KM`ySNM)02D#%zGaojZD>VWeJf(Cnh;UR
zsBS|-+fd{VpNjH{3s5s_C2|P7v<Y*OHGL_{mY;>X)2~3?+V!YDa~-O=Y`$Ov>dsz|
zrt_~w<9XMh?cy8IeBlNR-*5+}-~JFL+_VeRcHV_4H{Fg=*KWqJE3d`mO*df{x2JC2
ziPiT#hKpW)4Hv%p0@lC#0@ggc2@~&JhWU>zCitdd;p6kM^r@v-^2Aase_{oeJ-i%C
z?pum^ch19wk6eiF@dH8kD}Mg%7Y217m!{QL-Bkp_-!&{vV{wldAZlGBF*a{jreX(}
z1Q(665VWM$WIL_WTC`}9fs<fn>*=J_1VL+AZR@VqWmnZ$VXv*FRfK|?Lh8?!6;Pje
z;)!GSR@PY8(sSsZ->{5@*w1`FwgE#{*MjapLXXdPJn(D`1fNn+eaUq}&qB_E&e~Lk
z)-7xVA|)-5f>Yl;wdx9dDkI=2zWtu>gG%l2&-yz6M}Pa>K<_7R3w8>-emwXbA>cXe
zvaW4I&tPF^mjc;|pl6kywYL^}8n!0r33SIp&xQkZe}<mI0X;`nWDC5H20-s4Dm_Q+
z$AOOsPp+#@Z{LUSVC=YIrqLoUAqWLU$!KjaM{Pqsa`Iyl8RLn-5GRC2IMMp@-`bkm
zV5}jS);5IT{MK-sRZl>4L~$8}lgd1?EZ+&IS5i6g*g3VnIIqSVn})>TwsG;evCkiy
zI@}!4o1B142o=HX@=7OM$~Y~{31{Rw;lgrPT-)k}ox>xrV{jxkwR+?7awl3`C)_t7
z052>`z-ud#asPM^Dn=*VJ=_UTP7lQ6Qv<P+Wo{cnn2im><_>35w|6ZS;-0adyr(zz
zjPt-<V?3~JkP|k@4)fgY6P>Yr8i6;(8FwxY$5vkFl3opM<MVY!U`3%TR#Aa2EBB<@
zbH&smS4=4JM^Bm;p-0$7`=KQ{7>$YEXh?QOeX0{`Q{7RK?1!%M0t|0!MN@4RYN{(x
zTT^YoS4qW}Us_JJSB$Kh24pq1Bd)p$q2&#TsBK4L`!Hk;nS|o8b7*ZBA$RILWKN%l
zf<>!QbNU6SJL?J*pK=ijPPrImXI+J+%QmBdpli5z1Dd(sb@dLkTyY}?Z@ddLc0Yxw
zx8I8?gx+MrZ~Uem7<1iLOxtn`7TmfAi*COUi+4SUbN})x*3#l#_4Z3R=Z)Ji;^sM+
zbk8JA*fSE7ADW2C4^P3kz2h-<_c)BceJn=p7>$uPj=-9G)__RBQ7Sq0&2xZH!_a;u
z@EBbG>F8l&XQl9LQUOisymRNx1~Lk%Dgj1aPkyHTdiCX3;NnC;@tEwb0QEa9nk}zT
zcNK+>30yi~&=d>>N?T3<)v*gNywF%zK`%c)-!xL_T*1vMJ%!Fyi>|$<T9pMfTYo1@
zE2XD(DZgWY`w!65P`dAAg9*<QflqaQwr)?|TdbY6&@(Nl{dIK5L(h)d&{ADYWNE+s
zUe9SmEzVNd{1GA;6^E$|kC@~T0-$4E%R@(hA^3jgb!2@7#9s)+UyahU(9=D&Zq=q$
zeim{kf}X<KTS3o6KkyW?wf_xzcFtbGPg$TH`#&-XL<KzM-LopsLQnQqz}vs?V{U(r
z_unHJO}CW~Flq8QL_`K5HZ}xlX)#Dii9~!tFe0h`BBQ*Knih(J!g$kP&uK-O4^~jU
zomA$5a~k}ys(_Fycf)!0ehh!Jys9e#8;8W<@@79=(d>&mCnVwNMOnCaQaB;zO(2Ei
z=0V=LnpX09LP@Q#Tf2O58NqiUA$Sh2c{M?`ZIB<<*Sio}E>wI@cw$xro?8%ur)GuV
zxp`rDWD)^3)CrGFbi<wzPIzpx4<4N0OBESJ$T4Vz1=t;fy|AN?s&I@m?w{n1$9Vqk
z@$T3)-WhjIcg2G%1M$#l!FcqXC_H^hI(DBFi)+UD;1WLL6;zyRgjq}&pG=4^tM<m?
zx*$xZy&aP6hx!-~syuJBr-h*{Ezk@{Yb5aMQ#B8;FPig`FubJ}9Zj^g6`EJK(txk3
zx&}q%l}OGjL>et_W@8)TYg-Ur-Hf=FK}hc#i|pZ(Q9OPQiYL!U@!Z8IT(le|D^Eq~
z$!8;H#hFN6bSg4epNDcPyXJK_q3s%iZvA$&U%L}Sw(UXhrn}L%buT90{UjDY`aBjr
z_%s&meH^n1xhdOj!}J?(!@}G4VCkI?V9A{iVc9(oW6dLvVeN~L;>=fX!LaSKF<glT
z_72DJJwq{!%f7q%(6?(adUg&%_qHDNZ0g3+yO!W<eB<bnd<=SyqyvWuJVEa_{BEEp
zg(s7B`0ybD<p*rrw#C?6sj-`HzS#gvz&k>v@zM(~9)sSJB})vTlu$rW61c27v^G>W
zS9VpE*n)}dt;$Ims*%Fl*byT}m}_YN^5x4-Ssk;jw~9f72ATT5E3drLSm*0+*nrJj
zw-~@#*h%TxzIw-lZeV=-9e)xG^u9g@J{!m2N6^{UU4id-m8U*)f|=H>y%lI=H>K`O
zzde>wIRWzf9|&?OJwi|M!;iGEKmCluNBG$v75IMV--rs4p%hmdGi_3XafiC9*zPL!
z*pJ+{_EsV2THpzU_IU<E&w^0(cPE0L1)hn1;0b=Rx_<`V0O(P%S?Fnet%02d-v5A}
zBR=|&*L(jny#MYe_~gKsrX_dc#Bm4<3qVXvi0Pc1lo*D@gb>8V1|W*ii;3|@LV_Rn
z(ca3=t}1cI*#z8`gxlImSDaVtfvda1OkCFFYe08xX8<m%bHTbsSKK{563;J5phAnm
zJ!1lJ_n07Tpyk}s?Sz}D%61I)z`YZqam!$T+%+<Ypz^_;EO!UXT}69(B^8%c-jg#!
z@cjI6JU!DNFE0$m>r2D&^87%&!gz9uJD!;0g%{>U;4gEcanERfJUTs|R+kpJo9c3e
zJN8WUp;Gjt#r476qh0XG9F|$;kH=37#Iu)0<KMRw;mw;%@WkaA*fKW|>qq<H{ALd<
z=X+jG1$!Y?-x|K})vXa2m+y%Y`Tl5%cSj>Xn1*;?w59~1HPs(&83Aa`3PMw6Fq(6s
z&{toG-u4DmSCykqlcm?v;%eTEnmVKvmLR#b5;?7%NN(;xcug~6T6&PwGYV-#C!mOy
zws`6S<j-6{=&eB6>N5zvbC9#@EM%-a8`)HP#pkX^-4$EWzTsB1T)P9U*WH4_H|@oU
z+aAV{oeyH<?T=vU-lwqqsaJ8*UtY)D-H%|>O}Apst-G-3&bu*d$E{enYY!Ihz7MM&
zc@#?@x)Tc@xekN2&BV}KN1*rC4s_ktfzCTR(Xz7zEjwD!xTOgV*EgW?>ITfcZ4SO7
z_73-t!uUh!{X*!`-ZFk5;FKHhH`VJAc#7X?c?CTS8rA3N_0y01qEh|5{@2%zK~G&s
z^pYYwzLCK;NN8J)8@7AOK&AKBzy8(0MV9rgx85@8=Cxnfy5o*J%>IERKoqjN0;5re
z{gYkG&I)9LpF#~AwsGS>g`oGn0iU4vKf!0<N9fs%%j&;pVJCGb*ctV90;Q*=pd%1T
zec7_IwSu1FAOUy`c#NO=!AB8iLI4!p20~9uZ3}oxcxtV)1)c?(o<aB3eyjW}@GJ<`
z;3Dt|c*jG}WO43?o`7dm-hY6efTuY<6pmbZ0~8*^T3bhO{X>OA-F>)^*Zn}y`@#%K
zn>2AO!o&RuIDf>(_#-)qz>A^E3wJ|wq&wl~fusa4q$T^nX%Q9A$|4t>TIzyxYCLd$
ztvl9t1meaKakybn7_MmY#FdTixUS6$>j*5>sogrn1JBG3z-!B*aW7%Csl^GKXiK-!
zV(uE_i>K$O;E5T@xV_H@4^0Th1GKQW40gdS!@aS&#{<{5InkO1;Qn#0xQB}F{xME?
zc9s`jUmT3r7X{$e`M!8!k`o>p>x8GL`{S`Gfw*h9C!U&}ggb@?;Cv~^CW3RY8=u7;
zH>jSEO7elJUiiyNp?L1R2)uYrEDk(Whj;c?<E3rI*tsH{YR?aAYpC#O!<FIX;;wM4
zrd>a&H4+nxyfC&j2%YKP1f3u1V!aIXI?_YXnH7fi>`=7jgrY8;5G+kXZ)ZJf>#C^o
zXmhFbs%tb!cpb9KDiKfcWfOWCor4hF)Q;GWKE!tqM?&vt<c^(*obj`fI({b7r!Pd#
zg4HNmeGbabybPshtwZtISEGPYcHsszQQ_57>D5W~ZQO;CyB^1|+aJS_TOLN=cG}xL
zPZNBvW7(rGVBX#LVg6lr6LfcB;azuN*}eA}mAB&IM|k`m%zJPH8rDxh=Z!<ru&EBM
zH#ZY_&8WMv7PVWcTCcA{`Bmkpx}pLzcT6YrzB&$ijyk==ztK`Ley6_sjr(M2l@(dQ
zQ|G>o8#fRt&IVReR^Rgr^5elDG@cF5KJyoNx_cPtsh^%~udJ-B=|IqFY3uoPOk;0l
zSv877<#bFy(>_~Y_}F8Q9V;&YzWL^xri{Jbz^*GM)WXc?wD#2wN0Z9a2o2j|*~Y+a
z3%&1^Mdn!iXXvRF_uC&18O3Imn}wYXfljbdNP!)TelW_`3V1*Ma0EXcascm$KqtWU
zFMlTR6oMW(_dk@Lg`YwyQqUU+I14sG@OT*7V+MrBLGPH|?T4PRxz^SScnsTsA>cWd
z2M9Z*igJv`5cC|Kl^u0?>W(Dn*)Y&kPCb>|uOKL8b9wB858lDdnG<MXBWP=b5g+e^
zm}oZxxr{V_WTg5cKE?^r5l+bF_uEM}Z#AKJVWU5;=?=rqW0G*|_++ea^TzsCPi*M)
z#qDEauxC;{b`0^yMgs08TGRVSI^l1NBk|~jNL)*h>=^2W`=&?Y!5Oi*XKWZAo)CvS
zy19?Ax@V}fX{5NGpjywRMqE6<C>Bpl^P+`y#Y^-3@V8~bczK>Do}cNAzs~o@)03U>
z;%r~MI5!y2Ob^DLgI#gkAP?;9@xmpgPPm|q%Cg5B4^1Tq2m4_&p?4FN=KXWM@X|%G
z`1?(n`1rAUyz^iUUcI#(cb%DtYsUNIEUHX(p*dByyUEWaK3!fPh`Dtk7+)TQLAkzY
zOV{M$zGz7fL}x}gy0fFuEUQb!SEW>3#W5JvRgZ?IIx{S-y1ELLRW*cOJqj7I`9(;o
zYeYsjVb{@v_@1GN=^lpYu3^X+ITgvnryz0k48)C{gOuqjP`KheRIXi*%5yiM^qdV;
zdN&Yyo6vmi?F8SgXt?GUG+loOM(lc$@f7-YK8oSDJ%urMJcEUgzK+vh{5MuU_6%m-
zaXaSh+KH79-h<PhegbDc`z%&H_5@Zu{s2}yb1OPF&O`OpU8vd6fGR?+`i3f$T~~^V
z>&j7bRS60&DMabT#k9(k4D^nep=W}g1AJ<|J^Y&z4;(etR(4k46Y#XIIC}IjVfM2D
znxOP6zlFd3LMz2D$<f0{%(|4Hx~iydO=Z;Bp>^#O1a+>q_0lK`flW%!*6m5*X<g+N
zg3Pf73Leuw9hchEwk)(w6sqfMsX9F=Isr~<PfGz!@RQvY<OYK7e}|p}e6rOFbzU~W
zBP0YpB^_0w0oBn7dbSbcSYrm^V=b+X<Dn;oXQB7gVg5b(L8ql%=f~?%eErZ<2zI)+
zvDCbWt}DpebqhBw?fO8aH_+Z12oidNoF*a{^aMbKt<$s66YSIl<%0u`mRqa#%=&(*
zywC8_0m4y@5&{~Jeax}~-p5pYia)^D4?Gi8gj9bYe)ItW`7W>XZ_JrH2E`?DC@PNP
zk6svZa{Q5&>W+jsCnPd5Qk_v;;DhG+C^(&7?TfRjy>L}~Fs|(i#?7PRv2jo+u4?wg
z<qfXb*yV?NCMDp(>4~^&R1oeQ?o0LNhZpDg;rUs9*gM=0J3770u_xvx;o+$<*xBWR
zom6vs396?i`r)}5VR&X{tm%(;U8@VW_PXPxWeIqgz}q|82`><G@17omH<tO~si{tQ
zW||ZJy)qncEDXT|gzHmN0<d?47p|$Ky6beMH75AVop42!6Yd@viiajfP&N8vdzTY#
z9KwBbs1DX8<Bcs@cx7uYp1n2?drwcp2CB}BTRqHp;PaZ4p~V+7vYjxu)C&t6qcEn(
zAA@uK(595~DgLy%{#1RT=*o^neX=sH_@X-54V@Kn7}8aXj<yCWy-H(kE2|xo29-Co
zAU>}U@#U4s=;%glODCe+dJ#s88{XQ7%n_53GIWyZsuDkXHr3url&`)B)oZUt=^5)$
zbmlcEJAV@`?Jf+t`5_Fw<q;~oN6@qV0Zh98WlVbTRgAv-Ia=IjF!qkWV8O$$<K*Z5
ziRF(zgV}fAh57f~j@6Iei<6&t6sJD*B-T9fB&P1#j>S)G$GF|6Bj@}Y<X={TwDU5M
zbx{^FFU~~%<$1`xC<j?*XCU{iER4BsB+c#D26(>^YU;M4Fs-$}5ljrzVoO*kjt~?&
z{+lfDQ63}oEXYViqx3BB4*l?>spAv8)QDko=m|!Glh*CFV5X&Qe6YY0)U;1P6Eqb%
zZp&EPtFkJmdOur6KoTIo=P{*|`ccPKo=e47EUTrC+cFBfZW=KR*ev+|90P$j5PE7c
zweXSZQ)7g>o_x(?vc4K+p%&Hee>!4DT4*$d8F``oQi+7$4+PW?S~3n1U`Kepp9wX^
zQK~m3ef@>@RWYzpLyZ?|$dFBTz>N14^!i77eD=-v_~a{ETRx9TcV*H`sVmF(1fHNL
zi!0zs4RRmX4-ka=HOhj}Gowftw!4aBIRJVVbp6oN92WxJ#{}NT2dMN2JcXdAK9&Qa
z=U5l;wA3+&8r--41AP4HK3?k`UgO`GF=H4?3B9U{cr?~0qPij+_0<t*tc^rNO*oor
z!_nCihhcpgaJsN50PDL#a9M)~whoTK=H4)D><PnV^<KER#tj>~Xmtk%;h`CccyLMt
zK^216mM7xPHPLu+aR}}k>x+jcMdIn133z2;27wigd-@2jDNcB2br@cp;m>udxP@uB
zcYHY2q7UvGAB;z5hv4t0CF8jz{&;S&H(p;8j8~TX;GZW4;-hoo@ZlLL_~+7C>?OqR
z9qx=fdYrM9D)9~~%I6nl;-+>_(>G9ES9Xu0DqI|f$7lIq&je@eo#KX9&q%~8XQkkg
z6_L1mUKn;vjlkty9t>ZcS?7*bRbH4^tbBpWDC2=~d77Zy-;pXmJ%}nV0L`?wT{%(c
z%#A`zRtOrY^y(>(oAN@@Tc3-uLp%84mYFfQwGHhkt7&F5qKH;EHa7=(ZS9DwZz1@4
z5Y^I$xb|U4?i`8Ko-s%pHH{W`4svI$K*`cGQLyYRWG_7%g{NMInu|B1VciZ?U$Gs%
z+aEOQu7}IsosVGT?q@ND;G4GhMa;hMC9Hh<ADDaJGgwOKP1|)RPJ8w#tbF8t%)9GO
z%)0$H47=_cjJfG*EO_(=48MITaxN-I-sJ_zAn;PpPebC_2}ok3tW847sfkEl6^{{D
z4F$i}gcv=kTq{+?@BH!zTECj^DGsnnkqrz1Y+wjL3aiKlhCOb#2bL9}Ec_G#*T4|a
zj^AfP$81;-o+$peG6JSigag(MEc?STxsdk#5!z>g_Z{J(_?Gbaj(|Bxm|3+Y*g49m
z0Y%``s0?mX0e{c;t`QcRyyqMKK-iHL%GRu`%}RT(_=!IvKe3$Rus%C}{wBLM|7PmH
z$3Gci_cQOu>#3tNLuyj$O@LDtXT?|A$LIW<&!nZ2l4{-zB{wyRNxx7-hJbg_ES0cS
z8J0i%lq#<udaBQ}ee|r_+b6K~hdL$OFxw2H>i!fKZVqK<w+Dtj_Tl~yd5sTwEh@c_
z->2<;8*^rkM}B4$DhguIT9bs@k`UCD1fZin61}bQ=x&ZfcS|C=TT|e4X^S7$wfSIO
zi@UM0w~vj(w!vY9T`+Fw3B@gAXlsXu<F*mORB8cKd7*e^aROdg5{%~-1rTIzczjw2
zUYHw4<rhn37lP-e`r(a5?s#VnmkWdNI<4#@6KPS0`{3q58hYl;a3VxK@zi`DJiFKj
zudWKgJ7>q?{d1!5;zBn(JHrbv&I-Z{jK9#{K0GcMx6|I<+UHL2d1F_fFZNJn-a{q$
zz(nW%=Ly2obA#~Y94f>qez<Qk)gPg`ZHPN<dkD^}b7W;%Q|XFDWuBOp=Y|PcZYGu1
z*j!CE6ok$sAG8yAU76tqcwKqXXv+>qZHgZ%;@zqAJW-qOkDjVDj2+a7milU1TulJp
zf{L0JGeV=B&`Zk6Ms{5-(wjSw(A)!cS&3~PjKpSIUPj80@kk##0|hgeqIkhc$eVu>
zvKE|*(o-)%)1_Nbb>T+jp1BTnSM5a4O%I@L(>-XU`fKHK$juL8>RnG`-rnahd(YEY
z_`qK<<L>)0`<{C-Z_gggrQ(}@>n#{`)n({ecLBPuJslJ7Itx|nha&g#GGt$oi>ynt
zk#=D^63<CA&`Uf$9to>s5Vs@}gD>s{ztT#u;^_I>AHNHFzdA-|{NF%tAlz*H{{p@L
zlNboU<3k`DxPGD-2tB*~KcT1n>ib7K%+UNB$6zMd^|!PdYpNO0WG@LCK~D<MmN$tC
z`l0uoIuY|dYxc1}L667&fZicOPa|RYx$*CyP+vnq&qRN}J@wJE{qk%V6{GYR270u&
zUvi(OnpDD4Wp&oXMS`BfLheM+)0`UC$_jQiEc7h!j)$J&BPu|twG)9&A+`61;`^WR
zk!<fL>fZ8!vA735-iJkV`CZD4M@d!`>WX4eTNH}gLVq-r2B4uV5Ot-&s4ovkV`VIy
zE~<CMl`SsVNXxl*dJOKLm4F?iXmvY1u(^*aZ&)}ju0J-k5IBQ_v3FD?9-k74zpaYH
zCl}}7pQps(rNyCml8SEcC|BG$$O(^*_rO~xh2!m$L-5jEA3QfJ0JrrxVW(Pm8K&<Y
zmk*8eAV9-;elT7j9RIe4*FA}#TjYs52h$#paKh_LsVL`1;i2&%*wNuih3A4@RErNu
zElvu??%^)jGuj0YOmfFla{}@5@;JP(G{MyG-7?G#S5qBcTCb5B9=MqAb8U?$R+V~V
zZoa#j<70-#7w7t7bfz!*sPfw4J<yXLjP}$Z^cBRShtTWFi$X(c0Lo%qQIq6_ibO9o
z=SO2iTLlJpG*j(WqN1V>RW(htx(z1DXl79XQcFva)!c@prcR`G3`1OFFXHRF5L@4g
zl-^NDA2}I06R7s4EJW7SCCHt75~@zQ7_ArGfSPl!L(%D1qVAHdXjk3dRXb33`Aul#
zvitg77`bUT!S*nw-Sz+$-uomb?x3=}b1%kk+kv@v-Hi!1ZXxi_$KZ`;W9UsMqv84q
z$iA2mzAPWf7pEclLaMy;k`cQ$7IA09BKG7M#4L|Q%z|+AUDyqN<>gErO3%WN3PN>w
zztZ+PEbp)XSKu2M|LK^8;6Uh|2!6+J|8Fr6dMAp3z_XA$e%k`iv954H&w|fPJftPV
zLQjnkf}Ux8{lQU3XOa;Rbl=MQ3V2#-l!Tzi7&v6?c<5>VUZuX51sC)LI|Dp^_J{eI
zngp->J6V-yfJaa|>~epDi2%q@jy{d?wUZPHhO)S_x3aUcwE~>YuB>fMIjMOxP6Ry%
z@IF75Sm1cr*%0(pcW2>tqA+mlS9Sx#0X@R+6IomhMcao@xxQ@Cd=%%VAulZoMHyiz
z&kaFUK`^Qc15uIh$7K*|O2P@h7&u+h;EY?wM&N}tMR@J>QbI2dS2Q{q@ZCEz4R=gP
zz~-I+T-WT59lZgxlwP>K*M-W>1#h1gg||+R#H-5#aNh(%hQPaXm=m6x7mR<O8jZJ3
zjl^GOy5WJb+)s$z&f_<=)1q>D{|IM1H!T3KE(pbgW4Jxq8IMiz!h>qSSQ3N>CU{`)
zXjeQvD-gSe5|$lK*xu%X>ua5GYmXaVUX*}m=Y-=9TIn5KPS`}a?II+fqpEy)c^dW*
ziuVwd+XlN}J@0un?eTdHnxn%9i*mg%zd*CQ`D0#LD5h&#Nm|=s>AvVr_QuehFmz`G
zV^nz(h8D+RXlVl4v%^pw?}euHKr=UPWeU|^O%BEl=|n?yCCW=H9hq0E)ls<$MU~}9
zD#%A#Z9Nj}T9Dk*i=>u8NFeYM8oQ9zJp!qNMkAGqFKxsWWYX#u&0T}qH5Z_1?Nz8h
zYdz}Dxfbo0Z9^07t{~WS$riL<Nz1!_Ck9<{BSu`a1CzGy!ql7Z#+2=MGIn9ab=xp?
z$8DIneG4Y+SdWRjF2LZM7a{9{IwW6^kEBa85qVB5qRwKR5rgoPBM`AB0+B1YToi_=
zd7*?}j~S66cZ=WtfZp%F{%*RcX!x0h-ig3>Jp4`sxBm@#wyy4Y)%Ty`M4&r9P6WH-
zV<6}hcAr3J!Dpdof%hl1*B^r3_p-HwoR-G!_5;w`TMIqY&>^r<l?iY*F@Oc1j(w;5
z41nH2hDpUw@R=0;KmTe*XxI@MQhBnq0-ivt1`69UYg&k9cNqeoNk&TLdMv*Aqko=l
z#IOw-f}3DxyQWy+S?JmHQO5vhQbP?4B?SEx2M9hTdbKeSb_(^)bYxNaBTUi(+i#C$
z9R2t{!Y8!HE0!!lWk~@FvXYRK5{~?ILa!(qwWU$0E~Zl}i6ro%8B}`Lb^74;anX2W
zZaVH7AAy^P`QwHz0&Zj|o?2RrduOEMrXfMt+T)4sN+U%Db?;a=ys;t}@0}Tjedk5u
zlS`ASuKclw;CpnIJKj1w8UH>r7OyM|z@t+LIfChdF>ZK*mh;60w7bhf@y`|Ec>knW
z{Ch<d_RxafN%i;Gq!8RgwYYbzC!U!Tk7wp4(5^<}rcMnzbH=lCQ*qlMPbxh^Z!s0%
z!f=8z0FO)#!cKxzanBe(+&zkL?)AiaDm;zS*f!D|>#1Z<t#HDrm4R4Z5{d;yL71-T
zEwjBbG20hYi$XA&AI|XX01VFy!^q+oOs>zs=!!&)sz{>Zi$Z;pFWRy~jp{3jbw^uC
z5+)DpL}w%IZFxB=%av=7q1N2GYNQtCA+D$xiB<JTsB1-fTOVR;S_!@mq_hsA`WuRr
zuHi`Tq3Rnk6@}B5pnB<<s9$p-Y8cfgUyR1H)}fvdY~Zqk3b6gc4d`GDy_`z!{A;-6
z`js0n^~O6eW!oJXx^4@GTzvz^Zo2`aZn_x5ZeE4l^P7=$Za%`O=;AL;Md;blh&YqL
zJ0%LCtHThsG7KS$Ll81I2w^jW(S1&*fgTx0WhdC#_}w&Ys1aiz^!`u4_n+cFfo~x6
z{%_!C<NpPC1H%Gu;I;+dK<GIjM7Z^XPr%d6X<BNsaK$n35qc&FK&uRm9Tw13?gY@Y
zz*FeD$3Sbr$8!!Jm9=&B!P969HCp@uJgPh1OQS)QOjJu*T$6#B=Y7Ye&B<r;^_hCV
z<A$G^;I{Jr33gWHnQ{#6*OZh06LOj#<8vG^p#%a1-lv2dLH8-wZ3ujVoq#5|X_SS@
zoA>_v$3oi{fNIz{aDV_+U%wAO#_}c0P*+ujvZ6fXq{kyGH3H>@38<lPsV<Ep_+qH|
zxGz5vP8+D)c2A7Qt0z_A;aTZsXxVkGuGrAwg*(Q^Vhh#XrY<k+8Xbf^6NB*ZbV6&5
zEUgdTSmlK`Pj$n47lz=qQv&eZGGDyBCItUHD*<nu5`!mZd*Jzn-gs=f6P}+RfPbDG
zhp*S=;fu@DO=HCyOMLPAl0ZB#$_Y18UEWEk-bG8gYp5G`^tcdyfp}&?H141(6!`8N
zN2|?pxAnQ<FLP*}7lzViC*iNllg)6pTl(CwrOlDZ;1(qwr5)ZqH5&KMO~HoYAttT-
znGI1`Ou)^}55UwcZ%oSY#Dq*GQVlc{iI2_;z_`)~j4LPf>QXSiItdeM(=eQBuQe?Q
z&8bv+RD8uzE~v{2$Jm|*4C`t}V+~<fQDLB`etUvmR!I?Ja`TW}S&O8)W~8_FAhMzX
z(N)ceuWdt0b1yOoy);5EwQn4<M$bga%oV6zx)#;T&O+h*Q&7%j%b8bV$fcW2w4HSo
z+SXoyJ_4`j+$#yb%Q5t_YcTHm8!>6~tr&6jjTm<IMk+)q#>-Zq@SLGkaXE-vn~t!x
z(Fi#^0>Nt;r-UP9bvS~Thah-KFoNa<B5;O3LZ$_x>+F8$N$45y{cfPA<*!_EwCMf;
zy#EG1#X!g%zu$t+9=FgN7$*YY!1zCb-tpi&(Y_OfEpt5dG^^VGfZl*Gs*EtvWZ_!3
zs?P$@g3-d!l;cv6JErm!0@QKWR+ww*zC7nA!b=So3UyJj*^zB`7_|;Np!Y2_QITd$
zJIHhXl-PjRv7MqV^aMK7MWr8j7J7o4wYA6Zw~(_-Ga-<I+a^9E;0D0X0#7PWz!T6c
z;7$ZR3%(CO+>cKV@OtX!x9?*tUAzqSbX8US4i**UATuov`PuO(A^0jw5>Ztgk1`6E
zvOGd>$FM-$JR}eg&Pc=V330e-a3F3Q6^UC%Mqz7j5Y{!iap{lyro`ZZX<>MNSsebk
zHVw}%^20k+d2e6niMK9t!#~dR!#~am!;7nf@c3LG0xt;9E(pc}DvJY`C1T%233&Is
z7`%O6INmtT5C1$P7|+airW*9eU*`JZj$T5q&jnA-2*E#2&cvS4-h`MFo|+$K8YOP-
zamDs7Dz-}X6Z9}0mY<&KNAMD^GySn^I6+A@x24q`Tiboikhp7Td2gQ(iLE2UaAC6#
z&Zzar8TC<^Lv<(U&0)+f4#tcEe@rd#!{nj>OfC(^g4PtwYfZz9rW8zVU^J#<Y*jM4
zv#IhDd{7nRfwCAklqLG1w>BSR`r6RiP=lIE2lT3{t5HVFTVO11AyO);klECR<c2mx
zRW>r35L4BHqz2kts=bU}LT}I*qz;{k+zE40I%_4W7M+g#S*uX8;8e7ndI^SId;^AF
zv;m!GUXJdwu0YS)OEKhp!tkQ2F!IXlsratPi1k}A?uMNhyXi($op~~9&tHtRHC>2W
zo{#X85)rvJ7CvkI;CFHW{FVnGU`Y`C76ib59%H&c{3rP$WHO<*R?z#=0lj|kseaEu
zkJ9hg-&MEgi2qaYohU5m2F5_O_x}(0DJ<arTO1EN#lYkLL+J^6f=+*&2ugn_Jx3Y4
zZp&CzXv;~>337I6!@|*?XM#|Zl9QrSmlSnRaVWjt4D>AQY-k9afcN=VN@exE>7ys`
z{h)EU>bk<~33#@jo>ZQej`RM2o=tHt$Z1_!WIp>!O3z_!ZCGWe_~dg(Xc8vHK<HV8
zr~0~q3hzIK1)x$*eY~GmSY2N}{sfDcEO%r)s;ojqMLF{GvXGe(kGveJy~-@K)a9bN
zCKF9nX>fXUZZe@qOE^9nyT(Kidj7a`Tomq^l!y&&-nh2e3%3l5#O`r1xOJE>o?e)M
zzn+qcmrshsXB)Ec+2%-mz9k%=Z%V{_m&M|lrCxY=hC3~$6W&-IkFT%E#^+Zj;iD@P
z@Q?F@@XQ(~ymFQco;}40FQ4Is*VhK&=|$dzoEvrzbH~%uL-F1j1$bk5CN?!W;bvOa
zZM43(5BI^NGt+SQ=vZuOcE%PSyL%Y#u__Xe&+syqc-L@WY;N(O((}Nkb}wvb_r#6E
zLUC2QFV3j+#EK#hT3bKN%k#qG(hw{xVU&epURf~aRnqd-MPgBN98MmZi}R<}Vdc;Q
z%<IU+w8m77sfa^QPAD1^ebAKbkBV3i6h^zFsUQjC3B8V{8q`)*@I$Xe1)*0$_?4>l
zwxk3pWfjP8?>0$Am0vH0DlewG74dbQrk*cd_IB`iqz{{9)L!ny1t=!?3TCcE$-I+L
zvFJ23pL78_&$tvFr(T5KwU=S&c~@ZA1y>S=>oAxx^s?*Fcj>j5xM?SbU%dhKXP=Ii
zOV*(3f~g3dTR=6J1ivLA2s$YU-pjn;%O5TO1%dFL?GHaHJf8_Z@EPNcfYG$@C$~`N
z9dZDVLEbqO-?8}p{~3A%%l$X#{l6$a8z%za@i7o+ivI>Yd;Cvzd;QQG2(^JwIv#Se
zqCfnoecJv5U^*_9XP`F#Zg%~I!Q<+`r<P!~2-}es13MwBL$d{*1)!OZ@=F2iThkxU
zfKLexxRlDXz>~83KY^aMzxYP#QA+O*=o#Qq)qTSFl+Zh23;>?BwAQ*>wKq`R*=-Fu
zQwVwr+1yV){Txe|twd>g6-p~8I%;cCQB{u2tW@OYrl6&%41;@XF}SM&olW_0dT3@e
zo>-8I7giPEftgf#L)~%T4DF-U9pr}{eG%9-A{n<0ji=fQ$Bsc%atjji*K-na^xisr
zb9)-T*q(r|wq@hpOX6vD<MHn^WAM-<XFNK^1@D{{iT#(w;{!tP@0SGO&2?^gd$T9r
z-s+3j*16-~>qGJU$-cO2j1mom;NIchczi-QUR{ul-9!Cw>tI@6Lhr%pk+^?a4E9c;
z-5p8gN4tA7t?X;dL-FV|XWTzgt+%nbwU2hZ)dx5A1Y=8g5U%eG!s$gy9q)x@1#VbT
z?uSJszL=Bejm71mSX>c?1r>x}br|MU24PlJFc!AN;G*d@IB#k#Ru3<t+DpX9k_a<z
zMq{EEYU4ammEeW4cpsFfgrcja5Mzh-puUE`GW?|}=PyoKO&yB4%qlHMT4@C`>syfC
z(uMd&DzDlO#ME^ly0#s$O+B=?qme<Cmp*(7vc}Fv{*=Xp-YOIldigU}AaD9|6wO|R
z%7v$3<VDwF<b~_eclJeSUULT8Pd|@pZ#_m_c>_kS--sz&Zbr*F=b`q@Q_yqe*=WCF
z6*5=#B5+m)0_H^{a0vl7-yPnwyx=p#2Y%E1;5*S5{$mNh5q|LQ^F-U4R_eT;jzQ1F
z0O(2esbjIgrKKPyU}@cM>m^tSj5;<Dc(%NjCxV~$3vw2GS_(9Rl|3f7X{mi$T3G40
z;G}zNzpke;7Kqxf;}(t<a08dxuls3RVb9ZkjS$gw9rbBS0`SN3Cv^ZOU<9(mM+g(a
zhQLstx`Vu~GB7Kt!H>L-z^C%6)3bn7qlT91mZJJT?N^E_Ep;7TPuo8VYHAriq_k0L
zrR`5jz_7N~!p^QAQ+oYjIw<QHL$Fg=Yb)Rx`#Jz_7Iw0*4wa|<Uz^lfN};9PfuCu=
z&LQ+Z`TX-^fK#gf<HK57DZCRw&&EKNHxPdNK6sxhum3z+-K8s5qO7VK#pM(pgkDu`
z4GN0#k(Hf_qQZ2vHkDxLpc)JtR0XG3PR_-PYx42JnnFCjC=Iuc^}(L0p?G|LG8JAR
zZXX(h+lM6L#*RpAY6-%Yc3NFR@s$gb@WZ{;`1;OF?B5cNm(KLYlS^Fi(h48^`?P5M
z>!dh5GRX<gEp)*@&I-i7s}u0S#u$8Xa}eIW#Roq=lZk^*<lwDM;dp$t3-(TS#XTcE
zao-4U+|lWbhet<XQysxZ;N3Icht@mT*wved`Os!Z(AtJk71Ac39K-v?<F6~CO~#g6
z`n+&MlN+`Xb~pA$Vq;f0F0Rw46JIPZbj2cqZ&r>AmR1B~Syea|Q{l}n_QRAscT6et
z#Ny^CoHeEZr;p0RNyBrouqzFdYhy61Fc@vhw-@J%`UFqZBzmJPj^IlQL_=W;Mh$95
zQ(d)zUTKw}*MOp`I^<VYBe}2yY1Q>eZ|+2L+h8QN4>hohB={oh2|{is^^Qf#(8<Ui
zKNrOU-L$31p1im}79oH7Qq(Ly6}@L(itf|SN83rKqju>k)UP}pt=w)u>tYPL_)3hp
z>S{EvJrnh3oQ&#Im!a&WImll%8u1IN;m02gk0}J+3>SD!@qpI^FL;mhhWAKs`0)qE
zug@1g-Ck&0*-D*v*rD+H!`AOPfX5AO_bWXu^|C+``0TcQsf+-p;}(Q^X{r78T7rup
zXZH&@+Sbb+(})WTBi&17?KuLIJx>4=;Iv=mZRi<PPA#$8R(Xx2u=muuu4l{Xe%dDx
z>bhE&V*7$1I6{jkzzJA_lg8X0{aO16B$g5Ilxj*7iYq^%MssKch=ry=W|f|nUlSnT
z5Nv{;8ao6#3&o#R|Hgd+%XdFW<tb&919~<wfmL`mql;Zvou0y`skbe;TH3lhtsmQF
z2y}{p&{Jq#%I-6QPq6!vq0d6|*M3UiDLy0g6q*o9VPR)&t!<^Xz&k$v8~7B0-iP~C
zzsF$t#mkmbY*eGPx{BXDeh-!IjUuG5I1ibbi73oZMte&Mh7YcR({n4cap%NHY#u~)
zG&}%1NBiTk1xeU5IRe|dXb%OC_E2nX3nEAYXg7VadukvarcL_S#$@c<9*2)^jK=Hd
z`{A#r1>o5QKKSdBaQuA*p*KNeY+dlrbNunnRpI#Jwp9H3Oc9PgSBT$VuLJ(tjE`<j
z$8&3W&-uP4P5hqW8rSQNCnkmCjzMmCXi5;CoEL-Js6s9-cfxHWL-5k_T>N!uBL1~H
z5w9%`Hw`2&)4Dx5I{^2L^2Tkn=$l*oaZ`6Ru5Sy&)qMW5YiMr^ov^gX9rN=&v5c^r
zU!uv(gD|f=fY5Wp)FMx;?1;yC<BPGnFCB}!k};zp8WXD{F}ge)z1hmM=YiH#UsS}n
zp)kS)#c|%K$cV&<?k4nfw4jO#PxEG!*ESjGmDM&NCBKjsw-RZM?MP}LjO6Z-Cf!t2
zLl+|IyAj<o81dbskUVq}awg0}$?TN|bm`;gA%l@MaXt#BEk^UowP;;=78;fkXp5Gi
zV!;wrEL@55MXOP^cnw<5JP&PWorT8JPe#M3tC2Tv8Zu^%MgHP3NMG2BfN}9~9pMb0
ziLP)T<qFqfZgA^!hg%mx*x?P|c3RvvLT^b6bsphG&^Vw+^~M+oJ;6aQ3oE?_sysnM
z>w<*J3Md1Q*`-~#DoroJM&}7Qg3>^k2{?M$b9G!`v)dL-x}Kg*>lSu;4FsA!SJ&5h
z7Lqzgp<!^^uj6*VRf2+=U}V*oz<1>6&jxTdMU@@qrfcXNJ4W~r*T1An`sLT(xh^OY
za=eZjLj=7;grC%&>C(buDkG>0fKrA(@m@btCCbh!?0DYeq4%fYxeBGZ(g=_*6?~S@
z`AnY>$coPh<Inr;s^-+N^>tEp*1jt3yS6{&w%{vuXMy(_k0}H>jdiwRp=T}aK!tZa
z^z6QY&^r<M1iSa&f7eV2srhi{E|`b1>IziU5`2}FD5<DGO}&PPmLn%O4QXl7D9BGh
zU2P_uZXW56buF$Y!N8`zVBARX-9_Ns%Kh8A)FC+(cMXrm?om<LJ3buuO(F0W_~9?-
zgyQw9Bk}g82)uKB7~Z`q8vi~wnzoiUb%GmSoD+bD#yR1IMQ%pvy?uE&e%PH4JXeX&
z?oGjOFE`@g<5l?T?qa-iO&Xr1O;y9ieWP9Q_9;}K3(|1M5Fgw<#uvLr`(Qg2+%=6(
zd?woO@gaC(svq859*yT``{Vh!{&-<wAf8_sf=8wV;5L@Myu=CD)%#&XTL`ueiNU(A
zAe>s^f>Wyev9#D9E2|?glNNUl*O%5tU_Pzwf|?Ml>P*Dyo>Z(Jl#WI1Ntjw2g^5*>
zm{1*q(Pfe7%?U(%x*uv2JW&?yhN37plq3eAtG*b+`#Q}y+)}}=rjg5fRMjb?aw!rE
zijmpWj=1I?#8KHLwhuuJ)m~I%FJjw<I-oaXBC^KKLD7t*gxwsZj+u!xhS^8xb*?!F
zbqiLbV%8#5&Y6$Oc?(cFhbnN!0_1aF(<x`7dgTh#tyzJ}WeboqYXXud4nfM4E+kB;
zLhzVGc#rUc*GK|yunU~Ko#EW>0v9S{&qmtaMjv?9d!S`rlSww9mxZ2L4uGCsf{*~B
zmw+WW=w+7{VuFue+AnbECFtm-^HomQ5;O%OZEL>;tAL^Mx{d{vg`}>h^XxHuExiO8
z?bCf#UZL{ZC-~^PdTHGPQ}?k7P=M6+bRC_meOl_c6di?^X+0H`q{NJ}J3`e)=$Rx1
zyq>P9^K=b8x9+KHNCC<c3uNYesX4~M?`eZ+Kfj~>{9b+i2t%npV{7{rq12TUg4&EJ
z7JQ}?vaWd?^uAK?d`IS&@9{04>1*2S<DvH{0Vl{QY+ap&o}I$cK+gh?A=o*9r|myL
zr-ZGFfzY$SvkLEc;0=VH-TqJT+rR%qy!XL-_&}2beY6jgrcOdZNdYR<XSPaFWui(|
zPF1C*ZZ9!O2}VQUbVH91l~4dS4vN9GU7==z?OVr&VfVxc>=_r1$EK&?FLN_YpSiy*
zjKuRR0`Su5KKT1pq1bn00=~N|ANw{W;FAs6cynzu_7ZeYPIJc_%Y*Rv6em1P2tF~#
z1<$PT#DUG}IQ&2%KHM3D{kNv!z^%DBcvmG3-cpNK)+Auh7%$vE&JV9H%_i95u(8!0
zJBRt<Myin;d)!Timksq)f@8h#;@lwoeR%|4o*#sVsVHAq5P(;fM&R-3nq$Ko>noga
zO^qwIbOoBU_Sf|W;j9`rT-+Xuvs)5yMr$%=7x-aLQ2?PAg#}e%SXdK*<*kWW)D(v$
zZ7GCaDrVHjV_bOz#*{~4R9Q6o@<I&sdUM0joaT?>NGGa2PgLc`Vpwkr>iO$ZQB#jH
zg0Gw*yQ^QFq}+VuG_@hVxtq`%g5-{&NNDXN_z1n$!HDY^j>KLnvJq2F3vLRR$-^fj
zb;J~8jhT*;Df7{>{4`WfUx4C?Gf**oCMsskLh-a&$elEUc6bizR;)qMym=^HG#9xu
zCm?(J2qaHvL*(!h1P@I`;IJt8^7HiQqr&TPgG-wW+?uKM>fGT~O`BWk1&<0hG)%7t
zjUMUtXXqK$>3@u)R5kWx?W|s!3&ZBevrDCU7i_F5v{19#dI?n87NE4xt_v7?X<g9J
z%bp{E=(w(>>kC*aXF+54*|L3oeI|*hy@&SMX(0tTEd?YkbxhaMTpsp)=$OuvMLo#t
zYm|gqSC9Vut5G|Lexeosh4=rF>hafK4D@u4uB`+GhX})i%GAR3-S_O_HC>L?)fr{?
z&G-0)0Q;W6JE(3Yj30l{bNpx&o~`pcWQLc~-V$_52CA^ovooLBFq4uKcwZ52N}Xl0
z$MBiHWqifw`-0CVg=c|hR2)O=g5ELsX`k$?>}|io6YTW9t<w8~Fto7yAJCJ^`}hD3
zD2c#F`;F2YsP;|-z~f^e{65_O5ux`!p+^Yr{{X{A4n<0OGK$NJ4d`m?>QP!&j`H#f
z)Jpx8<s&sc8Y!uvaJpq|9BvwsgllL8FRJ&(#=(JjU~USYU6zH1r$^&{s=LRgMiY8b
zczHzxt!g;lxjY2_Ugw8TZi>Y(_f}xvwaGYeZ8qLGJsOYC^2Xm*Md62Q3-Qhwk$7gd
z7ti&>(~EuZj|-v*$Y}gyT_B!3%^mMvlY#x$<>38GvvKbPFWfrF8TU^JHn82;?u)CN
zJk6xvo4Vbwsm&R?hWg^(kzROZeiYtV?C9Y9&y|sQY?3pTpF19&=s_TQ<MF9cW~Q_)
zt!}uTc6iGmKU~}GhjVL{a3B#EcBJ9drbNsu4#ND>P(m*TvrB?8lQF+02D2(7F}o%X
z3tQ4Kt1$(mi^DNEFBpSzgQ@g_(UqkXRH5j|2tZ|wGpbX3P?jEw!JTzzX=&oGO)bhQ
zYEW5Ki;B`xRPmQ5)l3H7h_nvb+u=wd_!66Y5z|O}+epPn_(iu4L4x}14V{R@zHvwy
zHi7Ge*QhBdn>-hdbC#lf;!G5eos5$46Hqc?A_~S&MDF-W$e%Kcs&653rcOuJlnF=~
zKLlwLJCQi93gN>t;6Er5K0N{O<d2+tw<p{>+~Lk270*UbcvgGDtIP|Y#h!32bVluj
zY9fzdA?buOLGKvW`k|+n0APWqb;Zt|I}IQ#tn`mr+sdLW2<auL3<RJcByefJ;Ag`^
zPWx5HLd*h9ic2q@tFpS5U}dkVm!PB2{gi8O-MV$=Tmell3rQ_?Z3{nLd-v|$=Gpb-
zv`SEl?J$r3$ZMDhcnF|Fw3<Ist*I_gDXKI|!ZA0lW3CUqm)_nU0{3!0i>&S;+^}(z
zfu3L{XvoGMA-KNziWZpA`#~L;xs>9Q(v!+NW^ZMq2|^7m)0o<WyuTo*Mhiho@g<*A
zA=sHYJCx{@&=Zh<P>OthCK}%>=qU!OI)%0kG%ZABjTyH7&W7FolIJ+$OVik66`!fg
zQ$5}Rsyu#<2M9bt@6TzY{-@Qo+XMI6_+X!6Ki++x%I`zP`{)_eg@~vyWMrkAE-O`4
z)u^msR92(9TA9tuP*RqItlR`R-8(A>4=t?1woxhAL?v+ZxL`cIC?5Yjw-|p}6i2%n
zg2$&t;K^Cxcy4hREo>}4z9t@@ZjQt!o5S$^&P@F4qA>jR%wVd(Ks+@s5U(tY#9L=X
z;LSCGc$yG=ePs;ZJ1-5NUY&yvu8hX(XZz#Pg&ufubrhan7=YbloUnx|PLl)OGcM4y
z&~EAp#HIDFxV*^;>lin*I1^@Zczt0So|)u_$Ehg)N}K)8DFooMU_3qD2lotf#_l1W
zcxqal>A<Y)%n#3u#$6M_v6(jg(q^i?>R_xc4Z@PLP|Pk2!orG3EUSyhtm05iFAT=4
z@<>cCjl|TlD9ox!!sN<$45Pa1&hSSkqboDe0IxeU04>R$s84V~U5Yo#QUcLQi#wPf
zzUuYL`GHqg)}W@m6g5?4$S%xBLQyfYx@k8%X>VzB6B;`a*U(Aub(#9TNCGgbc@SdS
zh9ItsU>rOase{KLYuI=LyOJ?eP&{fPibjk@!SLb8A3hAZBSs)^>^Kxno`IYR(~vn~
z3X(>RK>WxK#E-5)^r&ov42>f2sPH;H2s<yhwt2#(*%fXLuJEiQ^vW4Up76-$@q8CF
zOspgFNWA|AdIJ@nK%>`fx7}typ~`=~wBOg)7uQ{Po!Pcv6NI$XHMCDJ?bA#91R1>q
zBMVUNQ&_m^rLq<*7Kl2grLL!AHuPNAUw^&1hE;O5ypHR9-A~&pqnX&0gj9A+_f#39
z1bOTS-GlJaSXtGP34Cgt&}Stzr6FQ!xX`se`|MM=xw+zsD=s6z)Clt<HgDmz-8{^f
zSM`BE5t>qZKOUsT)i5<$+@F3V*t!1MXJ43>Ua36OTCBzq*;qpE8>vkN?=3hvpeJAn
zdR%_<HI<$t1Y$kgLEhggJ=t3mgxv8ePr%dY5Hqon>H?)02|f$FuLwLxSm^zs@T{dh
zrttcKci>~ipVLMuj)&jzF%Wuct$lyr$NjM%`)GCBJKNyv=Y@!fFy!UtptQJ*-@jT^
z(`^ZSRT?}{SB`?xEI2)~s2oqPYQcuSXlxxBja}0s@#3jjc=wW0{QK-&yme*)-Z`@n
zFE5G1<1>Bm`Y9p!^qM3bxGn<wuL;7TTe9%kh9tbcHW06#5{75yhvV@Xet2<-C*C_J
z8h>3Lj_2ox;HlZZcz#I$UR>demsZh|&hW?1J}2Bc!o$p&`{<MyJTy59Hw|*au5kgl
zZEOUt;urI3s<pc(#Nk%jZcQ=y?^QW?c4`0~AMZl77epH!fHziz;F%f0ctlAECPd)b
zSxI<ESzUPTM`uQvH1*2<d})I}PAl=iNu_>RSmaOW1sdort&PK+(s0ZwjiAzt$Ly+j
zOeu}V#FA(X&n56^XS>q<(ZOg>4?uTz2!<9$qAk@2O^I%(h;~9@G;LvFGR6!aWTu9!
zsHozHUX4ac?dozAQg(->r6Q-j7m0+OMr0(^wIh+R69D6y2)_CbgmZsrZL3K!pVBiN
z$z4N`F=#kSMo&U!_b?O;9>o}e+`hrc9l{tk3`Jwdqjd62TH|rZ9ybXILkA<euMx4s
z%Mm^}89p5WRCR7}Z*zlds|TDKXxHkT;9Tnr_Zlj`QbvJ0JaRqYN$|ByYIZ0+GEKPq
zP4qiPx<6L+vY^sS>$10kivVIlXknw{@4WL4rc9Y)_9-+6hF*e=EwA}8CQqJhGB2yF
z&Rf5JJxWSS41_v6JMr$j?;2o9xs{ifn``OVMHgLUfGZekz73tDV`XJ!=D5l$YqRFt
zuwc{|NoDT8|9%50J-gD`%huY=%j3t7H{~w4-~tpD6~T$eX3_e-rRQMzt=n$Icq;Gf
zH*CQ8i2(3G55E%#D7DTWGTEE$e7>%(F1)5Q_UyR_8*b37an3mTlvB~s*=5#8jvR#}
zs^25b)?K*{#f8OYe;@zWufOp(Q)j68bybgOhN~SU^tk+v_xgqaJErca+$`|E=ud;M
zklH&2J*qr4W_-r$32+vA18waw==CeTfzT7=1UnlRcD6q6SmT9Jcpn?k+4$tZAF%s>
z0iU4v!G{M-BGM0(LZ0Web+p0F-38v>UPw+#GSI7%+hUZJRiM1G9M$z|+2v>Y$f9c8
zIi(P{Pt3ut$!S!5>4aVm{zl;a>zo{Xd|3rPIIocEEeg-f@y1_Q`r@NYV{zc>IDD}w
z5npYI$C2A|u<r^guG7Ns!s2+s%O5W;^~SffynjDA5)Vvt!F}W0@$~FaJTZ&*d9trb
z2yk<sAJ$b<#Wi^oP*G<7i{0aV@gM>A<b2Ja>w*o9F4*1~fIXvPaBGhjo|+Mg=jR0D
zfzi%*m}>7;g7mLT!?9<$FYfB|$0LN^gX2T6XQV&w80=}1o@(Nv4IO@js2?t<@xZzD
z0cP&oxrKh1Q_S_6C@igu!;F$pOe&zwt%}2Fs=LtzVN`jc=t@&!QD0M!H?$}QJ-HER
z&k99bW)SM*Tu?*kRYbd>GAR(Fds@)j*^ZhTTHJCaAy9{9%I(@}q!koU?Uf^0!_X>P
zkW|x!<oa%;s)KS{pHY4hHBE@D)nq~)NNDXqa(g#2dj=!BXDG6%5_39-AiHxgQro+b
z*2!bNL#X)1B7furqzxK@)V^U<exni7Q;(?LQiOG<!LKbG?u}jsbjPBe!Oz~Q(g`kQ
zE^sR%`0_mAo$Za*39X>+G2o*KCF2Am$v#6t6TkHXO0R+7+p%MZfra29_4W6^|J{7y
zH65c~f{7*t(icK=U#RSZ4?g%Obti=-D=N@vav!COSGh+XdBoVug$oy=ySv+fQkCPf
zwo+klzx}p3Z`ZC}rovqJz2%l$j5<@xp|-EN<{EQc>w=q}^X|LvHc(Xko<L}oq~NP^
zf~ZvCwr$%`T3U*K{p(-Gy3U$C8>7dJF?M$=kE!gWNt4Wb)%kiRJ)7>M=eq2&OU?CV
zoi#a;%Im%U?QefG=V<%MC!ahvN%=z$J!I;yG;>@}PmeiQ&+#2Wq{fKvcx~0seMgXe
zO%?YQp=LwC8wfqC^c0S=3c^gvP|zc|R8ME)SW9nzBZz7JrK(ey$<0|_pVz>!&=c@%
zORa6ZaD>LoDjbPGk0lNGFJUqy4+wQnQ5g8CVM9Ts*V)qzCw|869-asY2t;ynDhdmV
zO#+aL%1V^+J6KUm_)+aSJ-DC>w@t{!j?t-jctJ6qJGqqhvjlIRlaJ?DX~<bN{&9K+
zo|x&2N2j{tHCotDFHgt*D^l^%`V?B-IDEG)1D~u<!h07aQ{lzqzVV)T^VA@GdPNfc
zwk8yhPxms@IX*Wp4$sbw!`?9g*gi-J3<7X%gBLC@cg79PUIbwjo|+$vdnbDlRBpIu
zjIT)+y1Bsv_l=3g!;_-${DKHPH$TX<-tHaAFVl1vys$6?4~>t+y(6P=|JV@h8tj2>
zw9EI64a5C};9aA`v2$1?whxcR70uqbsELZNE*NX-BCxzV3@hp)v9d89i)v$NZNo9C
zI2_}PBQS#C8<G=@-mD<>WCf!yFWf|XMld?_qR^(qq6uzjN^m9koKYF)jlPCb45!6)
zjLcBI9>J#;r0Oc<m6jnsw-71i^@uBPMr>spVyme5>N^R(UXy|<s=5)8)%A$3rBbVJ
zLUMCEGCO;b(>)kj9fV^GA=yk=5{OBRl=dEE5QrI6g^6uck%VNHlwNxc!rKZF*pv*f
zx-hs_d%?Nd70xwMcn;V(l@oa71Rj;1SFtyI^8FByO`ATUi_klapZVXfMBlMyi~-O)
z(Mzxp81#iz$ku8(oK{@F-~yYJlfa|0f~AfNE;^<VB%~HqPU`}jj%!<AAf0#PjW?QY
z0Z<Uo%UV{=g|UHZOLJuC+7`CDmbPWHt+IRk@yE@1bLPy!Nhh6TpdvtNsrza+HkH#C
zS^EW_2@@t5nCZTn_(;#8<9bHT-=QzOo<qkKx{jc$>quqle%deCY2IEb7`=}--gx8K
zGwJ;a@`8iPTKEbqy0+@fOg3k!w*J)i24;kv!me9et2q9c>iMMFPE=24tSk3R*{Siw
zDm(}9bPmt|FTgVj?}X6%BTQO&#);r}{PywSQ_Jl~{b45k=Dx1pUP8|W&Tj7T^zuPu
zWHd4}vrI!qrD@3}@M;9T3OGHyxElA&EW*~|ad=>U0bX5OjStqf(!RFfnH6z(jaK(x
zYYXtoiX^JM1pMQaBz$;bF5W&b3x7W=39p?V!DSS;6Y<WuY52>6Fe*7`{PRqIymwwO
zo?qyN$EJGW`FT-zd`c+pALEbP`do4E#1Pz0HFi_4FSd30U~8u*?wb^emsX_WiFsjo
ze0CUaAL@p!tu6+9w@~Hn9_dXb8ID(%M-X=Y1fet4qbL5lA_h-RPa^PQ@gO00@0dXB
z>~_W7BLcB|bTIZ#h{5g&DcCtG0oQf~;i}FMoLcFFwd#o67KhVY<FJ;B=!~9hEN@B0
ztg2W{quLu+5`{6exT8yAFr0RFFqK{p)m>YvpGh^<l<JR$cvrNgsME5WfnG~#CMJv+
zY^H{+($tWIUNu!-85Lh?O%39+^N~<ejo9iIL{zsTyt0{qYePzFFH&e}<LYQ_srI5P
z8xU91jFhGh1G{WST5~5YaSM+(@|*@lF=FbQk<ikK6dsp4jB9B_S})<*Qi0%xY<N{g
z!>v3JuI1ivt@MCvl`C8;2)Z&?xEH$-cAoGp@<m`_5P}K$@a%95nlMOH0ugt=7;F0{
z#`SLqQ1&HwS+#fPop+iqsQtnVQoZ~KxpwVZvu>@Pod`$@NMCBJu=KKxA^I}uCG{w{
z`1tsk{dPt(DKIG`&38L+;vZRIs}40ku5HNBy=Ain4%z25Yu1>C3!U@80}q%oI;Q6k
z*aS5lv-OiRXU;U|>6pq1@Omym$SOB|*>y}WohMl8v(m9EuDHV7Psas*y>|;qDOzm{
zda~tO*EtG3x894CjZ~s(4ep1YLfvDIhu-nvv&U>0wKf2H$A^WU4O`X`ya(T#;&|Zc
z{SOQazW)@*AV|=i$m*Wh@_vW|Tz<?Sua6Ww-qSnCK+nn96)vvs{I}wVn3y>J+bBSV
z@?G+KD77c>IX$te5sxja$M&&F*fA~+Ppr(t+m|<D|Fz9{Y;iQ6UYdZHR;A<hlXLLa
zS;ctc<P0jm96T{U8V}5%jhy0*dnS3{u{oi5dVVDCo9Ky0XSm`03&QcyB~f^NRj8RY
z?Zb2O@bs)8+%v)%caIeGXg^2M){YLs-DASAt&@Q3bTV~tuddF*YpZijim0t^uDG$o
z1Gn^g;%TbHH&02$vkOA-^vn=Ep?rMH!|~=xNd#*=c8`d_LlX(t=`p5`Q1fZr$m`xc
zCJHwXi7-RbG{f7~-Jv+6!ULCd#^Cb7DY&3J31@ew;`FX`ENMu<w6aLds!PU<x>U?;
z%)pe|RE#W&!r+_`^kn#>J;fKT89``B_AyGYJ=Kd#57Z?3p(-;L<A!#lr>o7hBFWm;
zQ0bLcl%cY|9;t;Th$$?imF++jm0dU?7ggUz<u=HGFR`H$iS-?bClHgl%%pnD>FndO
z2g=JARauL$vMPkqI)_))81N+$b}1dbgdd^T+>E3SLbIt1K{XlhEQui80^w2W2hTDe
zc$9g<qu2xf#l8qA^ha<(AVTv)5LFO`xWafO7bauu^s&a;nhweQ@(vVt1J?yL3qP$(
z#aMNuZ3{vvB*8*29aBA?z@e7~gkA!KfFqdbi*8#|Z9SpBV7i~~qjFlxmZ~1l-b>{K
zQCUW<Kl|*n2B<14KwW<M<))k(Ty*Yz_uXge1a&W6Q-G8$S9vKeJ%_%`y2i3)%Z%mL
zGpoFPulkZ|T_6_#r79J=hRR9F+4n7I>AmaP+9z1(oM)bS#(-PORAAOIK|^KjIl8ZY
zOte&Y7z;eL%38>2IdI#C1)<$<p*Ik6HZ0@@LhsL^Q6moE*_PUY3h(%^fIHE?f#4e$
z{|$a0^#ku?{s?@uPvF~+L4*3>?BaGTTp0fTfk;eDMnOR#$_c*miV6chr-zo*;?d;|
zxO-+kZkd#br%x)vn-^8%(~X_@+qs2!c4Z3gpAm*f=fvWfMaj5tN-!Rr9)Ud*BXRe*
zNbDXTVN%KO9!)hhDuUsQmzGB2z-7s_!AbbwoOHZ*W;#ARF9okI<FO?^czmV@9;V&h
zIoJ*BX|<FJYI~0-c8v%kG^26P*l=Di0(-~B;{J(=cw}k{9-W?uho?mmtTA|dZ6RJ?
zk&c%a#TwQ3!s1Bm86AS#25G9u06s$iAsCAt-5w@6!S-H%T;J-0i>sZD(!0Jl9M|?l
z;D%v|xMpY)E*q4H3;WV=R&NGYwx(cVV+!UpreRJ?7G}5PU~)|gMioa9d=aK4w<FaD
zT?AlzW}pF{qB+S64JiRAPM{TTsKDfLqfk{*imJ*oGh<pAp;tzwS3poj<Q5>VsSVL>
z8jV5tHMS$3mQ~3`6Y4rlJzr97C(@e;y!OG!?HG*Irgp>#c%>BxEiFeF|A>*a&oOm`
zUUQqN<4dB-i*Kwa1Z#QyQiN2d!KW|+o_T>(b3yPcRs<SV7g-pNnEWUt7R4i_C<$3*
zStzJ1Kv88eX3m@SC)f=P?N|KV-vX<bKxE-(Rhqu20-s=^`Y1IV2m;flO*8PYu#*k7
zW$nwtM8MEX;1aL|2EFtTUxUtdj)k7?qq;$zCl#eHpw<O9fkNF+q@H9kbzOb&bX;|G
zs$&%J1Xw+nz^7-Gdb5^QAe3#^Ir>uSnzF?Tonwy)1{O>Ll?`3z^wUo_0MxVD@_H7P
zef{;<&9<hpeER99&AQ&V%InK6C8+CI$e4+W)aP#i^lWH*An+{owBME;sPqQHPGK$W
ze+qLhpW}Go9UlWBXJg>86G6|y?nGh1_puU(e)tg%>?8a>AoK{nUMf9TDm;aNr|v**
zZXO5<3PDOr8cK>vP_1q(m6dRMa8W%TTvUrabBb`^{5%4$8UMJX7H?iyhBwbI!qY1f
zaSs9Y<f0_JdU64tT9|}~X2#;-St)pIZWdvfiMz(c(TXPECfde(sNDW`QYt>YC>{Si
zJqmwYm55iCM46m=FD-S$M;9d#Qh|74W*GL24Z=-b&e+oKgsmOURD15Yr7r-t4i3T2
zK_S@KOk3L*j7O%W;^}$3&!`~WF~XM?IvIaolSc)ajF*>1;+gqj#>VdKRe}N^+%|;j
zQwow0-PYlW9eupsuqd2cs?_oBxNSlz_RP$~j<G3N-xG;TyP|Q$h%8(-x(MeCFUDy@
z%doV&5c4|<Frz6O6RT4&zA_P0sr)7=+X~fQZ=UARAOO>;`jS1+p5}*^j1ZK@2cn@U
z12ZR#GU=w&a$8CGRnXoRQ&E)E)FCP-kM^`4v2C4*Z|g#Aa|fcS_M&KcqpMnxP{aMz
z9Z0I{LRw=t(wYdgdLFB+CiKc2!D~fURUwKHjpMbFn_3u6NN%h{dTT9`8cPsSo`!(D
zC<M@!2j_<)tSAzZrLl-FOGHLlI&#ah3AqB4Ru`kPw!+wORr=1JJ;%UBuM+`J;XoYM
z`z@h;*?KN(UuDY#6)7*h^pAGTm@(#<V(QeX_{Tr~VW6Uymg>`|V}g+2Be+Su2~gT!
z!C&l4F1f_)SBtCNFPO?os{^vml}*%sskKv1IR$NPZ3Z@a>AK^_jWfsh?Ac@PYgL{A
zuJdjEq*R;Ur{H}4`RAj!Sgoo~SiO3+sf*KbYj0)stQytz1Y@fT^||O=!C9Y^Y`VV8
zf|&p(C<}T5r`|(RQIS!91qB5-@4WNO{dF&MO|DzuohVKOyn*E%(EB4yUp^_gZw0+S
z;`seqS9!h9<LmJhTK`WFRM_Jdg8xnJ**G44g5Ib5KgMSafltsI)HBGyPT@?8E43%^
zd3bmsJUkNFnK>w@Sg7H5$my}At$1{46ZXt1!V@bh@bAmo@#<M6c<GF6{PUs`ym(3)
zo?H@#mru&VKh7@0i>tG7=ePjuof2WPvplgNg{m(GcMOZiwszXrF~Rr?t?%DYO~P{v
zLh;Dt5Ii*_91l-%!Yj-Du<zVVytFumsxOlYE*g8s2jPLqp@gOvwi13f_xKW2zPP&9
z8Jk+YaUa#^?ZZQGW49-6XyNfeez<#d7@nLF&j`m0OCs^g%0xUdD-m~&2*s^K2)-dc
zyq*uQ9f4a03zX5OPVb^hCtTLxjyon&InFP_t+e#(I)ia(X9U)b%EskmOK|q^5}ZQl
zoj$e^OZzG?qcsnc>N7C8E*&!(GcmC;0e#s)=*|vC8{yZQ;)Nz!T=m~;%?L(SaxiML
z;xMYa1tWSp&{$J}igNzSRLkO45HK}VRwalmDWfWDp+f6Ma!VKD>e~siW`e4j7Pgh}
zYeQs3GeOs6>h==Uu)%X9sBFSZiV;>)f@oeZrlJaQl~qWrsYP-vuU%V-jD|`iRpueC
zB#l-&5%I-INGeN3N?97xE3=VXL(tWfqOw|`t1{14Lls)9d}QTSm^FR&AEfJF91lIE
zm$IPJOG;_rwp|~%U+V)E-SID#QyKdbIPA+_M@vCPpwP<#N-rradoMvs$8<k|LQCzl
z%1h<$z4u-NM17G26x~nf=sLQl?j>-ky!Pq10I#JKm|i+hp|bWI**@J<FD<oSA!Q_B
z*&hmBL)X$VtqYDiu4h#U4!XC>>YCc8rJ!i0qwELR@nIF;@xU9X3I{^bhCT22>kJIl
z4=Q<r1)hc8@$uh)r*rJS<Kg$805}kO>H_l#<Fk(r82B9^1O+`;X9x6Dujfp~=j7sQ
z*1f!a5gi?i-0VD55PVK|PcOq`OB<m6dT*ZJi4U(Hgjdd_tvxvvZ(ml9w+Oz!t<A%$
zr{v&oXB6Xar<Y;(#4v2>b;AR*V)2({X%6_!DWok;#A8%}k4z7wg$*F+{7g&g^Yat%
z?=zC90Hg8jyhuDbJ(~N{@bv6>ytXO>Z=RBc`zH9|_F*2hw^6ulSTHs;yW;M#(RgBB
zI(8CZw+{C=>ECZ?cE@8gGO%l?AMPCHhR0_I;LX!A@Y<R}+&?)EI|g}EIXY9_x#QkR
zQ3PZR*Tb=W2*K7waQ5=r-2u33LK1Eo7Ky97LuhdmuxV-;Hq30qrBfPl-lP_sGqoEl
zMl@h{XDOyO=V4-92F6#(=0>B3fD`nZX>%JA+)d_{PVQ?+_C;-S5Gqr`(N>;|DWe7(
z=v5MWwFHvMvqxK+TV8=ETHI8EEQc01tE~qqRCCI)7hc|k@X8iMRJ9<qtR5jH)r4Rj
zp+_4_u*C>^g$0N%DMb?HdI~|9MChe3lFCYvP+Wkdk~}07rX!&s1!=_@$giettu01T
zO$nnE<piDP@u;FwtY%b{RnUqPd{womF0aA7S@ZuO-3h_>M`vY0LSJJ08n~`6seM^k
z9sd%129~qy$HU7m_0qNUQdxUH3qhSHkl3YODyMUGp33PQ?UVXbpFNec&!^*Bx94ly
zUR!V)2qo<o6t%8nTGvwNshqY2ES1xKt*flyrfq9&t$MRd1GNFibiSY~Sm=B~R$XRn
z7}eJw+Wv+B9T)@4*yGCda(o<L{zTW(yuimp??m7m2s?${e|-7>7w`#sAAh(XpMHD*
zpD_f!!M%NOC-B^i+9UMb2t8K;(8a*d*DnAG2}vl*FM`wU6SDEpf>J!QvJ(Hguno_w
z%E6P%lkvvc`387zUtEee&dkMg%hPCQlW_mkc<h-FhmD=y*h%~N?1~ILwKxU$O-{mt
zQ<L%1vQ+$)miDQ+LA0+fCUeS*i{tUZxmkF1c?@pr@xYA)nx=Dnb!iIKUmStyj~8iE
zUtXEOAiz#8q>WC+t0xukzO=TB6Y(tJx`Vd$re3wk#!^v+5`GbQV5&F%vN#IQEzQ8a
zlVh;8%Z=e`CIxzAW+EP%nt{C&(s0L^1Uxu97uR?DV?$2>?w*v4O|;PG*0|xCVM(}U
zVKX+)ZN+6X+Hk@2E}S`~8_P#FV`gVLCbtw~VnYtbR;OV^Nu24nGPEQb-MJwcR1j{K
zol<_8RC@6~s7wk%b!IFk4eLg4dlP|DX6h2k7)o$hLWv!ho{gBye594tpoq}R>*zy#
zZ7V{{8xT>`g0Si)gjI39s*W(MLy{~k0hd@-j@ZINB+}BQ5}xUVXkt+@Vsdg2o1KM>
zl0p>Ll%s%_wWy{TCDlbJBkU@;PlK^5Rj)xyTvu0b`Y>s@j#{@gWT%QzQC5k0v*-N@
zc#fB2hnhD2Xx{iU<n$6S2JW-)`cJ^K`;P~jUiLcrqT83&wO{46)HMd)LoiVp3qFOt
zj;<{bX(^}_@^e%lJ)I}8=p3yJHd@zGFRS)+p2EUN*VR({wA6mxTjd2N3p#r}?YD5#
z{dG-yEvqC2JiAZZ+OPApZGotLHte~&o{ni-YEbvLWpzwTZR=VBmXxIp9sh5kGCJ2b
zTv%&sV<7xa6axWgA@@0#UwlRg_Q&U+5{v^votg)RV^oI&ct+`QTQ>JI9Vhfg3?2$s
zRb>!xu7aJ53!L3O;NszF;OFe-0snwtq$H)m>ApGnxNlAtA=iM{&Z@%GgxJfc=Hk`U
zvkAK#{Pnaf0w<ZkOTa_Z60v&>)l^?3w)aJ2V}~E^nG%MF=fqM~CE|f8NqBZqvay@b
zFOH<ja$<QGyt;z+czKwallGnwfw;Zb4^PjG$16*c%(RV9%<-jS^TCTNV(`H9Alx>}
z6T8OwQH}ZGz8SQ?vqE@}Xl(0orvmiEog;a_2~mV=HeOksgntpxkIyHJCsWyt3&-y9
z;do?r9QIC%G|5D7>J8`h60mzpGS;_wV8@tf+%qi=JE=_9b%$Zy&_rB4p$Jz_uEAw9
zJ8<5#Zk#%?3rk0|U}kp}CbyPgN^21&HRWM+Whw?2Mx!@B%)qWEKO7y|A*Q2pOL`z`
z61-6!>w~J)2#g=pis8MTCXKvcQ${tVZY*j^DlD!*N;)lQS`IQwt5HCUn?v{|H+Leg
zu??|Jw7B(6h^=ct0s)mu=%o`<X#`zLaS2ixN%{Fm$j(7>UOv@hIf{AhQbuv58coVj
zPE}V?P2g2BDhNF)y2^4wkD(DGdJa98k|HS0c4=uDa&z-Ae(ZP$(~faUD*}gr^#5`9
zAAnIDS-Uqb69oe0oO2GNnbAly$~mJPKmvqB5F+QCa|VNR!~q*@48{gyf=LcwY~!5Q
zUVGO$2XNed{^w~?-nL%s-TU7E_kH)iXZN_ftE;MehN+*P(-m5I_j4-+A2xg%(`f3L
z&|%}JDR{9AU75<O@+nj4YD-Ayjp^zf>r?QfsruFRd0#3o;i2*pFidCNyg%mE-!mIM
z>rx;hG?<@xSSOFUWm%>(pQ?x7i-I5XGq1|8&e8Rt<h2MH?(?2h86FcbOjDp^8utkl
zUYD*wN7%3&&l6NkQ~NB}6KqWD=}iA4;9UuRdRL(9=?Z+Geex+j{}cb>Q-P_#k5JP)
z)t#pFMDhLYZ&Z7-|IWLpDXqZ3UVULtwI}M1YLDR4oDh_7w{QP}@bvJ3-T5UpoSqer
zm)F?v{4x`sUz&_J*IDuU>J+@TQjQnK<MAmHzIKM;SVxqotT5bO<cnJhe6h3KM_?0y
zGgG1kgdBF_C@>PmHa!9_Er`Jr6Fu?VY=6AAGz1ssdPoTBi!<YWacWEe9;oh*hZ^lA
z1a{D#%RSQ8AJ5Ja#W>1URM{ZxYjMHdwSBR#)d3HVaKzDZ0XR4^Ktf+%JUmQ7>ah`c
zer^KJ&IrWI%i?f&j2Dh~h2r#-Se%*^je8pg;6Vw)cU5`d#{9ujPcR;A3&u6Y{cu~2
zTw_EC9_Wb0z3qv(bwm=b9U<ZE_(EJirB;L9oaO>duF2_!UV#~Hg=UN@G+}6Z462d?
zMB#a2NHSjr?xBs@%Zrq7H{1!ip^ixNAAqzFSJW5SFluN6vNJQa_4~~N7IqY*iQ6pc
zNQyThJTe|(aRx+~(-2|HM5ur+C@TlSxp@f8&KC8Tjz9@PgGJ$m2<Sri&JBTFxBxFk
zjtx?$nX^A^HqE(^A{S3gNtG~GV3(TK-Ggl7HAP|CEE%v$xXk-82?!ID4Ty@3L1=gw
z`~w2u>FtTa!UAdTih5T99z(%@skxq};KD|DB~yS<;89A9#)jI{n4hNdE8x(04T3_|
z!L0%Sjn`2CBkWW@mRDdRNVsKQZkbL%uwGu9b?NKjIaQup1yO>5TXjrm@H{^gufdKC
zc2F>l=a|O4JSLR5ui(V%DzK^YJWp3?1P||v`I*i-2^N;)mMg@tbA=rwJV$sDVycdR
z=L&jPQg}R8u1{Cz^q}jh_&yc?Ow=7+0gnFVXP@J%&%e<8TXvktu>#*GABs8@=RQ^5
zM;~#i`M=>as>b*JhWe6nIQQ;{0rGhr1bqGZY)|&>t`cXbnilHv`Q9Mdy|ytQudL6+
zGfNG4b5kZhx~>#|zq$|~UsH^ab`;{B_33zSUZRAMu{b^^PLx|Db{6|%XQ@B-)CXz6
zdun<t?x_;>C#vb5N+(fx&H^Q0obK|LaMce_OmW52GrjQ4EPqjp&Z7AI@XXY3oRskN
zQ32Yqb{E`V-&?LBAZm8So(5O!ZT1kw=Y+jOop5jcKs+!^6lF(*gsbs5JR%0?rY7OV
z`AK+wK_p&U8iOO_y>YB72&bpU;l(9ZE$n`Jb^;#lh`}vIF4#LPK%f_d+o~OLZRr5)
zmhkqTkx{r?pm#${Jhn6^;g(6I*g0ny){UveyrBh{-jI*Z$_$JuvWTaky~vD~tOV4h
zgop>AR=hxfmmlqp+z1!#6_JGz&d3fK3|oi`YI7}^Hhv6Bi%XCyA*xxR!?8kE0Z_Vy
zixCK@OlgQql2A1v34sE%0HX;3qT2Y}&P#ygD;{V}Mwp~WixNu^rDqU@W)cvZ`8-Y_
zM^6>-as4neJ4?(GPN#9Di4<6sOL~gY3}d1RNeRh_h>S!~XfS;Iec|Tm4mS@s&D~|c
zq^ww)HsL^5(4#BpaWHbXI-;@RDKM!tngR)p=M^aQ#&o7BWu{C)i*t23f0l!c^?CIB
ztXIKCZ@fP9@f?l!q5z}Pcx?qV1zhzRn2+a}m;1aY*2#PdhAhW@RX3gG*-1jk5gMvI
zr_bGc@4XsKSO@d64nm8N;r;8u$@9#|bA$%BtcT@Uj&&0<+%k>(yf*Wxv5o94S3qQ0
z-YfGl&!4#h-XEzvrm6HRxdL5JSI{HCKKt-veE!iV`0^9^44;Yu{ajS)mtWyK0p9mt
z%V%Y;v7BSRPe16+&n?y6C!c(bzyIw$d?j^#C*T}jSq+a~62i)7cCzcOfv>N?p1{AV
zkElNRx6o#62pH|oE=j{H8*}jD8Y|w}mX8mvEyrKC6ydL%^YOQB1^DZxEWEhbfYXyA
zad=c19%u{|Rp*EM8~wD^P#&KijOXUZ;YfQF4z@&LPqo0R+6nj9Ibv_aKpY?Ifn#Ib
zak4W&fai_-MS-0X=)E#eRN90f9BcEy-r9lKRbemrJ#alc8}bL?&LN(7fFWs%Kkjex
z!mespJUA>EXS!1HcvlKuns3E(b48(w%6n<KgyBo#aBN~A_6&8!@re<5YrRc_AYW>9
zbD=ZtYxcoCO>Velhy%74^~0_8-nhQT54R1E#jT@ExMoBOZk<$y+h#OieMcFVjV#5&
z;iVe*##g3ebg=~;0=`lC$rzp$izaIbs*JwK=4&CNJdnee3;Xp$Zpc7UdxMb`?uLf^
zR4kuA8;y-kNK4OxNubB!7KEOry3+XeCL5AdB-|B{B}jU7idg`cj39wtXmS#w%qGMN
z1d|wkN_d-^o&lREM=Lu<(z*d>6d0Pgc9?{$3{%-b!Pig{dS-Jnl9J*P8xxJNkT3-L
z2f@e37w+zEaCLVU(7DLg1s<Mm@bdD6w<x`eiZW^P3V46y-NuHgHw7yN6<W_cdLu+=
z%&#D&PvbmS{%|>@gGRW}X#|e`Iy5$V)}e<T>myib3YMy@x?cq?LV?D5cuZiZG#)Ew
zd2U%B&odv>6s)*d0-?u;@+ytjVt#JbF#*H-U>z(^*zg=-!*aZDmB#!Ef&>`1ET@3X
zEy2Qa9OA>CX->K0XCa7GIo8Ros^d!D6L`uM?DT%6^XgcE@;`yz*Pnf%floQ%rw5;c
z-p3z&Am6LFgty;HT|a*ODJC`!LzJT%e6Q*UR{<a4H&7tgPyP)!Y^u9Mg9DfQ!tT)I
z1U#|Og14_O#q%p{czQt!o?D!Tb90mM(y|l@bIo{uaXe0m3VWo3A!axZwMF95(P0us
zhKZ_*!yBtCcy^{0$2$@w^bEwwaS=G$?vMNG>~VU6Kc1ZxhZAF>1%%!>(BgroMTH$3
z<$}{=JaKxgANGr)+t=WQyE$7TeSm<^1veLY;EoD!Z4uDjQtsYbPwZ<B5Cxee^(Wxm
zv?Oh?^o50ycztaWKDs^+XXnRaSECc|8|H=!OA@iCc@S>MABdX@L@hRY;_e1#?B+0*
zVZPW=?u-q^E&{zc+&4J~cTFn5UDGRZ>$DnN-BpE+oz+;;A&PHU5vJ5+VO(h{MvKzx
zC@_i}B+Ly%d7_uV%?0@}-pGsg)Sl;M1`vD>0zo$nEl$VQwJRme7>~4!Y=NGDiqK0-
z)jAKfmk0@<G^bh<keRFkz!Z~&s*)z);#&Vya#|iK!$k}<z)7C&*&bG;O9;#b4!DGl
z_N-3SU~F70qGBQ?Yz>Bgpf9|9yhO!$z}3wiF0QWdm17@ocg;OLT;U<$b9WN}x;jgE
zn=eiL_i*cJY^ZEZdSk=m09?*mSh8fvZ{yk602N@Eu7w@qoT<Tv$LUo(<`gVKfdg^*
zLtU|Ag_f@UwHD_-8<O5Qm$tgPTI+@7d43lwSFY5~aoQKZZ>E=&lxTyR2?^%qAZhmA
zGC$An*s((^!v;TV)+{ZJ_rO8bZ2T<8`(@o6BS>fcRH5qLHf`FZl~?ydC;0e%unr!t
zU%y`a-Z;K*!GZ<9mF4(90)gP;F~3J%kNX^}r^fFQC<<1uzy7*5XP9~DYuB#TAj?Ht
zRT`bwS1=?rmFoK=@bNsarLNc0uaw>sdipfF0-u5&!&*J;=-g-87XmSEmEz+$!cqCB
z64HMB(FgeQ(~ogU>iJ1PxM<W^5gXC4`pWO!t}mSBb2|#?21;1l+m7GyRoe8lUQ~Jy
zO-#Y@SpvBwSvV#jd3bCDPE1Y2le3cq9LWL{BVJw^hbLwQ;n7ZS9PJFn>B)&W(G`m`
z67rs%5rwnUqXl~DI59dA4>o(@Xonx3ni_&f+nw>mWPdy*A@IRrF}SV36T55OB;@qO
zGt>RFS45th5{N^s?gC3LALWBv3q5gDz9;Uj3B%5EQDb=nG~hkj9wwkm#^V#?aC~&A
zs6G$5ekfjC8jJU~Tk+TJ83Mf+oRDjsm>h{0mnY+-oZnsNi(8A`aCe=A%){m0CPv}p
zOe1b@4#c`bCv2~gkhk50TgRs3wuyzfeR?(SoZpJ;XEbBu_*yI;QG%HbxtLI4#ppsK
z+Vc&vO-4&*B+3%IVV5h=D~R)!5Z4(wLH63*+-!m2$ck*-ux%6O%$WykR-X3m3~eC-
zOA3+&Sf+FVRJs7oD%(^ENiAjxLsO8RYC)PPJi1j>p7s)P0Un2yP|2|lTRNxJ7-2H8
zlS1-~3N)E5h>VJXpML;6M6tOG&|ExS;N<Qk?hI#vn+ro(aUU;tcnj$09xhIB7x!>;
zhL@ZxFDsSi-wiy4zMif(rKkujhHz>CE`R78AWV2rIdHHq@0D+}Vh<iahg!*4O$%eg
z;0z8<#iG6X>Z@9#=gK2W0kZMZ)HpjrlS+tS;I#-h4u<C7U#`r;Mof4Sd^Ey@1DaW$
z57pHiTQ+!xg#;OAd=MZU7|a<T1oqUaQ?-1ACBtr(W#cBqIHZE-_&JzYg_q33&&SWt
z&%r^=%+I>`T~W#Mp**jt79?Of4&de=0p3$hO^vpC3xP#vd0vB`gM+)f=cB%_)xl|Y
zJkRS8s00p`9k0)HLWf(XDZdi@6!i422b$jXaQmZkzf*j24}6BFlpWO^oyUZoQhzGc
zRq2Ex{R^%9C;0x0&+(H$?;n5v7;8Hxp)fcKLt+d_a&(86K+eUk7aRn9eF;4IF4(!D
zjpI8!&4!b6GjPxFKpYqsg@;9v9h;blS6AlZ?e*FC>sFhDiUv`BL3m`WD-L#eYn>Ml
zGzkDZLUDAopM<4@@#shaQKLWZD!0eI)qQb(W++}*5Q(QG{5>?%S6~>8n+qh=EFOTJ
z6<0}l30Wq3;KJ+(oEjgD`|CY$OQ9p~EceAd)gidMItaIvy5rh>d)W@eff4?AUKHtL
zogtzY-Ed!n1I|nd!Ko>}cxk1C%JU;}YDO5ImvH^e)EKR|_s}RW++I3Z!rQ^vJIovV
z+eB543cy36ME8%2$JR1;Y^(Cat!)PE9G{7uQ;J0CRpO5M!?1I4JFcGAjOCizE5y{A
zOpGZqVpLut+C}NLWXGW-(GR(?-pCMDmlN%V!Z>&2gbqYOgcCACT`+1$9&X*S4a=7<
zM}|O;N{=f(7)7NKc1Bw&jGUr&dHE@;>=Sqy;+X=33;{&Cs5YCZJBGttpRZfZQSlkY
zlVLC<A~HG>KK_1i^>EkRL&8*7cLAKZtANf`LRVK0H-VlT93{=!O_ZIeJuep*cnS18
z1&W$WsOu>(EF<(T(~TQ_pv!%AtinvaO`0@GTLVv-nq$hw#fHu~xGKct6fQL<mqyqT
zm=zTjS_5G|wKUcF^XD(;>NZ~czWeU`t!^3t!sD}N&;B;6gN>P{rna#GPM<zqTP=k~
z01!|F1mVb~uiD$&HBb@e1QU;$PklBO+A<G8%zeU$s*cK&$7(i*0t$`Cgc8efpOE7|
z%d&2QPC;+mwr#&DGoIu9sxX&ZLXY{VP??XQV#fu)PhOAVHO~_=EW<R`#d6HceR@yu
z(RrSbRQ~V4r@*HCN@;WjzR%>I6!gCPQq<g6azEnVe(^8x5pF!DfAyItK=E$`g7hz>
z{MVlg6s4|9AIb0J<Ik~*?~8GEM^!`;a=b%D?J>;lBOz{Y330o}?`dj}s_)n=6V5Ho
z5S5pRlhciOQo__zlTCPaX+GZHSd14ICE&RQ5qM*@0Z+~d#^X~XwN8z*lO??D<g5x$
z93AC>2kN<Ea+0Vv31Npi<JsAfIM6x}$Hw{!d;)M^V<2uWals8G{Y7;-;GUYkqTW34
z*w_FZk@I|>x2wtxdz%8WuO$qRj!VR@CO@sy;l|1V*xlleM>+#>WPA`F9_20yQvkOh
z2B)V=$UVjlds+tJF;R8|-l0+cI5@%!d*z<?R6FC2vVnL=pm|_a5bkdB!0zGx0=`(>
z-5P>xYyGghD;0ZZ7vsJKwW9JGaH~M?#swp=W#%xfnoy6$9YZjug<)<c#+RjFbV&+^
z=9*BE8jHf@U}VL(BRf{Y!bCSwevZhFbk=%r$2Sha-rH}((s>I6ST<PGGm$FcC?Ur&
zd);H^1Y)Azv|%FxME36TIUN;|HhwP6ruDR@u_GiU4GD?Kh>nRvSVTAig8bp>?Ex1L
zQE^@_aPe`4n{4@<&RxLf>FyyeiqF+URGg<a9mmDP6Hcyf@O1GK_mHqxK<PCY?p}`Y
z^c6t*dZ2Vj#cy=uhB{6`kNaA8hMZ9FQpRIWIio63KvZkvv4K&U5DaXLbnY`R%Q63{
zQ>V1_2OoS;D@)^?RF)^m^u{t&e^hhod7oOjgesEPrLxjri{}UkLWI{PaA*q5Z0x*$
z?yG5Zte@c_!9lgiscXDfmgUMUoGQk8>C8`sN+>GzNB}Uu3R^i1j$ol8rOG9Q_}vh;
z1RIZ87lFuh!j|DO?}6obT?H-H$9zm@{e)h9eZ97?U-d<Sjh~(8n4jh8>YVZ`K~L{j
zf}j5QcW~?B3VvURfBnVfXLnS4;@=DWE`9wiE)ss1)0svnGB2U`#by7Tz~pr$4F2)U
zi}+5!H*NHA1o!WQnot8O0um86zy*#H%5s)Q?>_zI-@l&*zW)4odTmQC-rAOj=a#49
z!qRj+GcR4jMw5Weh;uU}oSYJYC#Hqs*?A0Qqj6z=Do%C9;pEsTJUulY$3*!(F)>)e
z)M&gg$Al**#)wDY<wfy0I>rT$jrYN+N#S^KcmQrLb-*3fjyO6#0>?YUaeQnT4h-`V
zW#}wQkI(Elwk{02BqTjP(TtsSKDefI5N@q=6HxhL?+9N3ToBIAj>litS@Fgy6VA_%
z#rXvZI5RU^dk*=;^mrU@=d-_1JTlw|N7@2Jk%}Um6py{F{<xuHFs|hs-PR!7)e?-G
zn}e};Y6c!!RE39Dwc@^&qj1-X(YR&V7;K;4j&;*ouwp_JmW-~!?3Q9ouFXQ{kW91|
zr=#4OfKp2Yij029OK?GMoTI3|fyfZ(Wyc0!!mv6V+I=tPOqqr>QELR7nM>Tz&770V
z&Ia3W&*{|QWot%vX9b_-8BAuxCL|&vItIa^;S#zA!pqwio?hMpF%P&?%`tTK>ULL8
zXSfUWJlq6qZUQ}5fg9aL;O8QMb90w6p73+`gO`)!@eqaQIS9VqgT<ZU?c;$Vm6eyt
z)(yJu<)yj-cbSEkk7QqzjshG_Jug$H9_X{daBeKmDOhr=_Sql{3k$Wi_XvD84jQ3H
zcyR?{ngSJ#*VR8iV_7b3MSI|Z2edT8ipKk7Sp_dDHU%RtlE7>59tlJOhUZv@(4pev
zeG)1RCwa`ec%OX6r_S?w1R3|K9tk$8GQxy_AxNp<STFNZrSTl^iLhnegbtynlpZ0-
zKNf^7_f>xGvkoq!#eITm=FFKII9X1CoFL+7U>>IHf%o5mo^k~{y({R^^^oi73VKv=
z1RI^O>j}N@<UFBAb;n^W8t|lE4rcyB!rHGs7BGG)_x<S?7&D>{p-%Q_2s5ERI0cb|
zJmB1apa#5NeFvg%Z+rCX)ffGH3-sQ*Ap?KAAro(HNyA?@XXB-188|;jz&0rfCnm(<
z@o5R-@pyD}sHmk-fuT)QR1BV&Y7jMO#fh#UJT)~6&&)JR2<wdpo4jyrv_D>26fH_D
z7|%*r{LCDIUaK2!Dei*@hI-<;d1-iTT$}(e1iLETaZ7;%?ksb`&Qe$GsSCvI6+XDL
zCIlNZ2V-5j1GX2q;U>PgyUb0}y>WVqL7Ve?Vqy@U5D>q*Is?xyOVgf1o|qhsg95!{
zV?%JdD+=cY+~;SSMZG5Bt_E-1Bx>^7N>}Zf-aQ?0xV<e7caBfPo;l?>w5|h(wob*q
ztut`<`f0dn#UyN-KMosawqxa_7AzQDi>X74(OH{^5f#~}Oix4=$L-m|QEcF|Jx63l
zI3P3175T|wnA*{TgZJ&mn2~J)I02h95))OPK#$Mh*xSnIaO|Mqu`MHAQ+mlpqt+1-
z5fy>Jpg{Qe`ohCgpe2xV6PUS5XzJ?hra{b0Am%A)gr6HbBV_9-A+9^&rYSrDp1Ze%
zv4kH%=ng+odqJZ5f~1}>PbWl((u?(Rf+5fw#^4~dRn-e!gcOu@B%DGW0d;xzAAJAe
z6^vjY*a!u>T3?U*?9hmdi_;nijSu40J{8cAAw#sq2e=$O8$FHB^jMzHxA^RiMraT;
zZ2VMWTpgLlMoc4g)at`rSC0T=n8#;lv>7vI{1)EQlqzC*hMzW@O?#flYw(&Z$7`}2
zLr|s@B!nJ24`^zL3ZG#TFzT5epS7_r!j<<*D6%X=d#X(WhIv>QJ4aYI%TY1%^ASV@
zJ0Z>Qkoj2`^AXUkt*shhxO{$ebhK8EdH7wk9zsR|j^%ic_oveIu7@6Bt^7*RQ=sd4
ztXF&*;4UBk&M!lcKvSSoPS6o}1fRAqTQzp?D=I$1Pm~_d{rJVV_~D~(@XP1lpuMFE
ziGFTqiA}@M2pggVdam{k=+oC8z55P8f4lyo@+8dd*9&&<Uu(g;+f8_DlM(M-orPCd
zrQ?<5S$KT90Y}C~;;{+QINBMBJxv0lDlZ%ynS^`mLa={C5ROd{m`QkhO4Q&h5+Xk$
z;p5592%MW9jkDAJ@#d-+oSVt{vu?PrzQ49Y3W50eq!?|;%dN%ExUJYxVCSSgd)sES
z!_CD`xUJk1Hxzr~ngUPUTp?<#A_(_3MoCzkghx6OadwssPt7#o+?*slJ<ljg&y2(4
z!?1gpyVhCpK&zK1OK+U)io{bAa-SC9J=z(Edxi#K_sA&R+z@~}W&7~VEZjTMifcz2
za6@Mn?q4<>2e(eg-pw;{*SZ<FdF3=*y=Vf~&mM~vQ`<0qTob0WR$yFxAw~?zMpafK
z>ayd}kQIdrqYsMW-H;pYBH^wF^39Q0G@}c5+;Rh|r7>$^tSGnc&{jfMtw*-oQ>Bq{
zrUt`W{%`{WBy82dC84UTv$&JMP86I5HBoWyZt!yT6!(I+K#kyY7ofShN`3*Lhd|9$
zR35`wjzIVF@P?11`3pD$y*v>tbtlR-%z=JL3-CaSD7|D?dsw|)ksBC-&axV4uf3&!
z`v;X60jC}OYxlC$W8+h5jg5*ckLZCyrALLs#>7U!#Y9yo$Q4nz!pR$NyrCU)EjiXj
z7_tnH2^%UHp69bZ!b>SjZrM{wMaA>1mmp!|<$dwFT6}!Gmd=g~_MEbQ0;i*+Lu>pz
zR?h}`k4)p@qAI+ts;c@;J#rCI0!XPpcGjr(b5M;j@5YTAwfkbn2I0+fRB^1A_dzJK
zJUdF*A;R<QEMf00$J8n45r9h3@wuV`B#m3v!RMX4SLSCY5dTnc$$ZwK9xJfpg5YAB
zQgf`As;}a<`+)!e|MW>jK~yL7=n8y#==F3x{PeECM_0h({(m_y>4YAi<0<HAp{#)H
zccHBu^BJESxctpmqWb9Cv%N2I@$;|n!+W3OAAkD{Q(J109qNx^F%}F7OhSmg6P)_?
z7j?(5wm<sXb%(lk{pGj!;k6lfdn4!A8u9-2Y`nZO6|bz!z{@K#adu8J9-9;?VDrVk
z;eI$UG8Bi}<FK>J8#k0W;f{JyX05IQ9DlsHJOwW=x8RM{Hc^Vvq5wVc{zeO4UKlNk
z%Ln@!2H??=uDCEeTw5gcfuUa5S?PqktDUg7Nub#1C<?C+?rm@pHRmGGbHpu`-ngSS
zSX7=5?yd{Q@$n|9*MP^mM0t*hz{$xHUQ0NAP(s;<MhD~JF~K-7B?hOa#^AvbqB;e9
zdz-zmcbGp8bVTF+wrK1YWp{OjH+GJS!_g&Wcwlw`HV%!$=8+cMF}D$W*G|OFW#e({
ziYd5u$s}x?+liIaM`Ov9Hq7lDhAAU!Fs`{2BdT*ym!FKr+yvC8N1#eV+=?V`6vVm<
z^t@1!os9J>=40uS`N+!2(jdpsl|7amplpyIX^N=3w2Vwm0Y=BfARr(>03{)*7MhA;
z)1J#YOPY(;$>8DcB+zpdz&XK3VCU`Pt%a^sbZ!DZhOVwsp3m*vy##jNe(?5`t(>R2
ziwN*Tbbv3Sz1@)P?GBrl3rgj$V^x4V3OyW<<2(oz{vnuCGqhXjX)NPR+h4W!;PQVO
z^(6gQX+(M>oM;3ejSZg-Q<(x4P35KO)73r|jXs?XR+Uj(1zMU)W1}Si7_KVgIhOBf
zDvj4tb*VI7Tit_7Q}ysOF~8cX&&z$DSLIY4%*S-z8@GfF(+L)pmk{DU>!K5MG<D26
zn1}a3=&0+m4wh%#te5AQSE)IkXFiryX=<POuLM55+EcCv-j!SrKIMAg{g-@_&KVaM
z2|RJFBSRpk2cOzkN>2-iIbd1*tFHvAUww|R*mL{d7r?up;o31{&>9tuVUb2yode+|
zVJ&-Z9qjtTUR0jlRsCScMF{NW59FI`QbpZ~(p#T~zwgM$vx}2)ZoY)Bt26P&x*Q1`
zEqG+CzbK_(JUKH-+=Tmv`Qgq+Z|ol(hQpJ>@j$x=_O-d;>G|<^afuNpC;H>*S)q7u
zLn_`}Zomt3V{me;kGMBpnjeK1=EsR@3&g>WFg(IV0mk^tu^Wze`Qh|bsc*O^?r-tO
zo}t0Gvo1iwS`XY@>V^j-96mZWO2X_w+}G@j1Ea#Ud%0Ia>JwAq@zflnfLfq9Nq{J@
zzJDm+ncGjwJK!D(XK$(T!}V4E*i_<)>xPL+otuwcQ!}xmIuvW04Y+Yi1$M6(i#rIt
zr4w=8qKVizrxPotbztGd5tuc$852iTV@z`iM$}}Zsl<%>>{wK#1a;5HjrT%Mq#Ls1
z{LoUJj}=Sjpk?Gx7}HW<l8{zYbX0s2&Ju#jqWIzs282gN!QbCspeBlqJ*omdZ`t>z
zq9gDGeC}@Dz;kmStnGWT6T;P7%6P-wQ{2l79^T$?^P&5|&C?4$UcLf5Zv=?y3-k6u
zLP!8iVF5@D_Cca!f22FxV|b(=7G#^zl^lr*FK5&Q1Y>dYNG-G_3W%Pc1$5GMwO0jc
zFA2H~2^t$98zZ-L8XF>w4NRrc)V_ig8>BMT(Dg<T&{R2nT{JdMhQrDj;!*YSd6v2d
zWvokGLw`;|O@A$wM&q%%PX#Hx@tEfoTv%2)?~lfP<|Aa7Pknv@hWEr{!b$;=*Cc?L
z#$%S@KGS%P`|2E>>D;Px-iJz8U}GHw5AQ`?SAkFO%**q7=>3uZ8T6E^JbM2|{QGZ2
zRevk@Ax`j7@%`|P9A8#`1fB*$?hELYGsOM$vrqBaw*tK{KL<XM@A&;Muy5vUjEjp$
zc|atB`#4~5zuxHA#}1AHy}@<^U?+;tjvZU}qV~?uNx=(?)9~7wEWEq55Kqoc#N)H$
zaBhJCFRrlQ%?%lNa&{!nOb?ZON!prv$GU>?(AZ!+Ha!7Hr$pny(E+%<dZ2{65qNrb
zBA%Tq@LLp*vonJ5#1ua~Hzxwm&xysEu0WjX48Vg7Lx%-uA?LBKC@nmGbeyL)2>SUY
zNqD%!2M=*^gArl4r!f$B)cE3tqQMegyW+9VC>(4H66pDhnhV37)$X{h!ckLwoPze4
zgxtr+2aDQF6lLm#+lLIm^~L>hd#x9)6?ksUcg5yXZ>+EM!mVSHaBW*WmX!NrZBqia
zwrAninT@!8!6@7~uLC>gj=_c*?N~ao6?4Wkips0jR9;(M9)?z?p}sH)Rn{<+BzOz(
zyipkCj*Kuj6q+KmXAB)3!;o2!38T#-;7gUTmdiKUkeWgOS`m{N2mc^;OnAdfpye%~
z@^TlLiDL6`an%6G@YYq7ovVP39Tx7Q&OE&&gmrZn7<$1?%DH%ZYUA%*z1-pED|Jce
z>g6S{6SWuZ697X%5DbC-Nb>c7(Z>~5Ul&+h>`~}77?Vtq*i@Z|nd$K;l`ywR%1^GT
z7cvoc-BB)^{0{=VpTyazAt?ASji1H_MPnmoBhwojp(?}k`to|1sdVO7rqb9bX*|b<
z&DjPF{iyC#Pc6eeHb@$;)AL#iNHqO5SRYN@3(H*TJdMEM=Ti`(se7XFocjE1<OGqr
zCexXZpM~i(1u7cr<UZ@8t81}5^D&KOnZ{%0=T<q-^Yas2%*QS7f%{7NvCO}N9`p88
zd_7%3?BBtzXSr|1_0YQ{u>0{_0r&UvS!GM`5qjElJ}N;fJdW2B;8FE`^64k|@XL?!
z+1H<oD*Q?tr+44n*%%cUjWlmxxb&Bh*0B$+a_9}G-u>Zx)d2Lmtn~W!73e*;AWJ}#
zrS;stx;96E7mueGCgbhRd3a@w4S(BFh_^SSOX!-6*H#$unkcv9UA_WEzB$b(zzf8&
zNwL^1VdO&sszdDoI5Q<mLQsDk9_@y6GsE!OG9#X!pNNAaJaM4K3-{K!YaIX@LFK*@
z_qT*%|HwcAswlz9L3mr#<_k-cB$Rf+UA2zgDz4EFdqkz(T;hOT684^-Wx%NkQP@@M
zfrr{6B`o&C?s^YAHa-%&8ys<Su^sNHyb90Ejm6<HKB590aZ`mo_6!fkUCp7`EYMq>
z>x`v&195e8Ft#>_VRcmywhT+c>iR@%7@mQxV@t7ld^I+7)njF60~U1DVEXWKOdM8%
z(e?Qf+Ge7@$bhQsXp|WJQ4r&SvUp$QhD*p8=8np&WVE%`Ah#d`@#aLBGP_qlF^R%U
zw`L&KY(ZQ?EP}&=wF&;55=NEBP6iLx%Rpn#t2>98cy!OC(8l1o3EaBJ+%cV?bo2Ir
zlK|0a@Ibf>c942p;4k0`mryx2AW-fn0!1-#$O;WclG8xM%U?fJKRYD%wL`vZf3!#Y
zU`~20Mnt$H*SR+eL;TRy*g&rR20Xgx4-I)54Rw%WqyHm-{gDwwDjyp&k7;b|YM;h>
z^~W^*wP<?i==YVWG?wdmZ>ny6ex|8w=&#Q_3S2Z*mjW1%>CDf(EK9(sG?pb$)HzkR
zf)|guWg6?GvkdE2FryPl%u8q~AgX)dIp$S$>LI6&>8k9Nz}FLc%6o!~{%6p;Qhw#%
zNqtJWT_pSjdOv-45x-oNdl%@bXMEaLTnlkITjR6t1qr_V@(X<Q#mD&a`|jt2zkc))
z4$PZ_DTXAJ28ANt&ky~b`=Xa)Z&7^x;dqrjdfRm`Fv>A|cCW53#`#4yJkk}8H#X+u
z#EfX1Ut+==8*=d4+AO@cwFqym%f@>fbMfTVFq{<N@tNHVOD%YAi4_lw@D+s@ix-w;
zWA9K2Ifwh<@hNe5q}^AJodts4n(BIUb&h~33imhrNqPwGt`EXp61p-(-QDDhUG?^O
zbgUPiTM#Eu6*V}*OW^0Lt&Fm7cmVFIcEX-|C;UzB@rlV%*e5_dIl+L3h6my9A%k&G
zwS)Fzp*t(=1g?%aGR7VIhY!ZSky3Y)vp_Ep2gWAg=ITIfEA_<2au@6nwYW``<P9U@
zuu|0CW>I@vT2ryCCK(HBQZQ?X1+(fhF{3dTQ=9V8RiA?qWvOT_OF?x`3<`{X$cgnp
zL97>YBR!B6?g3kb4~A9dBPY|0)Ql7uY-S`{Q(&}maR8gPUY#+?fPesB`1*OmQy}Ky
zE~?Iz-VHoHqoezHOUNsstFyq+O_UqMRwqwSxJsz%?!_r(PH=V^1pBM(v}rdk;{Nvi
z5$@)Kw5SM}0|QYQ5ryK=Fj$-&k?A-H?GfRqmi_F3eULfO4z)h^5*GNNGbsS|k)Fs7
z_r;iD0zEM{ae_~4>TKie)c94L9N^|M5LBa9))Ovlth9fZu8!HL^;>1?dOg!wUX|4w
z8?`c??^!SFQ`aUq{>WHXU7HZ0sd5AbjmHEFx2#{~SNFg)l~)0ct_K?Tna1-fT>(v9
zSHbCr9|*QT_Dh#IOyysG@gf2DFDKkoT|EKFbIKI}uLLi>n(GNam3F24S{^>5<8wWM
z-VXx09|d}v%Ra+g73vaxe8$HP4K+yl%P&61rvkmtzx+hbe+7K}86Fzng;nv%=nRNK
zp_@PaoCaX9OMeWs?~6hG>@l!cfAkaZ_3PUgc26%($MI<icw<u$-n+H}&#z1qIQZhs
ztau!iQ1Q&Pc)YeU8*i`8)mB?MH#1xy7KJBg#^SL_QFw4<0FI82#_7okI5;X;LRy35
zNyh#bA5mxhL~VuQNM{6|n30TqBZ4Ha0lON4aC3z>?r93a(Jm?15h5UT!NGP<oS7Ob
z@bkfeHZL3-FOZ!aE70>1=ncf4hCw(tBNB(&JVfESYv9{c=Z1Zat~e}^Jl^hwC;2>a
zdZ6|i$#ZkU@Wd<$ohJxnhkD`uwg^#s;kdTa8+#`t;l7E9xUn@9*SADqMX`jwb&<Hb
z#e}sD$yi!z#PpJQOejjgxS}MCudrZjWh$C;<4}_kiSo1%SR<W~9_|clh!ax$2gB&^
zg!+6dD)MX+#u{O^nqd&&8Eg{TrrWfhT0^1%(GijG^Ys*{ahx0n67!jzD?F(5MB%x+
zdBWMr4Q}q<64H9ZMUGvxfyG`D!ur6|mvd=7Fxa6V`U~j1dx>)EHvkz9PB0H}KwSU+
zFi4#?e_s@ZgdkIZY8lWUH4;X5c=)1Lq<*clGlmEIqDItRM~E|~CI+A})D5Lcp=fWe
zkyhNzHlj>xzT^ZCxRr!|{L&2$HcXlx8f^Hg;a<rU5L8*_XJfmPsk~~djK?ZX4_scC
z^{I3EeVV!^O~HwIXzCmRqadR%%eq-ceLiI@r$DM;rr^cA++V(Yxi(vZ`P6y+_rts@
zoq2k?f|1JC4L5$rSN2Pn2)^I@MNv}S{(FVj6MFQX(9?rYzpp^{JD~j==xHvnqS9lS
z`@?qvI(iS_spor2@iENp_OB(s)cx5f_(k&KgHQ0%idDEdH49f83(ysmh*W161Pr!^
zn}a<Z9UL&Q{{T^WebI+K{f8z+<H2zuI5j5;&##g&c5Nn3OpC(FsWCVuz&k!6Mnc6@
zfl(ZuUt|>cS@7~Q11>Bw3lI}=s6(JOE&>-8qzDX?aABbtFG;9;cyy#dN>p7-h^V<J
z9G_&s;qmb}G%f+VB&@u>)(`i!gyWGhu>!YPJUS*sLgFyl`r&~VQHY~`@#wffJS>Xw
zK2eMhkMhLvPJan)1Mv7{35#33a8Io>_7C+C&_&`&E-NKU@r7BTczHntPEYg@=!M|X
zE-xIO5Qy7FCGHv)fO`dS*Hua=J~<f=OiI9QBSLZ8$S7>C_Q&?-XbE={v8pB(^U9(y
ztuP7`3S!Wf8IG3pa170eK&8na#fjd?jdeqMga9wp8JVJxl6(gvD?SvpMOnzO7+^_B
zhAGvgE$C^q3iKqrHKwE>Iyy!hlH%jVP73GCofO^Bb7x0}yNAF|!dg!s4SJqle(?6;
zGdcmC0MFOgPuvIIp04nef7GCX{ZQoRk7g%tH1%~tv0XouI=R9qf8p{2{m_sQhwAV!
z<hnScPU`J+^v1fF6ig3_LW`p_h7IhE2|j}{Gt>i<;sViNjKZjzav?%D^)556dq*8~
zGeT=9G(8yDNUmgTxXh=A4vp!`c%H`!LNw-QBUh&1r>SfAEJss-psB68hJu#5e$TSX
zn6KwOD^p-p#%rj0xaGCfmQ(Qbvp{IfPp~kLx*zs>@|foc7v|@8%46;mNZkK-&{KY8
z`27oT3V8o1_>}8m_FsUe^FPX`dSWl~ncg2jPwi{R;#7s2G86#*EXwa^33)Gm^0_Fz
zPjO+@D%_P<j9XJnu{kLp)1pl%_Vh)BqZ0;6^kXOO#BM-8*b{ntM)_&b+dnoCFRV<*
z3oFv_^a2A8jSa%%)01$lD@IhE5vQle;f3WUynjtP-rQ)y%PTE7IVDD*MK#7Hp3)^>
z98OFM!})n8oSc-Xbu{d5@|U^<Y?BjlRMPed0PkUV+Z2QcMn+=)2vK5V<8X9LG)@Zq
z4z&8<-a0NC8ms~I@K_0LYwdBgGXNLn8*pT_4;~&NN>4)8ot6CsdaihWP6D2h(EI4f
z!FXzdCtg{=rRD>1dYUM@HU}J<z|R<jdq)J~&gLN8))0W38oY6RgC};>dg7*GA=p+Y
zVQ-@-zVaZ<%Jo54x)&zpg<zt9Z<xg&)k$8cPWD4lth<DOPOt_IL{@|gY+<h2D#_`w
z!Kf02mu^ag&0^A4LE$?#IVR6&u^`#ZnH32L7UjhyoIE++jy<v(@Vdv>xoN-?rN{Ag
z0yq~Jg3%iR-hSGUlK=^ALj`ytQr_Ri5uwh5kTlo<L)^VE#WetnJi;*1)epm5+%d%0
z8_h{^7?zxflAr)&3smc*o(b;3SRQ7?8e<kZoIEgILf%DzURWI!fW?X77-@*atQHAx
ze-;=BPlyx^3)~TBljgCu6<{dnDAmHTW*lV9P6fJ}0MC_6)I3|3V-G6_^RgjuATP(!
zab*!zP7V5{@qMveE{o|L3&+X+gakptv3DHs%S8cLmwpu&4w5DyIE9U42bpHITD6yl
z)6~0hxjG8-a-Y-6ILm>vGT0N${A{EIk$UefKZm*oU)RWMa6T^w3KJImPWbsa){o-~
z)f6~2_K%;9z~nq#HO{ZJv{airOk?K^KNp>OIrffka#Ni(%a$#Z@?74EJ-b)-KZjp;
zPjOG+DOb>=>%sTmfnU$_$_c#h1YqAQ=zT9CJHy<|;OiOYav(EZo5dla@lO)+{)1qY
zH2IS8(5$K0l#z~^A#qrekd9@>OmsyjqcS8KF>YL8$Q=&SW;_QCg58<<CY+d&i1Uln
z@cQ~ZyeQB+D-d~pNh;3HHE9cPa{Sv9^AqsY(kT4(W(z*sm5rCzCE;+FA0D5Xgy$Az
zO89D&Ff|y*CWPVf8F6x)ga=x^MKQ)|z&tiNQh*mI$}1T=YXv?6o`*W(1Zqj5?hFF2
z2nka|v8UEo_5&nz^~H;eY!V)`r`A=%XD$tI!v1D2+}GeLX)XfP!Pqy{5syy@!uhG9
z6x*Fd?K$JQS$=qKz9_%x-Z(tb6AyLy;_%cc+$tgQwUsWoQ`F~84L-QG&J$Nxd*Euh
z=7uU?tgi{dqGCTx&+)>@<iY5$`e0OgAcn-dpftu+peHIXmP<$tM4GQXEPew->AArW
z=!){3R1{`gVKF8lQ@(r-7Ul31zK4QKJ;f&^A}l;yTNuDi8+a_rPPr(&-+(7z<ID?o
zQg}&vkc6ZhZW8R`ga`p-lAkwBqTq~vK8PM*kL136Q8UOHOFYA{+ARe0JOa@v>Tq;i
z3>xDiP!}78n)q1ch6W))<b9r<Ju2-S(CFZa#R+C?NtQn(b(J_jYACj)#GyUR4-4w6
zC~-8p5pL8-+!AlpBlK?6b(;bojmsqw3~W3E1!q*yIQ5I*QF+w@0BksXPX+fmb&L<D
zPo6xffr5>l)6fVD&T}O|I8d2Tr4ckN$3edA$>qRb1w|TXQ!v!x>LZ+?z(&kvt5}9J
zHu%n6f{o=bT)3dYna5OQJkNV!UQQQNYK`yJc;=aBw9m%@#4#~38pJtUgV3WX=-J!b
zYu6{>ICxo|r*WnT>*gvjybkN;f(yJy4xZ+@g$oyIpJnml#oBkr0l%v3zk{CM|0@8y
zr=|Uw>%m9YgYQ3q-=Dcw<_gf`Gd*@_(DgmJ>bV{VEo;yA<a_w^laKJh2Y<tN-%5S&
zy@R{wPQv8$Bv?J2kRKd?ammS;X|iHMq$s|@eu#JRg7+XtM2KjxyReeW#V6vKMHX#P
z@=MDkd|jT3*H&lYl~p;Co{lF(x&7t3BD}K0f)96R;ER2^_}eX+5)K>j%#t)bImd!i
zq6#0I9IdIumsi>F_QnFdx+)WIZ^*@KYiv@F0Z&Ny``Bb7b~OpqMukWiEon1SaiWV;
z*dnmEE*Sfpg1Z+a9T|WZ1;mHP`s3b42b`Q3foJBJaG=FkTUq(QC~usY5{gGhd*RRs
zC!Fc@#?zApb~F6(yr{f0(>(ClR1X}KFn8xLd)zhB9b3!#VMnDi?iwD3JBEki)|Oyg
z-xP>VRo+-r;e$2R!B|-8kJ&|m7;bdJ$TV*Zw*{ag&J}rKgHgm4lEYn*6)Z~6#~!J^
z17QhrL2+6<$_p}JOEYWh+nEJ&CR2*G$Uu6U05I8v=*TE-Id)f14>)NvCoVs~y9_&q
zv>aH>o?1_FA8$_t`gqDek{2Q*3=SLIAF<8@km}=#>_8tR4<3y8UVV@=*b(hM0oW3r
zge`tiSQQ+FxslPxA0RLc4?<mhG%Q~3@U`y)Uja#;hd;9IT@f#hYNCMnw$ehJ8QFxV
z+Zu2~S^{RqhT!^f?Gn35=t@8d?G*5+@EDvCf<OO5(|?-jp+}`f$Z%>DVL<TFIH8_u
zO665A1!rR-kf?;1pMZGox#zTVY}{%Zn<}T~loC*CBEGUQW5#H+CU{L1dUC~O0+diw
zAtaBvAOPW|U$B72x;ZV4#(H?nnH@BOiz}`$olbB&IXP*KnCeW;=cV#fpHJNfp~myP
z2Ywa>eL|6Ssn12=5a!IwedgsX7Y=HtiY7Fe|EHhW$-(D_SN1=LA4B6m2H%y=De(R8
z!tb}dvfmSWKMCjvJLQC*f*yykXakqUzu_N}pMS&`Uw1FG`pc&uWB=0GSX`6=vqL`^
z`uD|TQP6YER&)g?qS`qKdA?zq;*0k4h22y0<8Yd5;?0TImXhK)yjNGI;^h@KQGHo>
zX>}n^%rWAnt8?(yZQ1zdXgR)lv=o26H5+ek%Mk@=#G$TmoL!hK05ss0H7R&k(m%Rx
z2)?+vL4)2qn{)BjhIIU8Q#Q^_kH-<o|I~tbJiF8&iqeA9QzX0{9fc#KB5}Ga4o^;%
zb8{2${?=?<SQ3L>jdpmr!$lkT%omOCX>`QNX>!f^9G2pVJ@tKXwA~pm%n8M7i=#vl
z2H=zc@AMphJUZPE_ej~Bng(H0Ngv!kOv2?6;kc&88`m}X<J!g`tSWVvFU(zl=ZmSi
z9+;TpgAo=t3{CMwbE-e8l6|zb@~i;@IlqC(2^H}94?<2b7a0h^u!=n7W?GPH<r~MW
zNHUs{VouXur<i86ASosuA%Q`fdgJm=gGJqOnwUFfw}h_@XI&+HbrY4xb?|(+z^8``
zg1uZ3>NXfrE(4I_?S$MQ59IsHH9ehC6B>?|=mfMzM5Em=2<yTVaI4XVC7}^0m&TLT
z?<$n|d!a5N76tJ!hzRk6tIJ?`^zE<p^!oPijWjPO%oEVvQf$Rt`9|!pL}I>#>9>rL
zF!aaog=T^`!bY2l_IqbzWCK^mhR8Hdui_LhDkds2E)>9LQB*=yP&77tPLrY%Vguk3
z>TK|AP&Bn#GQ(2BiEHNZJe`o@mOpk*L8B|saJm<-t?r9kuHDD{Pd)XNmd1PJF+)o=
z{Y@=f#m249F(2=f*XA+d#SmSu*3|M*Jg1<pj1Xg41vLUweSUr)3X1%mxTP~cpXsqK
zE-*l2Sq{MFVg&q7IJJ$R<=;V1`IW)f4ZSPD?T^ywS5kccEAUh2RQ~T}dr^S*XV6n&
zF5#y+_r))MMG$r`S^(L<ux>35HcGS-XT#cX3)UuDF+C~~b^f6!4~ar$OfpiWorJi#
z!0z3v^YE%DvA5O<oHmF$Tak|E7pLI)rB)oB7>yIt&Cpg?UY>?$x7hI7t?9V1!-Vr|
zEqG#)8K)!!etf<e=S3ktw=xCqTwQ>-Hx~#nbMfTtR8e?@XC__}_4mwTGoD+Lj;H1t
z@ya?YUR^JWQm*~@v{-F8%ZbitT$q=NmlmbrnYoELF)<KN%?`tfslGTnFI?)1mhjmd
z?{3M&JKM4Znu*xkItX`G+2K&zAUrK9@5!k?*gNzpoSNo?XO_j`{K{mU6Nv8|;e*xr
zeQ;ZAFs`ff#l~`132XguO>-EQ6?<Y{z6a(M`l2h-17kBq;iY<_JbD01Vg{o!$rGjV
zp2!GvfYH4VGW;Ep7wnGA5Kq+Rq@YAXTU)vr28&S}glw_#<vZ!H%3m5|oB@%+Ves+t
z5zx88>GHaEZUR2`)_RG;;~-<gkHb>jeZAr1=Y<e|Pel8=AkNhu={}AaVUEU>95dRJ
z!%-{nYd0idXm|vwr2M#mP^^tI;6_sh7KKD$n4=@od)Xo1(+!mpQWr!;!4w+-qbU*L
zavUEQ2SZXEQlf%j8FUp!hPY#?IRdl7y)ZLM)bDU<a2oVP*D2^p<C7iPUUm(B-9|v@
zjmP{^Gi+o-rm59el=4xHo()W|`e+P0dCU-V&z?QnKI@~YjsRYR3P{0$#t>S8Mcosh
z+Yu}TD8Wc$eVmxj>kuRic?lO9Luo=zf6Ql!tWQ1DQ^qy>ST`Fq!)^jm-5-@4%ksWi
z7lF)Yf$F?^=Evt=yf4<z?}l5dMwOSap>dz#J$r$9|NklU=sm&5(D*-rp4wN(f28)5
z^Rp`N2|PN}_4yU_wC8yYZ!d?qf8<(62lGq0OW%Gi@cRn?ko&^B?@Ls<0edn_@lbXh
z?n$k{>WCB!b@oMk-vL@@M#LZ|c*`Grk3oZB_sZIAytpzQ&n?OV*U~%E8IOabBLqxl
zoS9|9$?0i0FeVXCEzHF;D+_Q~l-j<o2ppbD;AP>&Y%88$TZ{{9^6~P9Vmz}vABQKI
z@#rKg9+{Aaqf>2oc10PUSy3#ZrU4I(j+b-;o|@04tZX>HAVphh>eS>|33Frc)HD-b
zUYv^uTLZDH)(sDi^uw`9p*S-;TvTN+UfZ02zu%aK*ES~Mwe=?KlhF3=y1qEjHc-@R
z0G^r^i2H{1!@+UxI5|r`hp5Bj0%oqLvNUTTHV^T{@`8a_l5dYKqV(1d@yDzTXH3uV
z#I)=Hj7jxEtI-X0i2}R00Vs<dfGUG43L_mPtn3e~M<1klIv_L12bF0lD9cSpYFaXq
ztVS3zQnl5PxkQxBVuK~gjM(sK_<8$vhq0pM7~Z<LyGRJ@2yfRx2ow+odHTXz+{4!&
z?!f`@2=GK$pgYWg63Tkm%U>%;EX+>E9Ub*pTb7RYSU)t!`=iKf5bE9C1y=D`92ARH
zp$S+iim%G9H?sTmM}eCsnj&M-YA~ZJF$w9hF-SI=5MwYRE+HA|T%0sE5{1#BXiqhu
zGbtX;Qs$P9aRR*`glnRhF0+jrE-%F)%5)yFAt+<RW#i_GB>aKX2o`ow@W;-OjmnRW
zn#ziRV54VVwTLHI8(|#}J@k-v%y5n!1<Xe<vElRi*Sqh&tEo7GL?2S}nI4~|a*{pk
zAtZQ@2?+_mtxC*mE1*(Aai3wXG9FW1(pU$NxdIIL2|X$;LV(A7*2d>>Y`_E+7f_%v
zT;_eTtO7aJ8=p54Y`j0JHJ<0^<9EX6b!ti-|3DDl{9OFZe6B|Y$g%`7A0K>2g>>mh
zInU3|UUBwF{|j_fbW*Lh&vV}iG?=d^^c2vPt2ADVex*EwpB`R4T@OD6It5FXrC+)z
z_pe;if4G8Ql5!WNJe~XM_?vG;;r;lfgts5!;ujwSpL~w9vsdC8a{<<-WMOk&ITo98
z(Hszq7-wm74i0b`)E{onP8jIm4!a|hBJuFJVB9AP=>C=n>>nA8107L#e70G_M=Kti
znug<3M2*eK#gWMt9PCQQ!(B!k7;nO{nVC2>FB?xUFUIj%Hk_WHgR@J^aCCY$_K&mR
z*o<79Sx|y!SJ&X|vI;yjF%|c=CE%XnkvPWXp(KobY;ug&*>QGuGEPp4!Q)es@XC@r
zoSK-7T{WKC9NU9qLh$IsU<p^l@zMqZ&M%L^i5cE_Qo`9|69Vv1yPGIGZ=BOQF9Pu3
zC|BIsFaUc-x?|rsf9x6+fNSdlFvrptt4dt4FxMW-N(N(XwHFrTyQ0f95R=k8F(oSq
zqf>p+YVyQTvl|-CE+~%bi^2%HJxZgUk>l@x3}0syC&Z!DYC}$@4JHmi7NwVz*$ur^
ztAw`ZR2X6tM6m^FGb^|l0HLSNu5fpTr#BZ47zlrNSA+-@{k{COxw3rj7cSs4hX*1*
zA^;^mPN){R%}r0jO~b1&BP|A10fUh1KM-YMo@k4Vz_h3YERIRUask|2scW?SWy=>(
z)C2{hSwJ^NAUq{28x6@Oxu;kpbG)E24GHndNHz!zO-aa4j760wz#-m&SkovWoHUMZ
zMc4f-j~a<HZzZVh2A`Z`W7W=I4krl$_I#=yN%n+t$t9+<bHZY=XfUH9W6vb>uu-d3
zj@g6B^8^<gw^C+QTB=S$ls&aHF0I6{PZ@#B9$-GRV_wx;Ou!H#?EU1D@my7f094^E
zLvVfuejeswZ?O7IJkP~RX@n7<t?_d)o$uy2cI=pTj?WBL4>MJt`h0Bk{ESqZ{QLws
zLwlwX(yWj9xMDPWx(OP>oQr|dcptnzKR3S{uE?@~|9$}xXJT+%9|wjr6#p0S2)SQ>
z<$&#O5bBQ!%I~SHF2k%R(Ddm&U7t@uOz%D6_h;S>JZ}HkzyFT2HzbS|=f3i9zx@iI
zef=rE`sP#o^P+^e^6!3O(K>8Ow4pOF2=h}cxV5Gg9T7%E2=v%dIM`(n92_0sz*j&X
zn;MCulOwUeJsc0V$KdezL_9n;8c)s@B_^uv$V5?mv$JtVRNWy_YR5$l9-WbjqchDo
zy)Y9`E-4b&72*ETCOkMk4JYOn;E2HR<N^WS;(R=@v=HZ4SK^7qC3vta1^dUF1%_!j
zGcOrWFHXi&^Ad4>VKQD=nu?c~iE3P!f>)MG2rWwR;gJE_(pE=0BeApA89VD-@W}XR
z>>2KjdzuH~#FSu>wW)Y|UX+BsA)*Lf1>6p}r_m8NRrbT(!(8QhzPNX^53Xwvz@_)W
ziXvw$%5%V?d<QHpbi!1tJvtKmV64drV=bN-Y4Q;0d0<$II~t5G$P2MUR-henL;IsB
z$^|*W&d81MMXkWDP=4fmy`ssQf+PW-A>GpbLLd2YC&ecqA~+OY9s)ehwvaELQ^(w#
zo#E&>2=>kn=<759_VN$m>gs|(4==<B_{=`R$nXnAc~m@V1e&!1t@^kqj7l@0-I|Cj
zuK`H+9*EL}Fbpxoz#`={2lPdUguE+^5xCkCg=xM6G0bxSMkEBG)+j12+lX0K0bi;W
zd2z8wH5sL`$-QUrwUK6lOaiRNSY*m|GW`RvbY#0o1U}EBvg_`=U{K1bXp(pthL<Iv
z7@@{Sd?n*K1xN1F6;Rl~xv!QsXTwtYRUU5ttbR5&8X=}k?JM=EH`d28Y`DsL=251<
zZyJ@Je%UOZ*WasxG(T5QQ}?3s($v1HpLOwbs$=f+JE3u_zs@hebc2kphn@l);l|si
zviqLT$K)K<AJb_)mE8ZT^ZQe(O%Fc0K1~lgz52WOy{NYDMZsMGc66T804E_VT@OBv
z?fd?#OZf5YixBxI(EBT1T(Sgv3Jb9=F%GvC<l~Xrp;(+~L5!oLw70?V_6>lamj^<f
zu7cf*tJAf&pdFi>grlOaPS3R9P-l#Q&VsWGvhcvD7ztI)I5Ep2aI@m^McH_Cx(TNj
zS@G2J9GqI1k4I<Z;NavOoSY|N?Als9zo{0lTwRYBHdo{EWra9CF9#1#PQ&i@csx8Q
zMe4}HU$&Rxy=#i`^4fF>uLX8XQt;wZ3m%^ug_9G*v_(P>wfkxB<KX)(ZmV>`fl;wI
zJ~0hD1X%Ypx#E$remFiM7)QqY;M_b>g42CPsk-8h+CjLXqCd8m_QEw)SK-Dc2i(~n
zg6oF{VQZZ~mJ~W-e$D{Q%N>Mi=?)m5;(*a+M~pByqB+qKLyfMeOBjT*$iB!6vO__5
zKcolr(T1vUs#&=?4mCMh$P)Fz@pvYyMVooSg#}X5x|Kk9NT`IhK9|SexoJJJ1fG+<
zJqB`#8@`LPpKSZ~!odFh5a!_yn?SNCI1(lP;i#67b*TKK4Kt^q)n-Gr{A1>a`XODE
zUrB5T3goyz{_58Ybh9K>E_SrT^;M}jJEaka+e)!8B~)PUk6Ke8+OiTbGQ<z_Y*vg&
zHNzSbgrvYgSOkD6hIp8g<6xEWHZ>_0)~HA<8b3i=oHQ_j9-BQ8!lt4%X(q7kalLr4
zp1|NS8?OR@+A2V(W9Cz3sr0B~R9LJxef@0o1cAODnjRzycs+qayAq68mwrp*zAC5E
zc|CSUsFjmx`ug<HrLjEADp;#})P1SXqUzvz1!8qh-2<;jXS#yUKlpDWRVwHac69Em
zG@42$^b`!K^c3{ydYJt?*OyV?>j}IoK~KSs|HgC$y*~n<f*xU~haS`U{O{Kv<THOO
ze>5c>pM8jvvuEHyV;%0PtibN-8r)r0g+?b2L^(JjGB_Nrj$R1&^+9`iGVK1ctq{+N
z!aCF$jl*3DIJY1}1Kv~fGjVKc3ih_e;>Z+>gs|Co{pv~yRf}*&!1v<jN<6o|97pG5
z<Dn_(IJc?}uWlcSm$whYTQ|w}nxS~__OW>Nnh|(%#SokprT6rz3cR|t7O!os#2Z^n
z@zUCC?b`2bEy256itvUUKReHW3k#C*`bq(DXAq8$55pM=c^?~Zz@ElX?5qsL)!74a
zPlJmnKnZ0>ddW5rPtFR*xw(-NDu>~Lk)F7#$r;yG_Qlq+tFWo853X(W!8J{R*j(q0
z_0^tOS>}!<#U7ZG?~Z9X9vGkDff1ti8WSDRnB<7+7<&{3+o3SD56WVkVDaq(Yp4r~
zOp&O}wjf`WlQqSRlvDwqs5`#8QQJyr784x}e_ua%NjR%s&coGGoCny$(eA45cXD2S
zAu(T5=VRX=30?xdKtJS%1fw!CO5mG}ahbVjvu2=G6k2&i6l!8(&}KBEHX#9NkpikX
z19D^IknQJ=LMJ<HD>veuH4|~Bs}>vc3}_WqTI@dvbuqrk8DNKT@e$|{HCOE70<&Eo
zM9O`{36w2dox+?f;4>pz)N5DUC}~a7XhhL5r2SExO<HS$Z142TEKv=cTYV!{jhv<$
zx-$Ja=H)XzzVVCC;1n#F&V81p_k@Zvm50{zy2_YNQvgzR_spw4hu(Ny=H-@QCl?7-
zjh?3R>$f~lQ`aK|^w#tHQ2AMw*V5mwy0*G!I_+2a_aVfT6LQLVO#AlR?|-*XrAO=T
z1o_wdd)4RpGgtZa`L6^$I@9zDuP6BQO7D-rr_z<WQ^3>Ci@Lk?v*h`a)6pcn{qjrP
zH+wD?rDb5LEeCVN$EIf?)jJRogZ&WV6a+(v0mG{*v2u1j>|S1*fm1URwAm7;XQhei
z%g3`zb8&u20S=5w!b9Uscydub-nym=AKx(yuWzr#n*y}AuOA_bt{%tc=i#C05{_;e
zikGh)iKo`r;^|EdII^f1PpxaflWXen<jNYHUt1%ftdnw0c=v{SJijIf&x*q2IKekJ
z=i!YFS$JWY31_B-YYPsXoD_^_<|g44&h(g;gd-Ai9~jPeZA9YU1|NamV4RpFYOsSL
za0CuX2z$8G7bm8MO9&i<{bT%be}_Nr8ykx2>OHWo)CJ2524iu)JvP<*hzDayi8p5C
zx?!?FuQSUN!&975pEwZJvHdY5svk;1d!tCg+Jf*wNC_B-0%I7;b5fAc;VC9jMyVF0
z@ul0Mps4T|)+S0=8yXZ0A8$?}<C0IV80<P24g>pPP_L`B7ZH04`24Qwg8%_vWUqcO
z4R(gr(;dl94oLHGLS0-0hR4NVlGHgyLeX}!NkZIM3BAptO0B4kjF(U}87c7w49U(x
zZgeP8o$W9x&J8QGB5`$T3RV|cG1T83#r^xB(cKw^a@}UPL1=Jy!sy^240rNGu3cY5
z_3DiTe?R2O=g7&-LWVUR9U}-mX)FT0pG4Y;uF#(3X`s_VkBwIk3DsDYsdH=~G&Wd$
zgV#6Wo?uYvH2pQ$;FR%tsy=S@WoQZ_SE@^&roV5MuD_1n^!KUC>94J}Ow)f>?(3ne
zuV0nnz3J~oU!Hk*%yQijq^jdJ2)Y02KmXia51;+<n4qIN<P<cXQ_$0cPw#&QJmq@u
z^@QJ*pr`lmzxx;PeuLQ`xIW(xmjvEFeur-^eva=2dc6KmpM8V77p+05dmtJE!qH|h
zBP%Q%2|huH8sra?S0u(()?rw&4Na9{uzN~W)gux@o|%=3C+4P0NXx;<vRzn&`$i@R
zJWMz~D+4cWD#BaWRpO;BL-6{J7F^gc6sML{;^fj&JhrSD&+QnBXLq#XiA_UsZtEx<
zThWB$%NucQNj08Y+ko@yYj9>+zQC_Upjm^LHy7enj^EoZesw0^-fG3`8%%g<Wuhpy
z2s|}266fb6;JNuGoSGD+DZXQ!eC1=f)_HMwoR@^VZn(QvLfv{N?K$55Hon(31P_k(
z#jaL2+}h}hTZj2!>ktoYsPM*G@!45@1a?kXUha#T*{&FG9gHzH4n%fBlW`Dg;_Xox
z-4|tHy-^u85c$FO$POKhtXLmZ<d{&9!*|S@kSxkbdtYt3ReR?JS3gclG9V%}Obcrn
z&U$!zXm2UAAKYJC&~T7|$47t{Zr2Nmy?Y_OUw;(#b-)luca(d1Bg@qhIbM!vi3!4l
z$RNy1h{UwGP>hNUz_7${xz7}gh>AyZRFXhA1WA#RnnKIACL=Y*8%bWhk>=hDx!w}8
z=VoDecr2!RL}Ej*5i<sPpku%QR1EBeS;ly*i!-9r)emvJA5o#DglBp-tj07<7~Lf;
zPHSnR@<h34F|9OAZj^VoQ_Z>?;lBVxQ~L@sdT=o>pW(50m!>~QFt9%Mgt8Y^ufFs^
zq3HpkjEzy{<M}@-tBw_j^gvRk&#Us&Sf@TszoqlM{<<n`rK!9$=HWHC?Rj0^kN*Do
zov5;^e&+ilr%_SS_?=W%R%(N;c`rQA`ssfJz5mn|81?Y$3BW6XkkC`=?T?_R0Zv>U
zt9(CQx`-dX{t6dGy?!fS?RQ`N1J}%4iyY@*l!(}9Hz%P@+GJUHG%CUqM93JhWpWGF
zPOZS21!b^1yTGh<4xE~iCgH6Ck4{X$;{v~DmsJYzB)sh~<AJe>IJqzd=T>Lpsg;Fz
zd|5f3THS=B3nbj#){5t^Yr_jSjm9(AiC;enFWfR4CpV16<D11d49EGct$1b&RbLq{
zY^uSV*EQkeJBN#UEXH51%@^l0K2e#gV(`kEM7+Mnj8~SYh|){O`Pl;9&IlYH9VqHA
z49_l1&_dbM(}D$ZE;u~a508uuz$4>0UCtl-M)~0OdPnRi?~hGI{jjydRaB2N)|3nM
zhWKMfdVee`cE|J#C+%5Yhxn*eM>HD;q9(CFs^UfE#oCL)>y4^7XXJ)BAUDbr<>?8i
zEKEmQS~4u@7HuH`?cKF$-Alh4MTNyg#lYXoM_UAx<M8;hZAVXM^dHm@{d#pT{}e3h
zE>S$cUq1|WbVj?tsLjy}ZNZ^vjSNGx0IoAG7}w;c;+DKLEH{Ot+Qkl4;qK_nPC<>A
zA8Ogz;qHNee!UUu>xqQ8aF`9@NH<0xEjkccG2s&SCZIaZfYqiF?9Oe%+Q3+}+YdyB
z+{4H)Kdg;U!f^Y+h?3)QFIQw+BwRP9BGZzI@uNDW<w<LiM#hCh>D<?CH0CK#sD???
zH)_@RxK(*n8TR0^7m~&vP(n}-2OhI0SedF<)k$L-UDd6>554L0&{Q3&PPOe>p2zw;
z`fDp=!>93_0++trl}ufSuu#Udp4X?TI@Nc`dYQ&8kD0Dq)uGb4rE#nOJ{9m3^!_{G
z>sjtf;JXt1eg{2<xw23H-4K?`{sZB5*?Fw;aS-z_-~51YzxW=XU;3vgK45m+Ol0}T
z!|LsYipUVOC!5fboQkm-d1%W_!`<s9;l|Z1xO%08xBEMSMB!QS)Vv%~NO5?$GY%&N
zfCszMaBO-39vYv4N2aC=@N)6O))IkMHJ)58>TO*!Ub(RgFWuCIbK6Jb*&C+f^<B&H
zm%W?u?*6Os&c01}`~H=9>)ts6#4&h!TQi<qSA&xR({ob(jce-h>D?{(;I?YKcT*AG
z-ky%vHk$Cx)^xnRF;iQ`<?*Qk+b#*c$Hd~mh+yn#^wh@lotnZ`V8Za&<Ul+&#UH0<
zhT`#AQ8+R&4E#L%Mn~ZGCVyO8?Ttn0y|G-v+NFgqn49B-iIxFcZ*6n99a>_0q0Qug
zkpjI&f!>h#ekhIVgWO;{`2y`w6zPcE2p8nV2VjUms34oOEs~IAF>8y08YH~slJChW
zDTs?pK%id$eB3-VC-j`$9O3Nkf?oaX(El>@LPh1}3~)rdizik_#9?iO0ZSv2FeN%(
zU>Ae-Fn=sbiO1f`BK&2+G+bAjifYfRknJa-Xm2|d*bhLRqZ^t$+>z%v5dK$*dK^3e
zmV_9jOUP|aPC|ZKI<gFDh;jBurdtqNL*p<*6k@xdFRTu|k?mrSv3}kd72t!A{&p}W
zMIh6broE;z-I6ZQ8z-$yYb_GVN@G^ElQzUA$R|TTNywy34+dpybTqwMp{YhpQ~UZs
zzBH!ml^{*k!G^}`=&z-(U+wG9)71U+yjOibHd30ZLjhXlVY*7A>C>-d`nvVk>sgLl
zrm17@E3na+u0KcP=iz?OyaW*U71U_ls_(O>(o?P=))Qv`sp}QrmEcF$!;gcKd-|pC
z1#Xx92MNQK>(jM--(AADpL5vEulVrNclhw)x0pU<4q^kN5$@}U)Zh?|O0-~1q7`HF
z$}ys>1RIu5M14g(#t$>Y?xC>}I61?j4NEyR-h{oQBJogX5>Cv_$K&%Wae7`Qo>)?W
zcWxevkM3>9OWPaq_^N80A@pt)(A_l;&xx|TaNAP6wQmRBd*n8}`^YW$@c6Cx;OI5@
z+rf2sZP!dZcf(jbwW$Rsme-0(tjF^kYw_}yN<6<lAFpjIz~62v!{4qi#2Xt$WiB@0
zm6hpuX=w(Yo@K#_u0$Mci@*ad!FZ@G9Cz2bVo!@R_P0CZ_*8E^IWH84C-~!*YI|H;
zJ`i^d4a6;t!Prvnh1r%~n4jf<nHd8x-qZ&j3H>lSc`!yK+GA*ZUo?xlYcdRwu(m%c
z<NBi{svok0d!Zn55DH@4QIr&ja$6!wbFA9j+9Z=vfM<h|@3=@ygF(XDctbM6!z17;
zD$m>1U4xz%$K!dpz|GeUeFQ9nB)s+QD=Kf`U{pAJVeDW}EcK1RQvYbo@Cn7Z;BYki
z`Jma?9TOq~u(P}X2U_c}I@gF{;jSnS9E4og{umt?j0F<5&Jv}!A}J9=qaqPz-ybo)
z-pDW;CDaud$HpVeoB=~v9HQi&%$}YC`A|#~Roa>qkGjx63>OtTJ2e>%)<oo3;$ahD
zX2|Es%gn)qF;sfei2f;xL4%$&I5sNT5FJEScN6=4gFkZQh}O7NSjEA+JXWiRaOMRE
z_wq;2bdFc!pk4Od@|d$XIJS<BhWngt!C@jiNALN<Z#G^kMo#=+vu2I9A_-Ll-?*m6
z>T%Aif+=4ksm9pRId+hM;9z3Tv~X~6(B4MJ*%Sl;-^#{&J9FlYR-Sn|&W@cK1P$+z
zd3lcq4<7t&3ZAN;tGDnTInIvb@R-K2ed<+~+^RRGsr)nrXqs9MpUy$aydELLfy}%<
z-@D6UG-?c=wvw=v=hQwPt8(0O;RG7X_k<q(e+PPWJ@oz^_%8k%@b!cp{a^Wb{wG|#
z^dmmI^fkWy^%AB`nTp7e2*k$5A|WUkwULRKo>Po@Ekm(-&Uma_)rn;b8nI(>6YS2+
zPQ~fj9DJOI)3b8%@c3jrI@yBzI}F%2E)|ES<l)$y0=#-n1Kz!TIG)*5i|4nGz=i8O
z@$&7n@y5NY@Y<eDc>RIv@ZQn8@zLXZ@xhth`0U*M_~^`?`1{ct@YddSc=gT&cwR!^
zvzvzF+089Dx2hH==NI7E>~uW2BnNM7Eyp|CCH!4s!MO!-czKnCw-U;-v*N_WNbMy;
zdxr(#{^n5JH#7iuH4MQ1(T;NOff9~~;r=#vY|6L8>MZtRI$>jp3)U35W3Fuw=I6L!
zW~L)1raE9uvbfn5?G{(G7zU#$aUkmA>`@Wb7scUywBadP!F`bzEebEe7Zn+a7*b$E
zj)b%(6W7Q~gV~l2QyO3FlZtrxL4`#{!Pn1UTkhS<#Z7~r2j5N0_gYAJ+gFs{AW><)
zeR{#*;D{1O4~%yY!Q#Mp%ngdg#J~uQiHt<Kgqalq9;onlLwkHUrb;M1Eh7a})6Ezf
z;*ZikebLt65u1WyaA#l?t`7`HP2c{=aB)V2(?CQAdLc5{2N|hG2~~|K%FKi<IUYGl
z(Wp(1#q|7K%*x2elsFR>8%)?xkc-(xxo8mhm8F`Hmzsif@zEWlG~khP1fB*x{WCr8
z{-YavO0CiOVje<)Q@;oZHZ&?Y4#K5z+87~3<2m(iTQ&EV^Hgap%W@o0%-2BDIE76=
z*`LOSM&R(81RZB|aM~Jw_yiDpgE^>q%a$!#egaSJv*9rhA<03*901KUE_T3cbFqUL
zUU)&fFQyUh{H%m2#}(3ej!u|!Mg?b?aKQ!6xM2Q|KKe-eE(j;iyx<}X{QMk*&CkG9
zRJce2?}5ks<G`&_c7&ID=PvEgp+nmBSg)E{!eyoiKz?@wAJrV+v#ail_ss8s_szl2
zG(wH(^gjdNmAohD^sdz2{|fxB<o_=3#qYn7?I#i<e}W(659}}BUc?;v<Czo|gv7XT
zq{K&JcwR1Unm--4tmwkF#SPefTNn24?8L)YkAU5ylad4&;n*`g1_#Gk@aUv89G{Va
zlLEm<r)A^7gbW;?TZHE}SL57<GVyBs?cSOA@WGXM`L4xy`JQ$7>*3q*!O8pZ;gb*I
zlV=X%<7Xbm=NAs)?~mVyzaG64Z$Ee)-g#gP-nwV077o9D(>T1ceH6|tD#K%Qb45w!
z<Be@q`0I65czLZ2udYwQyH{KB(&~7ep5ZT{YZwlW^~3I=9=M~%9rv~PVPCr|j!pH!
znRy|2WTG$b8s>)Us~oYp#2zbidt+;r7dBS~U~Yyh#*300ozw?YGhHzy(+iWb{Lo@_
zLPO#pG$c5nHjc9!`k*klHwwZANJ#4lYq&G=OyMZcwMbZ-21|+&DH6(B1bE332J$68
zTq7@18klcjpf=OOO+crVo{xus&)o$+ex7i3ae(`PzVPXF72@q3kUhW|LkD|fk|@1S
zzc6%0L}PS994ZBDm2qL(OpH`fh$WGssELlk^sF4rGFvgy$qS?VyWsk06CO!S#qGgS
znBwh+T2XSfp}|P#*9%d7?GV)OD(%f@xmJOgF&<@^DJV>iMs;!mMyF(AetHpBrDb7r
zZXOn7WMI4~z-IY;g-LP9G$&!&lrCu+zY5<7yno8xWnIU%M-f7oBsKzNY(Q+VY;1g1
zM^JEP1w+@oygV&WZEdagFrAHyiw&q}ObU8zII4m3nDbL<Z2VM&R5S{BDj#1it`-5{
z{cx7X=+UFKV;YxfqKadF&VJym3N{=X>*Bj(2?Rn(8Lv%s#<{sXrxq)q@f^XzYw;Xc
zh~a0~pHs%qLuJNvf`IqH7n8Fb)i9MX6(Av{K9~MGr76|NEoXfYY*eOdezH=(teeN2
zO`_Hb<n%h;8==aXChA^U7q@!<Uxi;!zY_TV41PU9_MbXG%Xi;>CE)uAKVJF-|NKG1
z^{>9c<~egwVKKoLABD!!Tuf*v#p&JKF|Ean8&)-9&-LSQ|BlhPX;vZZ4t2$2_wZoc
z-5icXovB&}#`$Gscx6*Po>^Umho@xX<f2khXSH~IbtzuGemFkazW`qzyBcro-GJBk
z@4$zrcjJ?%58y+A-3QMJ?4CP}4=)_U`%fLh`=<`z-NX0by@zhVJNK`}>vvAaUw2Kx
zdpo;ueoYn5EXl+9)dhIx`fB|BmLYg`lLfDDNyguAOvU@xoABDkC_KL^45#P#;rKK^
z>}_+F`yGtq(*tm7PB4y57xmZSg*%7(;_l(0xUta_%W~|nreqLSm3m^9%^jTvM~pQN
z#Izhwj7=SkVM+Z_pX7jw=>8~==!?oo32TE1y?!W)bV5O#7qSupP?(W~+^iI&SPU?w
zn6%~IQ!L^sX-F}pXmdJ)LPOx@<q2muDm%CCx0wm}d<8^YC^W#|8vzos`Z_tl^C~;U
z4;+N_!Op1l^2S(EcdY>d7#SIXrl@d~MFykT5QEIbXrw2_A~zu(Y3|-=jWS?Hq7_|!
zkr?IdgU-S3SnD5*m41O3@8ydzUVc~|ZNNOQAWU%eMz-Bm2puR2(3FTGb23I&mZG{S
z6M5ET<O+xy%xRdHl8zmDC0J-MW43&jF`|NoN{Cx3%CM_#m@tiRx@CB)@vZwmJ)p<N
zLgSBK8I=sDU{N_S^yEBQ0*>q5QIT=ZD~*kk3XlzhVC3p4YP~oGPyNJy{d#pwQ_$n0
z2F=aQ+CGg?Q{gh5ja^Mg<Jxt6VK|i+=K@oiDnO{Yy{wn#8G3UqKi0+Nsc3AtJg2}$
zcrj#D-vjH>cfQbe@812p`%%toQE5`a@;t{Wswr(eM<bkgEnbIZn2(=_pyN7)Oe3sS
ze%_mcHS_YjR{u84OCa)?`TrB}DOX_ocdo#9B>-LteE%Kz5qj(d`BBu~k3W2auP=R$
zpDulcYZffRa9a+>6jfklX9pIJD#rfnr(@aFLfpM$DmF}S$E{1IW6PvE*gY~a9t7XR
z<BWK0YL0}L)i|xGuRJ`zp$d;l2>ST)N}O9?jVISu;Qd`)_*kI#;i1iV|Ir)p;i<du
z$+>;__*ntog-7w;vq$ikrys@J=Z@grvq$jZxuf{t)IPj__*T3n(0k>MnfR+H#rN)*
zfS0y4;v8R0v?d=fZO+H@>uq>;RT5s`Y{vUHq~p!42E4K%9xtwm#j`77#N%*kZYUm`
z6^wI>V{vkJ7>=?Rw=)2{1l)JDa7>;jmgZcA6-9$EJADA!V|t?{q8COc^v1+YXN*l7
zgwg4)XqND{B&t910_;#2a21ODd!sP0KMEs6;l=u(GRKI5oD^6j#5ANN!<?1|lO+wt
z<P=yfHYCR<Au2oqKEA$icJE#}g>NkL6otok)>7$txw#-vpcfk8jbL|Y1lae3#nT%F
z-d<=855P2297cxuqdw3Vg+89h2?>NPE)rIOmDOxOMp7co0<?V3093g8Vt7CVrX{Cg
zbU-LZ4RXZ1uy9n{55Nc~H!Sf8!~>$>w#OJT#I7&GdiOz=$%q_-5&4!Bq*;<+u^M2t
zSx_k8o0(UD?L%s?)S7{f#U)r>R)VFKWf+&8fd!*SN;8qhA(Z<Kc$&+9820j6-!I(_
zO*MG_=m`vhinA8@tcneufMU4G#YNfh7;4eDPvt>Vb8p#Dl(BJhkpLR&Ak0{=YJ|KN
zol1|B`&B+JHN|`imNb@8kYvN*JrorcX-h?^MF$8y8b1$@xyS&a$9;y9>fU&)!gcPe
z&p;z+SdKv9J<{}3?0T9?Q>u^9(wn+BK8vKOaGT1L3YGinzWI51%tr<a<eXEiKc`<O
zkoTxu4?g93=qXol`!m-=PkB%9>HWV0Kkd1msJow}{KsE^f{Q<XgN5TKNF-yyb)xw8
zU%eIwZd!~J_pih8`!{3b{IR0=8gbLoh1k1g5$qnDVZsw!SYTR)2EEgB3UFk)Rn%7@
z-ng~_FJIk^=eG`(u(lS@Y^}#94@|>n2N&aSk8H%pXKu&m=l9_A=MLc07mnh?7f<1@
z&z-^B=TGCUr%&LWr;p-e0paJ*?#E|O-ig0HycsXtHV?1eHWP11i2KHMt$1;ZgtXf#
z@WRGCoLy$Y^J{E)cSk8+-;l4pV)BW(Cb_pb0dtsyv7w^u!X>1Q#bYy~aIh-?4|n-s
z-#A~~*6fWPRi4;T>V$<EeK6bB7n99>F*dO;MjQHLoV6dO7rA1zKyQ@Q4UHBz6h{w0
zK~OIg1&Ye^?S%rr{wRuY7qu0FA^B-YPd6f2pl3`M*jcTj_NeeuVKJm2E-VTmegWEi
zS{H$y_TE{69$)9^EolKh-U#ybMx=l&HY^BnL4HW~_eFU^BI+VSFebzeOU+@J78i_q
zQG$j3KFEoRfHg55*|t>V37qmorR79N!X!YA6=0?~JEJNlPE=tm#)U_sBOnl!a=i{w
zit8+?SRv;YM#rMj!w0eb9g$<SAu~M(wybPq<fJ1#!-9;|6sgyUqJ(G+w;JVpF=2jT
zHdd76VR>O5W~JM3)AXsL>bi#{Yp|oT)0jd1*FA)VIAnvN@yD+0@ZrN+1EX?dM*xlL
zgK6xTU|u!=9xGsQMg|)^pLu=o!3SCz&l4cZI2%I02&povNwu65?}rVW5AGFQsixSG
z!E;nVG~NS2Nsv-GFvKJ*2m`7*UYjb3YKrRg`0?Y~Gc&ceAJv>5B!mRP#rvbuWkaX&
z*_&D+S*_&4XLNk_$j`xjHh9*>KNfuEsE&D_<yn>rmG$%cQgso;yf>;omZ7QVr~K>$
zF#j$1opON%meUVeQNUB)6MD-39r*t28~_|a<Gx)FK0WO8AXMPf`;}m)oa^uX{Ou+D
zeDP;|d+}F%|5Nu0IE_P!v2)8p?7L+()-LVBBM;n$4RdB=!PqgF)ZT!NlD1)H3+#?f
z7pTlk!GmLyuzxgHP)WgwIazpRTde@81{XFp;QWRLZMew`*S6y0ebe#z;idTavF-Te
ziJhYI?!gx?JdCejKaG!HejM)#^xk{nDSZ6;)A;*~kKwad4&&R`9>x!^Jb+J5-hk(B
zUx>3iCgSXd7F^ieh-cPU;mvEC@Xiel+5qU+w-)0s+skoyVl*Ba8-=5jjku%17ds?e
zy}8y^LfHW9A1C2>t2b_IbirM%F4#HD6<f;&VO_x>tjrUIm)aXk3f!=;zzZ|8Jut>B
ziqFyulM7tXmNF1SlLw(9&R$eqKNN@eM_Gge@&o!KJHP=Yi9x8&HY3L-3eOndJ)br!
z14$_<NKH#eMp_0EB4QCC;iaF47hGIjHQ>2;b;Hj?+{e>H^B^wi&WEewCV^j0co>F?
z${HFPh~>sG++LD`8_R5%6d#OMV;stj3CI?eN8sgJOvn|bR%WxILO?XEqyVKBQJlen
z$PNv~<g8pQC@9AGG#ffiCX5LRLc@Ro=n&Wr_4P-cgu7L#ImonRA<deJ4EYT4(cy^k
z_C#`kAF|C7x|<U)Bqb3|qBv*dXJAEXA?BJbxM^&sNS;fAHBoV7%w_2PQ~vj(q|pgI
zHcn;y(X*iv4s2jt&4i7H4eHTHAJrNtLs4Z^a;i6xjhhem8OHIM9F27lD5^m+9Mvy3
z&vY7<9y>UghsO2k*r~ztbS~$_<>+agJIge}fnhG6k<nD|Ez`MF6qP3%9zP$!$TGY?
z_O5b}uTpxvU&2b=69Gp6t81(9oUr0D^GxG&MfLs<LP)K$!c|n*1FSxmGJZ$AFM@gA
zym{L9#`{$(#PFIlUPpgTZh1^+X9qtQ|A-JYG+w8tE70}yKLgvJoumJe+Ut4zpF)r4
z*pvIq#Y_19i*NCZ{C)ZM+n=#+{RYgNJrS$sj==Jn&6qcHG<IxSkIuT07+2ej&bD%F
zS~V8yXOEEZc8VDXCne*t*%=z_4o*tJx#i_}W_2~rEH1-CQ?hY-Sq+}sIvlUu)PYYA
z%)$o`FTndpw&2sVcjL<o`|yo`?VC4G;ggq7;Qi;1;p5lN;G1{Pil4#PZyv$7Z#;~P
zukOQVXK%wB2iD_-JLcl)t)p;ec@0i1lF)fot%SGrcxCGlyt=IrZ*I@W)62~`H7^Or
zrzhivA#T`|?||DI194-mCvG3=FOYM_7V)iRj@Vx5g3YDQSX(d{D{=>8c3MBo&KQiP
zWdWE|<d2D2?iicygi)!31TF(m6={d!P&?#EXj>9B5M}YM$P9BtK8L4dbJ&LgsR9^_
z_7*b<YtuyCiPGacFOn0J5E&ebKv7e^?tCqy2OI@>&g{sLeIHLR`1|-EFwh@CLB5EM
z2tq=j4+<i}FkBSKWUCEhg95QBAp(1gMPauO!ODy%QC!ieNDe_|dV;_%26^%$&ofz(
zZOKJxUOCEhb5USTL9W$|!c;RFZJC%@RDng+O;}W1i5ccpOpc1iG%ht27l#QFijTCU
zqrses%Cs!ljLGnG8i-6E4<tJdLb51At2G@tX%<w70vuwF$N0=NOixJ0Ep20jVjL(e
zAuM}oFN-TekKG#G6#N(Plqm=j0&GNjV*{Y6JTwIhntorEWkXWdv%V|kSH^~_tY;m(
zK3$bj^{`COdQ@H-&*|$``}(?QSNfc^E7kx1q^WyX&d>R0&U@24@3*I432^!}y{q%e
z{|J76<UJwCGOCUrf4C?=&u{VT58vU}@4th@w>Z3eFUB?2VaBKmY+5iHD`!u@c!@0P
za!OF2mxGz(E3j^1D<(Eoz;0is0r!rI#-XWZoLQ8MBQtDxd{HqTn_GydSJmU}N>O>T
zeQLcx>84Tm_`zv-=l*H<`>`$f`nlb>`0~T}?AZtL$%RAsL_*w;Up$UaUpaxV-a3VE
z|8f%FzI6oOzi|*hy#64*e)cZ>O%&q$2iM}w+o#~^bxnA5b_ov8EWpv(xi~Q|180}p
zMDb+__zLjov`E}J)EhTed*J#Z-lFazv3pb$Zmjpgjg7(BROX4*d4sW`*ach4J+Q9W
z6$>*5Vtk?<I+J^0N`^B!ZO#~DbHUgQcZ^DNL35Hl$^>`?0=!&p$*6wFiF8DEoHq(m
zqmh?kgf-0wOInIH&5Y0TQp7Ft7uh7xiiwOwpsz1{+&tmwCa`mNhZD!-dGfW5Uft01
z_J+SNS3mYfe6SzVf&(zrl#B_Y*d_}=$47=^X-pV48>6r;Edp~aA)-LTQJWHl>a;kd
z27ANg=>xNGxRg(T)x=H+TekwtNEJ^*ab^ZeQ&Q2Knu*zEHQ3nHf-N<5*icx2RXLdw
z+8Zz@KOYlqxpHkAN-P#cxw|0EX)w~<+z=lcjm+!<O=)mA4qt3kZA!pIqXlcK8>ID$
zBKW5?Y>w9xE)WfuF(K(Z`h7#{*~n@9@zV6?*dS=CVJTDl>bx@MQ^rt~rqY;K)lG%h
z)7VgGJ?rdg`uk7-psBj`=V|(T>UoaFMy~SoJpb?R<^SKt&!BhxzgbV`J@<6pzutNO
z|B3(aLXWP1$Nk?y??;yZ4!`^)Vg1kYM@FExW#K9`7ZhRYh$?JbItgPNtI$|ff{M(1
zv{sd3*_=i!o?eTFybRdw=}5$VT+Mh&ia^GK16|2DJwG33msa51iYh$2t{yLJZN^L2
z4#itLN8y9L<MH<gr{lHz7vSU5H{tsWd-2V42k_-{hw;UQ<M`~tNqqMFNnCvMEPi<F
z3H<QJas2SwA^iN>eth@L9r*D0HvIkJ75G5F`O=OioLX9nV{`NI$h34k*kzFWNWhu-
zDL6IPh({*J;g(u~UX?d)s`0~}Euq-g8G{`|oUx(A8H+ReV}Z4gsJlT}E8tsE;EXxx
z1JGp@;H3{nm(>wN<9nk%wwJ(efcQW(#P>&eIA=K6AuHf2Sp9n=J!~+FEfFZqG$1SO
zvdR<aaZas*o>9V&xY$?(`Exv;o3=ilwrrHRrvT4OK<6#)E8!)d=SBJXNf;S`jKCn2
zM~afm%EXuqE1Hwz(HIwwY9CiL_`6|LVkjD7g3yv`M3bm4Yk)6uB~;CJ^hEk#FT^@|
zAv3{%Ap*JPvSJJ=DnM>lI<j)D$jr0|s4ZyB$ikG;3aqHD!@415SYD8c@rEc&x0<j}
z!r<9?#i$eIm+a@Sb)*oed1fmz_~Ig4Dl#%skey{lwuIOv5s_Hf+``r=ty|zm;Uevp
zO_xuqh&uw0#)hJd4N5_VTQ(SF+-F1RxldCKOK<8t>sO6Pnf{t8o!3^TuHEzA)HRr{
z&#xM?GIigoysAs3v(YJIS(UHnXV4oPF7xT@{D0f{nXlyPzLfLcdOAN_Pgnmgf8;%(
zcO~%Yy(jSW&h!5Qo|OObCw%?Al#$;i!`olJ|0m{6T8Qy29hlWM5}g9RvYZSQW#*tH
zrwHRlwP4MP(HK8EA2})Uusbv*6{qHBin`0hk?AHJoS2NWOG@zMqEZ~5mLWjO!MQaR
zcyW6JUc0Fk@7_5YAMKxoH}}lNhex;K$7go~uO7zFFCW9#&pe9H&OL(9&x^WyNx&y*
z7he>W_u@YM`qKTl^z?1`@bE^wzi$EFyJsw3yP*LuZmq)$o9glS;$rL@m#Fo^?r96r
zhM*i67l)mVLAasPM|(l{&2>JaN`pn|xnq5aBUTqWVpD|&Hp#ig*$$YLY=@3G@gzI#
zIo`<R{-}wvLw#Id3^NW!Q<4K}WBZ{ryq7l3%qGCI2H9(akjvBKQ6wtQmXf56$>aNK
zO`?L#64IJ1soE<XLqkGD$$4t8ZFCcL$7gsPknArY@%QnCpRW)6{d^GS?}wC-P-F*%
zqB1NREr~{q64h9roPgq(P?W}np*lVi#UcKfQC*JCqI`_aD?n>nI!dEsQ05<uHCbht
z9%V+Zvxn3ZgyQ5x)D`BTq96}BSvFX+Q;<`bj(kydc`2gQOlGv_WMf`MF_u>sV^Kx6
zsLCYFk<YL=zX%-}nWzzPH<-<+m$VZ3d^r}grt<Q0Z36ZrWC$dULBW{CMMSwC8`~z^
zvg{HI|AOA{O*Lj38;RcdqrH-`5nRdGU}*X}RhrtWhC*WlWaFi&t!ltL*58jR&vbQt
zbshcnSf?_zuj*5E(A4$x#tsK%>K>K#yypLx#(UR0@2RKj!ADoX>*@SX{+;&(p9ZhX
z|BiasgYJ)@cj>2}aY?@VFTVc;-+nLuzVda>6zEl#RAchE5dya+6iU0uwq>ETpd3>s
zbYkVoPK+3yg8~r>c26$J!pobB@cOnQTv(eUz{|yjRh2lue2A#MN<6)?5>KuulTfq@
zuV3GWzuek|H*cMY*Y21qVeTS)aqMbbd~zqgd-5K9_4salef9x-cm6^AaN!~R{Ne%p
zc;P;Q-JO!>CVckjM!b99BD{U~6uf^=JKo*dgumX}iof1G0xxW+*3{;G?U8t(BN}%P
z4HVe9W1FZq`i7i=*jh9gH`cmhTe$<)77h?~=Y;E<LvU?VD3%qvVydMd#u$2GeA-})
zwhY1uqXTN9duyQMI(gMG{ZJ9n3q_&5kR4(VOMpGHqdie+ixZWXjI7i|q@@^OvG66`
z1fETEzTGT5A_~6Vz8d5_sPcUH1~Yf~xVj<8!wZ4lKAOS{4)8-<a1d;PAt($CMN>>7
zMwwGln;=k)4ntvl1j^ZIkra=y#l@IZF$B{ps<5=N86Bc1IaaR8KM;Eg>Tru82a^KA
z&>9tmasg&xhA6vS0i8(nf}AwuXQd%8J4?1XD6*!bLhiXW+a`79VMc8}W|U=NR(cv1
z<>q33Sqa)LCX@$yqd3?HDS;j+up|gD6H%U@39HSFOi_@Ta<6UmL$x0gO>vJ~t%>tL
z;=jlbSv%A<eEuNW5yA0vY&0~rrLp0tWA^OwAv=d(uu;(X<JKF`>w9o%tV0<aoHEq_
zm9djS4bf27RSjGj?^j)0HF9ODk?T#}AImU5#~yOvHSd$1B+B$aSM~R->;D8(--mL3
zray9iA9`2ccTeZ%{WHH3{OJ5{d%A+20-y4W65f9E^N;xU*I)4U#Y_0|tM4#v@@xTJ
zK4wiHht9Fhs4Oc&rY#f2`Q?~7Z5mdunu;+Um1wA{g59YF7X0P9GW`A4A$WOX4xSRl
z_tc_7JR{IMCn4^WON;Qtayec<1g~8?0<T^(22ZUSil?`<;+0#v@c#V^@wW$-<F5~_
z#7B>8!51g4!zahK<BOBm;=3nq!1s?|i_ed2zz6%6;H^8S<JFrd;jP=o<DJ`E@zzas
zc;mWyyn1z=gwn;LG*fYOx&==z$iayjR_tu>$Gt6)0>B7dTke5ft>L)8BT5Tzuc;P=
zSMGxAnu2k4Lok+?cwu_RV2p^j!?5^X=ty<OaMNH^MfO5vL~m3^_CZOQov6I4P!!$|
znG(`k!`)D13P)9r83ndvq#ENjmB;6I9EfZd<z)~EL`KKKPe8=?)OrZyI2O-cgPxl}
z&lQ309tiUCM3A=^qJjdD5)pw6{~(lwM54uD!U$1{1redhix%Z)N<f{}j1l>Hm{?VV
znRSg=KCA_cN(-^LNFbGO#kh!ItP6_9wxC$7h>6EcvkAjeB>YT>MNVomiba_fi9##P
zv>`V)2f2Cq$QLP}FClEPIT;m}B#aqSfax`*SX5kug_&8HWJuK3*J}%LMMta;%0k_c
z7U+!Z#4zOBjL6EgA}c!^nd#YR8_}V$jR;`#{Y~3k2A<wj181XQ?;3kPIS-b`A32SU
zK^gPWIPa9hP1p;`M#Oor?3q-dAdSOR*rTb3h^k99R2qk7aEOL#STw?yFAt~b%hOaH
z`ukIL@!I<P<v!EcFqNtMV=plQ&--V2Zq+$eH%*mg9(~>ar<nRKdpbW$Pgl^>J3muT
zSN|4QN~0_I(f<g1e+0enf4GRxzWW?s{qPlj`1KNg`SmBvpF0m$X}hy#jK=ykbI?>@
ziG2AzR}@!Z>B0rL=ISMwKBW~c_4TlOcv1r1zNTDT?c}-D={P+%4Nok{!}F_a@Wi4b
zoS0|B%Udf&kqyI}Hw?ua*S6u=4MTBmeFI*;t_^>^Yck%tV<KL>X*6EFV?5s8Jq54t
z8jn}*9D~2?nTU7yOu}F8?!p_ljl-)qjK*u%kHcSX>BRYU<pR<YJiVp_k1xr?p{XW3
zFeVxY#>e5rEP>xN6CNBJr@e@1_lPL%ncnV^G1%P}gKLL)U}NzhEYInS4Ha%!UFIRc
z;3~)-XqS+-C9xl>Bke@t^+i=wAJoPTK!ZU57;BI6sD8)~w?}HI6LJh;C`wB}fz^Oa
zfeF{9GYeo+Mcr{AGJzM9U_gL`seCtW_Y1n+;V$6w63BT9^n4}6?FODF!UTkdkRas7
z#v&^q2sH^wXh=yxHOJdUNm!c{gE|RQn^H_@%ge>sl44A&9)cN#Sy+{ujGJpRFh4C6
zi&7)8A|VQk!a^`BJQ$tfp%@ww1Dl&CvcdzAmlO?iWEd<-iCSk%T2>ZvGP6-E;44ed
zM48o!(zH~xhyq+%R*M}~4OneW#k@!#+*fPGV_j9)T9$@fm)=N?@Ia<D0a><G37d0}
zY_Vb7)EVkgUbjgD(0Xy@|4d_Z*1znV#zvyHs=?6s1LrY+^vc*!6o{D4WBn@;Rb4cF
zIU3KYI+fjh_uZP>>e-pWe9E{_pej=!(&wS6W7g06qNy#7jZPWC%JF`xjJj7ItMXU6
zfBn7xpJM91uH-$zN9SkLJ3r@t;{0xuUkQ8)epiCvZ)rb%FTcO<@bj-<;o_xF@Wtov
zWA2Pe=xiT~<%_#8YibJ`tMXA*l#8KNL$GztGTd|f2F#t-in_8wf!@R<yt<_r@7**+
zwuN|MZ7E(@Q-wFTw%|psg1jmpZ{JXh*RQL>>7|)Cy|@TR=H%hnygZy(T!8Z%Ih>^)
zk4w2HH<sb0n;P)uox||zZ4G$+_GY|(YcpQEaj3wq6)#=gg69O7&#r62nI%P1cPXA*
zC*kw5LL8i&jJsNcu(R0@cgyy{&Nv*JBxx<dxK1E<OG5yztMbOSQa9Y(5QuFRE?6hv
z+g9g?m8Bk-mF<ivnJ#Ee?1P~R{V*cMSrnEl>JtW{I?f)|@eU}B?1#dzKFA2}2Sbn(
z3T*~act-8r5H<<@tO5$Ii<fH6fWef4cu{GgkumV}_JfNnU+m+lz37L~<Fh?)Pj?A<
zIUdg&q2f`#-bf1zMR{V9KraxD0=yde>r*227sf}S)?!4RD6V>=0hOsHl$w*!nvsgB
znMT}Rla6EUC3tvLDRx$8;HJ_HEQ<@r>;NB3@$tjdC<CfJ15h3kfWi<zm<Kr`*1;L!
zL4hy}&~hb&%@yF~S*$3rW=iOqj<S?=fqfcgX6IvLNj_E>f^fdG47hP2PK>L=*eGw9
zefpy`$A}z`3(UxdMc~;sVUh;D?xrpclz`Ko@pUg7uMK9=dT@Ua1RBT9sW+4nWL5z%
zS3%*pID(0gU_(-KM>!anicd|o;^1HA;ecYMaf}-q3%6>86sEIILX2gZMqqNwadLdF
z#(lnwo72g-uZ(r^UO48CQ{*_HnCIzgdKvGJ1A;krPMzZvF*bG@>tLF?XO5%e)H@D@
z=FAG-3za7qH&83QaM1GV)vL99YBiPr(@YP#p8oHkM^|9`cYY=CT?u->K~8tvopupF
zT>4V{GyM9~SNP}8-(b<4saQ07A|`d#qp~Pje$&!kiqbKnz8u$VT#UWDHelwYT9oIR
zVaGR{JvPUHr&p%p+;S_<F6A=r<)XH7aB^-AUJ#JIytNb;HWuRK;&hx?kdM=gO0_qf
zon29aGfVSvbgmVrmu2JB@(et`y%g{78Y&8~5%1mIgx7DZ#;e!W;?*6Ecyf6ej?B!(
z;psVeY;K+auUhIU$Jyn@c%U-@JBI}e-260uXhJgXA03OU%bc-3e;`(7^}+hWLD*d4
zg6)-_*i_;u&>M_p`GYVkV*sY457Ja#ZG;^f<NKk-I2g@_K`7<$lps5lNAneq1CSHa
z2btlH0)<#)N*HJp@KE8US<KpM8!4jj%mR*Nvqb}*mrsE9B5n^+dW4>*gtBTZp06l7
z7k6g_2Kgc~*iReyn-Ls@yx?FIhlOcy8<r`GO2XAjLmY;sr)es$+?)uTD9l`eU3!oY
zTEcyCT>i0sy>1!aoH+_ljH<;}LlhpWEX9HJEbL0QVYWv&CWpi!*S;Ty%QfbQBx1Nz
z0HOywA}%%(8M&Fr%*;S}b{_K0R#C6n$hBr6Q^NETn*|lI0ho~zg5xc@cyeeC_El$M
zT%Z%$B)o6UwV*651=;f1v$AqAYE0*Ek*xfj<-ej<ev<9aTs|J)bbaa<1pyj=%rx~(
zkH*RNZ2VMWe0M9SMG+ACcg_-eY*d_;!DIfwSr^lI%t5v^&N5(_!^ZW_JMU=c_<Bj^
z<#adJ%h?CYc%BerIiBP7c)tv*2`e7+UKv7v{PD-X-5Zz3;<Y&#m`g~}IN+MID0&)a
z0dP<=^V0}6E(OI^R`{M;*1_kBJkRS8hFoce&jSBXH{P?}^}xH5|91gM*8@-wH@z$P
zX#f=eN&evd^G5;ZccS>d{|>8`EXK;kb1<YN71d=aXdIG`)r%%#+uFI<y>kQh+_fBY
zW>jI$^hVenpKZm-`6fKOHbcV96g)gBQClhbzV<{Mn2>_g3$t)`Wv&+DJ|k-G?1~|H
zW_2xI-drzUgBLbb;(~;#XBKDTaZ!TLt}nnF*H__H37KENt`aY9E5-Ah%ccGb36=Bk
z&_pZtjkn;;qGBmiDDca~fr%2rw#Nwc!f^Yr2;ABnfn9t*?x=X&C<<>!wHH?9_1D7L
zmAUp<o$sIxSy@%+hy~dWn4H=l<5C8oC7};W1#lH%cBqT%kIKm2D3asCkgHG>*&o><
zeUTN?-CLWRnt*gsTULQx>Tkf~doI$n>1NTf30hd|;pqc6cLL7~d|@{`G5Ac6>*e|S
z`@k#E8~(vQ0zF@tLjqA88HwuXXbGz$Q4`JmNMw6?Xs>c?wOT~^8Bm&TLQa7dxmgwz
zgal)__;sb3cyh`J91-B{t1rPF#aXz&AP?`fwBu}HEw)6OF>0_ECdR~KyoAoXGt00z
zFc#UOIAi77+3Be$EzC!u)rRW)Lgb{f{%qLTi<N5=5JsUc#s^D{LD*&v#YR&Y=EMbH
zZmt=V^3&05NkN5t?gD{qS7%o@@xWD92r~)a_|65sSxZiI4_p5I&=1ux*zna139cwY
z_^9!9R6;!F3M`xrK?ra<6@Tpa-+#Y$jz4f^3S2ZcBpR<l<2lvI!TGI(E9+5>kmprL
z1|dv^MPnLgCve?4!cWZ#p|Omb8b_7KSs;WLjg3+nKO2p!xp4hFWh}2C&HMx%RV(xB
zr_24{WP0$af8#&$|JR^*=|_RyFZk*EpYhA3U-9GjKVrtzY4W=rE#898G4)u!XcD$<
zn2&WUrs0Mii*U#7OXS~nE_Ur&3%f(pjd*l+BA(rlEvhXQ_l=FlT_d6eLdke|aypLA
z&cdN-795#n!vkGL?CY}O;M8nUX%Y@^t-(th1#p||aAbO#Hml>Y+39#n!qqo-G~p#K
zN42s@lwq!@$!r{%k%>oVWaGGm$B)g;#=$8j0bUYzwgd_6!f<nAm;f&xw=^Z-hWc1s
zUmt~SRsPsA#1o6NdShYcRajl@fVHCjmgkGgD{#hK32Voj?a^kiM}2f}6#CntB*YFi
zvGynnV^8f>D30idyl{Kig8L&U+6`s4SY(LG<2!2^(kkFt1QbRIYZH@<8t}Y*{59aY
zyL)OXk7M)b9GB<m;|-r6fB1z5!Y{}h(W3M+qa#of9*)Y85VXd{qcJ!H)uIwh9bM3#
zkc810Hk4bFP?DF1%CZ8Kq@<$UBMhy6QD~2g!Q?D6#-zrfKG+>o6T)#*b|y~cmE%-S
zCH7Y~V_viYGh>o4Pk_8MECP$9Op&*%)<M#glZ`4-dc_%O$g`PIAYpSsmKB)-+T08)
z%Cc-IO-_(-KN3?VtY4NXo{@+-qBa*6=3rr75ysncQ7+%Xl=0*FYnPwA{Fp`Q5q;V~
z<$n@9Op^wnHHq#<p@6_2KBsMQjxCLA+YtnW9?$6~!gK97?yF^+7z*>4Fk(Ys!{M<4
z8`qN4hnsXY9gJ|LN>gy<wRoNYVqF9s=i|}{K(&sazAnxc=2k(Lll(dLjA>M742Ahw
zmGS-vMWt%l7zuHP)ZC|0Niuw<5qiwa?}}yrZ!<mk^sa!XTtV-TTn{_F>%ph@E5h&6
zPq_5mFSz&}Uz8-@{ZIeIgelWdR#kzSbEjhYDuLaGC0Mm;9%jxOizUmZVAZ;5Sh8{=
zmaUlvySv*XaqrkLJT9T>iAAEqCK_;XTDrh51qUW&;L!9;?3-Z1-th+9(-DUUCZr1d
zQgC8^mMFdwJh`+0Pc13JLtSP(Ea&!(HR7n0d3jSKPHM`|j6G6L18GMh9-3^!WApPR
z#7z~bCgYK5Cfp<Uu(LHB*Vlz&`;bUcc5zr!5RA1&f!J6PfNk|bSS8ACS>Zse65uV8
zkalWHZ%ns}%CkD6EvY}6;`(X5w0V94z0lsMiE}`CRDTqQ_eEY<KMi;pkuE4ShoLmf
z*gZ5Q&7$?xnp3!Z6jfd-5)uvC@Dwj^U#;^(0gnp{&<Q**0x!THUID)F7w82@h?*1;
zgu;Ya<Oc>I-_;GH1$ylv5oq@gLYr40rkheRDJKgRwj@**XP~k$7d2^F7#d>4m>3%>
zqZ739IRYh<p9_qGdSRr1^y=hP9IS4}Z8eQpRZxui60*)WS+FoR0vkjrw#P+aN`4Ne
z7L}kj*@)^q8!C#^QBhz+VMYpa1bPLTnW&WW#aTJX5pdL7Q?PVM0X9_^VwTN_nW<(h
zO3%iEf>N|*=VD4b--ad)^Pj{UAxCwk_ur3yQ~PWL4DYzo34h3HEjfmzG|m&HBBQ$N
zxgH!Byrl9{z+eL+hzK_R_|wwTe(M!x-3m}_2rSEE)ltE+Jmvxcv^U>;Q-h*<7YA1;
zQFRbfszK6NR&DutsPqa83pLfI(p6p#Q&G!S(O91XDvirw)z#H$^-uv*_5DvV1wG~b
zUi7ZM(|_j*dh|a6AN}|6lYBq%cL03<!>{=6$6qjZ@)SfF5>e4mjVW`cV##U=ch@b)
z!e#TYVcQyPx@Ik=%$<Pov&O^jhK5Mo+7f{ybF=W!v{dXKmy9EG@^Q}?6Yh`@^WL!r
z?3-l5!5J1Dn`gthH6@}9Q}M`*6bVncs&a;e#1<TyV#EG%DcC#Oh+{Ky@a(D@Ej+$&
zOd{@Xi^0v!p}4&@3iphb`X*XLF&gmDBm)jiP86sn;MQRw*j^oo6**p5lHr2|RxixA
zd5D_x#_DokEXWnmrS`_mjDeV7?u!x8c4&+5gO<3ys1CnM!pZ(9j~sw(QF^(7S84C7
zEe`L8d;uP3U08x0kZTA=p)C=asRpE69)PTzuW)21MpSeR{5d>DQ+6Jj>w)L(?ImHY
zz%D2Vp1wW^l8`ncBmkD^aFis)qbN8KWu9Id@Mc6OVrhZ}3u29!V@$<FYZ~g(lF%x^
z8CF$}hTMFVhs2>e!hpg=2`_W<P*+ig@}g|SM*1P#jq`-VF<Mm0XbE3uRFz>>+eoae
zt-+4+0&K}n!`gye+}Jt-n`@ddNq{vZ-H68OJk&_IOz7oh+K`=@jm)fE<m472H#ZNJ
zq6)|5XJSX&P;6_e!$MJ%ljR!o3PtS|7hx6~vi$hE-+g=eSKiG!^@LAL<`0!Wc>Yk8
zaj{ROez-U&A;*pdHUxr>@FD29OcFaSxHJ;a5k7<lpP%s=8;!?Ymyg$AhX_~x<**n&
zccZFOfKk1-EXxHsX{?*i<k+xi3~8B%>(;3O#9Y9DfM!GDHK;ySI7?MZ<7cE=qf*lw
z@0lQLYHHH<X-fI=9$639TU4toQvox)RSo@rf+^rB=l5_W*F)})yeIhn2!INFoRx7=
zw%`5qBQD9WvTOEqm~t{u-dv9<3+7<vqWM_1ZZ%e{Ux%%t_&3WR$a$+*WB&T}u)Aeg
z67Fm_;_#dT>=Ctgi-76gac0~t@Y>myh`Txr*fZXU2PR6mJHv$Y>q~HAv4Bn#+L1Zw
zq6*VQ6(-`y%q;BdG~&Td6Hd&@!RZD0n!>w(TmtTDi^7f?U)<0XikpT;;>PB1+{<%g
z<8a4_AlyGb7I%({(9W+Z^u@e1S4=es<cxzvi8x_ahNA|$wir8fnFe9J$qB77eK9nq
zAF9K8qgWtU89f+7;$4vw)EDW#c1ZKIll|Vv3hoC>K!2o1d!a~xmnR@F8IuHZ7MSRl
zfoHO$AYOi?At4ghdi(qq$|@)HI84PyRGy!|9|D7d5fDJ&1tTFM6lqZr$d8Uhc~~f#
zLqjnpFbqqREVwSO44X3xFgrN~6U`~;uvsvxsRk1pYcZ*z4wH(CF{Qi`Z9{4>qNy2;
zwbdxEC`MsLf!td%5+z)Y766x~B%!mm98-riV0Fg`+&r=g*9|Gb3Q=)ubBnMvH3##u
z@-b9CUs<{VmFX!cN=-o_$L|U3ENK}?vt}SKI~#eT>}t}?m{wkl&BKS``jI0<Nta-v
zsKCb9Sj=U3`qO3T@yE^okR5W!4H3sPdf3tUqhu(^#?2ovJ0}P|)q|-Px8yq~)Zk>U
z7{Ug_P6)1GLRhf}n2m+!Rk+FVdQ^KH%SX5{^i)cYpyMkenSbuwxtg*hTvhl>BRr`j
zx#9?~N8=blsxK-^=5J_d_$`d3Vr3fZWN$C;m!FUKrHty2%e||{&1aTe357srCk7$S
z&&5VfRmf+6G<_Za6HGx*@9I0zdr#2ybOpZu1b+XNDC9f7Ch}JS->*Ml=$Mf(=Gc%Y
zA#UBs;TS(-8WyZvjty6D$Cm4E#JcS_VBNJhWBHEjVR!3rLN6Hyre|T-*koMW7=r7X
z!UcZu*xhLrx8a_#7VH|GjN3;>;_zG>jxWl>{SzeI9TSH=or$=2bSxg8B4O;9Xi;wj
zpji}PiYT}k9G`E+{>cX1+8Qc#8FBA;i_{$}A#D=wX^X{mbsnPr!m&>j-<_kQabrsi
zR*1UmjI+ZeqdjJ4I%96G8)jvBV07X@j5BzkJ;5C<v4hbZC!mXTKwdx}6h%0qG{z0t
zK?9H(&=*#3J7f#+vO)$TBf=H==1Amcn6>A4MvJIBt0=q6;G@z@FeD*FVCCb(ch>&v
z`5o8C^Wfs3o*o+T*m)5kp>0TDFd~D)5E~o{b3~*-H3|jM5vYre#)!x$OpA`kGJ^#>
z3d^v)qyqEPY?zmxiTN4X*dUE-X>$XX46DPMq1BjMU5v_9Bl1&C+B3PT;sVrGR-wMS
z5tStk7*be`vW#>z7v*4>gvDKzC0JZtitF22u&$yK8%u^@Lq;)HW)-0+J|0QVj)>|z
z5OIUOkQ5{#x2*sr`ErfId<-wmLv6YVWdc1;ZEMTQkWhCBuBooWe4_;oz5$qDTPG5S
zT@~Gc)3g%*BTif}y&D_d#vil-pbBMassYf{d3DSmx!TvKu`XpQT^WDuG*y>sNHnJD
z>r{DYDi4h+jHbVq{@zqxp3`4nzu)tkG<BZGOy@Bhuin(P|0fzh@0Fb2qu%-bD8Ev=
z9(;cUzn{f_l|RZqT)c>%e)$QP1bl5>9Z0m9kuGhgqOl&8jdhqXeF|1@T#t?0w`0xL
z9ayntJ38f$lHK<DFkI6ZhP%cH)JDZ)M?)~SRQqE`eK_u)n1$WrGjaDAEAAEeZL0~!
z^+Ut4vppVnhyvVRAA;KjT=$Mk6!^tsS4TJ=o+@GMlmzT*55n%TAvm)<6OYU`;m(c-
zJUlBM`=@2#=GGVqb>nc0D87xQj-vkjv3F7uZXO<nTSmlTNuC>Kq&i4A>wrlXdvuxY
zF)7U%Es>(`!Uv*0bRcTN?NJ}+gsK=9Wcl@l&8IIi0tO(}yB9Kp*i&ndoUlR22y;eu
ze4vE2$*@_BNHQkFl93_6OGR>uSyOmM*+#`g!&`vI1q7(-l)BRc&(BZ7+CYB<1_mI=
zKLD{I;YbJwMtVdPO5+nzogksB(S+eKv6v~UZbm{PmS<*RqbP+X`2|>2T#B_>1-QDr
z8mp=*F+DdO%gb_QzX;=Osc5&^&}>LXd0YZ2Bvh>(QjhY|Mhq!$L{&+tguj_+Dk{Lp
z(n5?YE5MTGI$S+sBz80o!}f|=ska8r0<D&SFbwyOM2$xzl3jyfjWeP!D;uRE|A&=i
zp(b0@V75)`+$b|8qC~><4H<d3zqS?YGYhe$Wu&y;AGIFZ-^S~){ff))xSM~MUkt<_
zwlW1J^#i7<^XkV<Q|D-^3{4LSeO|S%wlw`d8-p@^I!)gI_4i6s_o2TZTF-n}x<^%p
z*HiiR`DiNrkM8?_nyK%&r}Hx_=l7=mO?$dJNB=YM5qdve`a%8?I8){a{P6RSm^gC^
z{KA6}921G$vSQ?w6=A}(Nm#aK1=eofh&5ZbVZ!XWC~azj-SxwwaBUM|6C+A1QIuAM
zHkb8!2}5^@!n<`uGH#F%`1)Z6T-^|hZS^r&TM>#)wQ;z9xEVXgSa8=kQE*)b?44}H
zE>V9sHV0sPwVS9rKOCAZ5S%GcoER;6Vq|N?4MW4QrOF4_HU#0e5z(S7L*&>`dy&tE
zia>0t4i`vxV}^vb;|%@Kk=PgQiTyA#z8{80^+sJpZ`4E&KuMT@&A%Tqefz=cB_V~A
z9n9`_C=m6R6X5`pUq9ps@Ji(`fR(UINr5#h6G;L&syl7DD6<(c64C_*2L7h-^w8^m
zjwcGwSAgeF-~}Q$I0#XZ5r_;5f;lt<C9&~nj!!{*at0>ml%Oj+4{fF-OwP1mxrCrg
zD@w4ct`=)6t8sNfIkxAQVUfUao~XA4DHg0qv0`nq4cFR=u&bmA)8j43j*UWLaUL4#
z<*$~6za<jRm5Y~1-HoMXsLIbon}Ba=Lp_#DShzuy<Ko<G%nFXcwkQ)GN~^@Sm~@nT
z2Ov}GDiXkDWu_q~F9TWGsmKzw$9HDr8Dn4>Xpb45AvjdrgngC6@W{;h(z?GB=yfj_
zsu{T=&L+=6B?=S#u`1~B2d)Q~KK)9j&#N!1woKE5M0Ii~Q}ycW=RTbchsOKUoBo<A
zk1DU~?|ID6!E*ZRsIoM*&#lUHrSm<ZNK@zZ<@DG2zt#8|^sa~7m7Jej@BF@d`ft$F
z{1^E?<=;iX_ruRWp>^a4ID5Fl-Pa3I39(3%-+Mz#Gp5d-fhDU}V#@3}Xl`pqwzM6)
zdncse_KpPHI6M+JkBG#Ltr56Q)Y~m1VsYmf6Lxl(aow;4+%VjP+eT*y=u)w<CJ`&j
zqOrU*0-FSacXe6u(41V{KP6SlhhtNv2iA*%yL*g;!}C&bXs!u+Cdc6339-0?A#7`y
zcCBr7{@66cSHf9WEY5bqvOEtg$@RpdTrbSYaKltlcU|Ve7-t%Uu_=z|knQlq{%GX%
zve<#h^1li;udBMDC$LL#wL^yARig0hVde@d5pKvc$DzQM1{;@p=UdD~;hC)x(u%_4
z5C}eO4hsv@!dilkO0PStRnPIfv~zxbeiGIO2=Icn)mFlSLl7t190_L|l8qQ=&cK}9
zaxAVLhWT}km{5|BuA(fgXl=y$Q6sQr%s8y8YsP&e#^8~TF5Ek!1548~Fgq$9H|LgM
ze|{An%&NfN^m42c0M<#ko0n}t<B)O;DK17qK>^Cj%28Ebg|fn8RFoH?QouJ{!qc|&
zG>k6JLPwebbE4#rmn9c_<Faslm<gl&gHWFwk8+y{IRgKZqFmUF@yLr0MsAchGGcsS
z7}yKL2Do6ZS0uKkmtxn%8PfVL3E+Mb<VmAs8@voXB80o#>LJ7*wrbS;5$lcTX!;+x
zGJRg=Q=rrvdu3JKe2D!=rs~z#snXbJ^yxg-o2p;sQ+ZV<hu&0Petvbo`ubE^b#0c{
z*P;JDxX*3R`}v<@>a#0X-|dy0-<RI`o#_2{(EH_AT>9ZB{P@!^_(9;;*3qu5eB|Wr
z0^cA%1aq2Zh84v_Dllf^1ax#wKyGO{3~A{Cz5gG3{{i0EeV_T_V_9W-Veh>+07NAT
z5^Nwrkf;O!u=if4aAr8Y_b!ck@70oQS+?UMS(a?cj!T@fX`7qf<Yu!^lHI$zStoIO
z|L^yJX9Q1_<az$j`YO5MSNhI5I5-X9IUl_5ubp{9Z@e6jN#o!#+E&iLvOswC7vgGX
zCe<Cm=FP#iKoK4qFyMl>2*({cINXwr{SB$KXW6*FUyCOvj09s5uJ~w`TrsrDc~p2N
zD!Vc~F_MFa`ZDksmt6}a5t<3O*^`Zno+MoMrQ(b;0V}nevB$Cz3;Ok#)ybH6n=q~4
zipk1t7?afRz_=+6!<>dIwj!ucKttBuuxD>TN5M8UrQfBL?X{{ZFJl85Ds!o3^r)f2
ztF~0bWHO`Dpo6)}sPwUwhau<X<tacDz{SMG+=8B}@}hy4NoAK!(B<dk5qO0#=NH3S
zQKpQHGRb`(bhP8BKZxBOZj4fOEqYzp*VBtde;1AqPhg+JgU5n>c)f27uXK;%W@9@}
zl<V-7z5#s0*M;96o5l;xtvF$^pi5hZ0PXCQFMvU(7h%HAMSJXUw4$Z0g(|QK&Xz`W
zHa8$lz_vA-(PAt?zorzYYU^>f%z$g!Y8;eyeKl6hv^KzN)}oUN)Ss7*$9mfE?6?~Z
z8F#}`mIBRPpMiPfHdNfR1Lqg_^J?Wy8@X)xKW}2Wfq%{$^B2rt(w@ltYD6oT=nGup
zpSzi*DTrFx5WSwXxBDdc$&C>mYE7Pm9Pf4@s{2skqsHy~k%&H?`hGOj<ul5D)X$|(
zCDhL!{khd;(Z`ZwsGn0kC#hk+z0vX~o!`IpP`{hoQ$A~Ss=$j*6?$r_(EEp<^1Yw`
z5<lU0-P7NPja#>1<F?I6N>9Q1t(%aTo{GHE61cm%&^I;?eN8Qjbq1_^eol`^3A_gf
z^6~IcAs%5qJFVlgQk>~XRGN!i^yT1^KObk^IXKvqtW+xOZ%oAjdpZuaWa3C`29;k1
z&biZZxib~lx{~q2Y&l+>(-4{jqkRj`w#VV&!9qMVn2!hgX>YsIxqc$91+#F8a6Q<t
z1B<2&nAF^bQQFwy(tBuWw_vb*3&OOrq2f*Gt=NI?^6l^zZ9!YnHq>R@g_fN4a2IVx
zYvyMOz0WAmt*y)3ibh=)6`oe<Q!5QqWvF+Hs65{ECZmqabjs801!k(Py&ZUBYgOPy
zY%Nt@P7Vrk^HH2#00XD?qC5o3@~J{KIO=iXu+N8?COdX{y_j`-G3gCpsNIiYryo--
zE<7?hLrXt`8y#J^x6OygxXu&yX1w0mgfDlr<2hFwPSJw)Xi5;!S7OQAh2_uy2JCGJ
zG|}$5ThK-Y+1ApG_SP0O+iKw?^y*t{(PXcJ-B5<soD9rpwAf3aUboqCKY_Wg$%Z`y
z-_iC~ENYAJsJ{+ZJx18`HleGz5=L&L%OuQG3$Qr0#H&>KpCLD(06~C42>en~Zs?D1
zOu1O)0+vvxYDB-53s_(v7p-g~HKS^;3H5@G4p~P+ma8|2O#iuK$o^$LX`&+gR$--v
z3PxE+&LjFg6@2QkZjb2mQ*X%V?IqNGM2A3JJ;!K3N=TC#wN^KJK!pFVhkX9%l<zw_
zqrYEC_4>G#w6*{6Gb&{MnFM;>{NdQNZ7Vhqc4BekQ<D*&mWrh8Oc?8G;R*&}wAI5<
zSC4g1&+72VXff{X&BpcaOk4|R@-^-4SOqQxvT)j!imTxwT<k2sk(N}fG$vuOdOH?t
zc3_uH9=wx^J#{HK*qn~@-dt{<gBv{=cznD_f!lf7*L~JaIBJi_jqU<GJ*~&%lNwy_
z$;Bym5>B~NaMZ#5)NR9pWedhN>oHQk0ewa5F;Kn@-6dP$&bbH9%)8;r-2`XOMyk5I
zVWF+9NhRpyX=YiUQM#Mer!uoQqPaW`ZB{BgQzi6919Vh+Qeh#ZAy?Bjnkp+$R$8n8
zOu!axXQP1^U5=-!JW+=E1qCQ5E`qM45RTG3D$fFp8ff`Dy*NbZ&C!1D_WH56yI%p`
zc%UCMJ!1&k+YxN6#~{_&J}SE-?oK?|*NZ2;ZhXF_1y33(aiz|PlLXkV##-$6dT5iK
znBe{<8tO4*HARM$udaa0YDJ*U4Zp_=Pr!%jRvT&@b(HY6a5R`P+-$>fj~kDMdvU3;
z8K<c{uX>y~)>wnxRxLuMiD)ZJK%+4ajZ&Li*Nhgk9iBQbmPU5*u9hoPF3*49<nQ=z
zB;{FXzf?;0A~$Wcx{4k^Q-y|llS`-<xP-hf&%>2Y`RZ#4xxi&!E^PJUS3})arm{R5
zFtWacx=u7u)wtb`t0DVQU(39DE)wdz>|Z^X=yOufRYKi|EK~Ory}cT;o|@9^MQzvk
z?=7Og2TAq&ica~iqf@>|B?-Ns{QXbycR&3Z{+{y_Gc!oyy}DuB7A0d-lCdd{&W+&9
zDJzAotqrw~HW&!Lb@vSwK=OX7z59lWa4D39t37$l0^H~?!TCTgE(VKm)?a|5&MX{j
z&%u&47Sookn5~Y%Qe7&R8?vy{l!-(3G@S9J<9cr{9-An~74GYpBMA$}4Op&?p(0Gd
zIbRlzw8i6)Lo$KNgNm^wi2&t3O<U1dayNQQHo%{AC%OtZ!(X@+?F3nC)&{g@Z-PB@
zBdp1HqAKZ5G!cL;xp&gyO8?q*Xw3R7nu_A!lJ<C222|>_O7j$>wFX)$J_8{r{cR1J
zGUVmvC@L;m;YF)Fv9%vpTx2OJWq$<(UP(zYO3G+&HAUz&mSNbe!H7|dW1T)6_V}>U
z;l+ti50+_lN87!a2=$}a8%BSy7h^+x=<V`gyeo*Q4i9!ZTJYN33_chc$5#RYoUAe6
zU}F{T>j~p>FpLwvPV8xH#58T|erqKawHXuULU=S9I80WwHqhEqAxdqPt-Tr51fR`b
z1E1Z7aFs~`-xaqTmztXJfTtZ7+8eRgZox=nCAwQI@Y=0ttFK2#y%SDrJDLs6IJkI_
zSET$eH>x}e@+Z`Jf5)V);48U}<@S!g`Bk_{hkkkZwm?Tx8XZfB+L4|U1_SK{my7zA
zHg_uYq8l5_4X|g=9;FSPT=;+XXMd(N0+u#@($?<OsZ+`T0AdSeJK3MKrISY^sy#g<
zWqp~-czZJJlsxRZy1H6fCJl?FU7Z|Dy1+>vS~*wQrzo!Ic6#y*Up3UvEKf3%QD4+`
z)KOpLeL1d-wI`oJ-jkI6#u75vg0v}=_K31vo>3ybI^=Uo$PF#$^zSKD=tbv02R_M2
z1;)?t51b10#;0eHkeY^#+qYrkj_ueO6NArg-h^$*Nl2%oGg34((R~>k8?f$7khYX?
zx;9XZtNlg1Nm6mRJ&98;4;JD~ppf&qIN>e8DPJ*;xeBnqH46*2WX#)Au&XJXuM4o;
zn2DXW3E0;hkJG+PTnguNnuXnUF_<xKrXozla(yD^Et@fEpncVT2J^Mscmr}D4OEY|
zB#fJO5OjAVRI(Wz*>}O6zX|Swt#IaUK}+^#*mJg^A!8$IlJ9~oeLWho<T<s3UfyS@
z@IC`u9#vpPChQg+s`Ps3D)rFI6V0p^LQhBN>0qR-Eg`tl<yjYmo0Q|Ju#>3@Jk{0;
z>_p*-D$LK%R~}rgEiXl7aW3jgvJp0wVbW&CSZx(fb@$+$zX#W->`ps9xZF2@{e3+c
z3HUKcfQ|GIqQ5tUFqPSGPcOpG4tS_Qms?u#VrUT0y1EoVU0^=a*^Os{19-fv8^;WK
zT;^x`LbnsYcW?|Z_Bzl*mFY295TcE1r#)^VxSHy1ur<`8wXGSgREnPZ8Vu6b9`U*H
z5as=Krvv-zP1xCF!5*g_-8^=8T^)Q@8+^3EtyMO(+nTU2Gfh1wx=xgxw7dH|1%PrZ
zswp8Cry6o`in<Y)ND5@W`OR-CH@0;Dl3o^aqf74zSx<eYs|@`lA@c$q)#6Fd4jH0K
z_A4MdeE6_3dWmeSJ{wmW49g85apJ@Yl$Dh!^Ad6_we!D(>|6R+ORoz#hK!IRaF$RX
zHZB7tNN<tPfBy3dT-DwgGD3_TS3a*yCDca8q8_Djv}~^qS}OAbG69^tFBUe)?_WZI
zCk>=U6-%2&0lQkJr>a5u%>UjZ8hDZ_@S^i}=>7B;RQ%lbCxl<`&@i_0{$0Oq8@47S
zQsJdxOCqNfExDQs<Z3n0*=(qCv|`<Xj%2K~C*znu8^=3yaiBdNM?6$>4e<n8GWNA*
z&}yY(zas~SItsCyif(^f0rolzv8yEyyX<+`?<~T>j$*ExtpIGv7Ki2LB%BTu<3Jl#
zpd*VQEx_)EH1wC<g)#jm^lR?MsBsGhv>OpBz6+z~IE+^(BScHvTbTe)(N=hhw=0zt
zt%P1<)<)Pe)}uP*E|}uw@r`#Ae(TUAJu;{aYxCE`M!VKxsep~D&TKF#gMk{Vj07G(
zFw;nDTUuI*^wbQ3E#;QVQ`MbJrM6apCmCIiCn~S7uu!S(HR&qgAcQ>n5{yaT)s}k9
zH8tRD?;tIE5SOUxF4A_M4f=6nydU#@0gMF$80rdOq`M0vJwXg~2QkzgLcOsPUZV*k
zWd<ziYjMmQ#N%_j@cP_dJl8#fmk7$^+EP4V)!=t`4+5`T$9t20?5;MTx2g)A)@m4Y
zr53piUTYoNnQi>zHrcH3wzt6BT!$IAlRscxxW;q16y~-z3#KYcFsIkTo1TK&9a|AH
z=+NI#2anl+*}-li?kB6z69p$XBLOFjllze=bsM*bT&$vUqG2NdklqmLP*Uns_a)@D
zI?9R6%R|8>WIt+ORhbt6NzV+KmyjDkg^wBnPdT>Q09f{?KHOTiyWK`gzhD8Qz***{
zSB5$?l{_a`wif`(vDAp3S7ED$C_eRb%eHDy5(%}BuAGOg7d^6z3`!8)e>eIV|K38r
zA2sEBxIJ%&-al|E;1lQ#jE-R&VRz@IEx2pTHr%m!3pT}5z~tpaTU(2SygY2r%0w37
zw{FRvjQuVG%~y!Mw660FDVV8C!U$E@n1vv!O5ije)3vEsY|O?2t?Wc~Dt0+aFjJq6
zsk(HG5Lh#H$y}G~bACX(g+PnJh+#X14O=in1-OSmoUtbHy{#Ck+=1STE$AYwd<A!-
zuQC?hnjHw1?|>_RBfNy2BX1L$vp1kVL%NmSp_Jd%X5Ou&HT4cylV}a%*1?)d@Re;x
zy*3GrmLfD(8)4MSfC3hnm622A$z~PMo61pIUX0A_JR~M(AU5vfHhC)aKB*i}Rd@ot
z;^JZjn%dGL)KwH>knq@TtHxnU=bi$BzuiM<bz!;HflEVS90_+|f4>_m!$GX{_h2W%
zGwtiZWS|2>olcDOby973(Oh2(Z(S3Db*&ipb#t3>Jhtl~9vYj%C7%m>jYT+6nU2?c
z8}NfUFJALov1Ba95FuWjumjbJaWHP@G&Tt#Z575HZpE@TH`T*lPy0$0+ErVFy*{t<
zAnzypdvVHGfHS%@+-#`8aYGJv>iEOdY=J*L5pyj~y!*t~Mq25qsUYW9UVI!X1XN3@
z+CF(L01$g8uVv5ySywJ>ftd;bnHM-p$a?}k8LLi0_8~x2k0BvSNkAt1RF5OaR7X#i
zWopf^08+ZHNtZEIO-a9CSw~c#+ILqilaysgj~-Q)iTxD&E1}+uGX473zpiwPlXWEJ
z_)>-_@5#LSStP_Ns}@$)mGe?Ri+l$4bIP`B5efM&qB9zP%HM$N{FKw5@jbD)T|Irc
zo3PukJqDX%6R<Hh5t|Z{QAF{ht+OFlUx}EUY{cc~Vcj&<*SwuDYstk_eFny?smwG?
z)MjAZnt=eYKu^Dssx%Cmk}+UR;xvgWEDl{|+t6=FL7yQRgO)@L5ssrZ2^c0E2aO5n
z)5f7+$9Y3M`gB{lY#ZU2gt4kb^i=FXsB}AmOg917mPH#|uo<p`Eoc*aO2{>&--UX@
ztuFme!j8a8y90GxuQ6`}8gkd8F7vZ!NL!~o(ae~#4poKQVYil|zE%&T4EJuThS^w!
zD%zPUvpmv~pwbp0zc3HU>Dh>hmm#BW8#(2+_IS}17gCNVHN9eOMd=yJ%TQNZh_F1w
zCp{Gd+-9ZPiqq|GELv(X&M#QHmrdAom~-2(zt@A~{bAgko5s%8di0x0&{I{4P=g+W
zJuZxl_R;qCqlb2Oq;CRa;So#+da#%F>r{UimA4JMY720>sRWO<RN!KD0Vef%7&4S#
zupk#lYz-L6D@JEZHahdl&{x|Cx807G)+V&IHpA1_O7%w@TU~?YHZN{?eR#9G1z!)>
z;{Q3jh!3Z{xNIxJbYVOui?eaQhqh13=%N63i*w2lQxWB-{Nk=cL_#Vx<fc|FYjj8%
zoV=%wsv@DnLG~l>$-KHh>EN%9z%1_xc+?AC))xyZ%hl#BD#XN2%06UXg`R9PIXS5e
zFrbF?+ZE8t_UdRbq7tLQBl{8SD=?M8PNPG90Hr)qJzv>gRhBX@<MPSAq^wguiws&I
zujMnVWA;gi3Y7B1XyqDx4*%XmzKiI*bqz3o#_NDWPf>e414G!jEf$+%60kWg8Cw!k
zu{n|Va&Z|-ELF%O7jpDEq*at+U4K<F`l;OdtJBeKOhK41>oun%SecBVE(u-QM09Bq
zIiH9SGpLD2pgaz3w4C137<B2V+Nkcjsp@+4G3e$xJ(Vd4lyR9Zfw^57!8ig&?z=Jx
zzM>d{ZW|og8)276FcNZ2nfIVB^-d+NqUxmoEP-dHg|!fDwbCRd{~k2t5s2A@UM4}9
zc_$1R_n@jQ1&uZ%s;w4i^i+D5T9`~0LeGdAvj(QhVw4o+AT2!&F^TEe7MHp@FzCm?
zyIti8@Wj%J%8Le`6kc1jIyCaj?9u9#0YIm7vT&-V8kZUxaf;Bp(9ncS&KArV$}rtz
z!F+oohO6{A)#<{`PzPR{?!_xpVXQP85Vn?MH0;LI*cisgW-!z{fxh4nMh1t`<M&|5
zWygG{9m}C+9Petx`Iagis;R(iiviu`**H>EikF)_xLi?#$@Bs$<pQ{^RcL9fg`=$*
zZUV2<*$!883$1TGLbY~G^CEbV$Nuq@4|w1t{&3d-E;p87v?Lzk{1lv~62#wAiHNh6
z%2}BT^^}M-UHQe&BNZ1CauKQ_7E(qO5w#>2sSFk>qo0U%6d1hr+G`42R12vN8X)sG
zZro7HxnwLm`JSbWO<s#alR*enB_{j(!WX`vY$N4+qV7aZ$?Ir)D{bIpd_IAsjIJ!p
z{`#-~S^=F53?SRc{snphHnlQF%IIXoWeFJuOSY4;Kmn=>r@Fd2oI7_;nU^w4*<O^M
zy#Gn6RkgR$$o$ZuL&|q1WqzU-RogA^s|s)}el_H~iB9>R1bhNLc^>`X=mhWUWNeN}
z!sgf%+_g0hcWsNumShT;at$&n%285Ng=`9yb)JeicnLO-CJ7y732^elZ!b;Yd?GqZ
z65!_4SCPz2f~zDJ9Yx#GUPxf&ZH2ou4xaMu=&B^dN;kq!i|Vh4M~GI;Qyhm*&UaR9
zM?kZcP>e+%m10*#0)e>|Em@nC$_Zz|cG%LW>X?>9!Y!Q^mLRLkS`RB#p8(H9D{D!O
zSXo=n9caiQ90@@S!B|(8jHc>x+7KPI1_LTBR%og63`P?zt^pQ(1?^})a<bACxb28b
z!H)QpTS`y0wW`Wf%kfm;McZ0w%q+b)tORRoSta^328_`D9?@v<AnoJ}j&?j|ZNl>o
z4<2_oagnz7NOJ>@cC=y2pu;l3_r^po{^H(K_%BzF;h7#M_Bg9?XtD=0!~Il!{TK=j
zVQg>$A*zyKPXK*gE)4iu(d(|qkh2#1T&*}q^|w3NiUC~#F6(H&DjV=Xr44hLRG*a^
z^t3jhv)P7#!wyGXEgEZTXmcCkXt2Xw--<3u{>h>YyvBX}cxDh^3AN)~b0y}giZE-{
z<Lu}NK}V3u{r!)iw@N}~K+%swZemeAG8DQ%N<bk^QlyYx9jaXh1Qj5N+L6M10f0a+
zTB%9xu0TORA<9fHZmE$KXvq6g_9hBTz@^$-8F59%u9JCHtqI7aX4hym;=T9YQ{Z;~
z{CTC;Sk{x8RT<t&B6>VN`3&j+qH?aXTr9Y3uZHYjhN_q0uH=0=4`~;uf=)hPbeW!%
z+sOzn67Rh8j#8l_`<BlnpHrYG`&3n9bkmx(_~el5LcTMBp5)lXEaDO~u!X?e6q7=w
zmyRub9iLr*9Ey_+-n+SmN@Qruu+Cnv1I-07%vb_tJ8Ckvpek)M>T_dhCu3o!<#CoK
z5`>9pE#$hno6(rP0S;Qqj?!4P6>a5|_LA09=GzEG58>HS91ADm=q%if_M%OEEz`|t
z$&oSdXkjy`@Uk{4qodUFwZKl`R+V-q0ry#DzyMpe!0#@??+#QGdNnduUiKY4*A1xQ
z`_=h7&}b+`L$x$O)+s8_P-TVAC<B6;U@~$!315_#t2~Tbo@N$9zaOUz0Uxop(JJqg
zY^?xKRG!ofiu#ZXR)%7CR_HNhwqjRF8E%-W@GkA>2fiMBuBH`lxA^gTQ-?D6=NEg1
z@p5<o_tE;Es5ax(flmDKnf>^q!#nYMKW$}SJ5JB^V4#DR)K&u@;pA>?Mpw8Co!y=2
z^tQv}w8P_Rh2PtTu*-=-R~sffnz6IVhQ~X@c%sdZCp<wMt*ysQy%kF?2llqNW1-Cn
zKcVYrZA43ZE1K-ha5l8STibw+%5rpMrr@Qa5!`HP!#Q^wPIq~*yRi|Mrl#epir86!
zo$yUwD9Hs1@RWCNsUx`nRTZSFD^<0qN>IJ2)lj#MR*JHp+ar3JIv>4_gjj3!J=yO+
zwchO!Z7Wr<$+nW8v~BeA+o#dT{G`t&%c8%p+rN*0>X>Ws332PY=J&|UY-)M|$*H;6
z7Mn)3mx?WM=~z$rCFT|rdIsc?4JoB1gkCw;RcCCbV%m(VjIFR_Y=bd%Gb)odLYuH2
zhUAUPdv!V6(3CG=j6ox9ZGF}T=6V81rW-ijs6eYO;~vG9)@9sHm~DZLQ^`gF7*(5;
z(aCsswv5%Xx>Xx%CFlflwORKNx_7}uOIw{T;Jb&%-N@r^q`IT6&H60txp%`+vJEv<
zh>iL@SZQ6Q-JRKNfr-{qM}=qPFR+Z4Cu5ow7nUG1JsXMf$%u=IU#+c;w8{IV$_v%j
zs^xg1@C0}osyR`4HB?QlwRLdu*JH3ki+u(?&d~0@+2+N|Rn2(OV#mwXE%<V)4?i59
z!Ji#Cg|GIH<8gl%PSn-lq_q+cyBqLEj}LE-_287Z0W(bo>~c0>u8H=OzertXGi+u(
zob7gmI=uv+2QH5b?QR!bZVv)37lzy0u)y3;u<eo|t87%1tu5GTugCuO7M!3`InvgS
ziRLEw3CCtTvz2zY#fi4MMzl55p|{nJJ$9$kC-~l(DO?#J#p5#zcyM7Cucm)csxjmi
zR8zUk<R9iQqi#+ChkC=`ej}@|qi<$`p=v|bknL3vsQZd;_^jGx2^FU5G&-V}MbE4I
ziau94C-uD4^Su51qt}U6dnzn%KZodZmhII2sh>~c_G5@`zx}hV#lH|D@A&^Oe)bEc
zcjVmMZe(T^BPJn>Ha8O+wx{4Tn_>}@k%w%J9)%_g;tTSTTv~*6`sDS5+y>}VHp7^<
z4TjXM&?RkxHgTge)f0RMu473j*l0(snHyl1+EhZRI_<N{a8A{!8#vtrOS0I}&nV?^
zwV8LKiu1MU>j|sP1RQOvD7<V!kMOFJ3J9vV>a@?GK8H}0%-ulvQt74N0SlK~3CO1W
zO;nqksW3$u-bLuGM@#VzR23)EW){L!Q36AS2Gs;!RTZH}2nq1C#!3{HmLWYepKznh
zCG`_x6PY`Z5VuWvX02Lz@k!-)Vr$iMJOQ5ABC#{_s6`JI)4bP((JBLGt1Gd~q{V|B
z9$crjeW<Y=57*jpxu_h^+Zyp3p&mTl;l_hKVca*;hfCcaoOU<ioU<9vObp``t?S-e
zBVHNk$1A}eT(LIbvfGE&;zG34nBjG_z~AA5k4n+!^q|Y-N4L8ZU5++%Q}uOwofz>s
zvC`?m)xka-^|fQ(YQmD)fc;g~*hBl<S5(IDD-TuXO773W^S7gu$Diu)!$Y`sQyq@^
z+i_xi7*9<u;H@L4c~}30*VNxr<o%TJ`?=C+MQ%q)6@bw<uiUh9lgmvjK!{dQ>J5E6
z=x(=hDqPfkNkoI@_K1dxC@3lGlaS@n@KKMi?o&b_Dk1NwDWRUX?Dvzv7X2Ayf9iYc
z_UiQZkoDxf=zV?Cy7Ha;Q~OwpsF3fG-_tLD&g7qU*RB;56lsx^R)DzVJjA8uV{1|t
zlJiSZY_358-QV`?ENn_o!8$|AMpPzmVs0k*1bCZK9=9G9@f)B?*no<-yHOT%2lPpI
zp@#4h@Kq6BG9qy;q12GGg}~d0`s}T=pHzA>a&h`+2s5#xQvS9XO}X2s=I()2Dk^AI
zErg-~Pxc|ev(pw=Qw5r+{&WfJl&O^0Rb_IUoJ~}kge|RbbvpOYX&o)Mu_PU(g_$sD
zD_|w;Oa?Oy1|y7SBQ%v7lxoV6onL^2qzuFmcyaXlN%2WYirazs*sVxRh*MNvbR%T7
zc}ld(lX5&!j8ZmawOY~A(t?1KD$D1>va1E-)jCX?G`JcF;DoIi2Xz)a(B{Ej+Tt@+
z7M$RAhpVfx*V&5GV*_~l$UZ!@a|YK(h6p?l4m8)}jI9=5AjrPv^y72&4%}y{#Xf5-
z25V~&XsAc9qXQwYUjbf#`|fnO5%P4xNq{*CxsDDyraT=u)*Zx!fgbE@szWF-9!oj-
zxYuCARjVDr@)A_#Wxzv=-QVs&xQ%L$unlrskArr%jn+8SiTnD-@y^+6q=-P{A6B6!
z{|WdajX_oDN!+elqHk&yED|bk)c2zSbh|o=o=T|4h=xWq7^2sWo>#4;gbLW`^NWVA
z>`yX!dkI-zT^4OkC2s#L@>+dQU3UBa<o!?j3=+4myB7a+{8BtEzdnJUO!w_Sgt7`f
z(z6PYoRN=BJCd*^Aq|N+g(xNTq%vb`Mmp}K*rC#+qDvIW-Gj2&yHOe=kh>cdvG+g|
zcMqp`qICN@l*fEVc`$Sp?QnGll^t!Ul(Siq)+zS1DR&bbMcdF?v;}tBQ7Pk-&i~T%
zL`E?-6K2(fomI-_sPbfcsjR5aAuw}kp|hzJ)9!{o@eUqy9na@8(D8T%+HO-$EKIox
zFy<!0l%E1aVFvUS#e`7>VPsH%q$luXd_1WQ(^cvqqadfGrYR5iiH()s772)ti>Jb)
zt>p(-MnM)^OXy{$XCNml8~M3;C?qJ03yV-vR17T@o&Zm&yddzTJCfV&Mz7C@J;5N3
zbo#NMO6)?Y3)lPlaMIz#C3^?Xw{{>{T7rO7D(DUHQi-<cw3rTuaBO-C_wC(_2Y2rw
z_(pNiR*&npCcNVc;>W&ze6J&j=X6!LX{y0xs=3*gCiHf+!sBwlM@V+MJO~kxo$Y>f
zx&s8B8(wES`e|i%b_Q^git$LB1IH?jIF(g|`?AXLw6Pwi_`P)Jq$5<4hsBN-47Cw-
zb!PZF97-*3lY?;OF{DY$H?BQQ$rHID<zF$mf;{<GSs>RL?S*>ts?dtQ(N&cc4U%XT
z^+^#85p`Zd)>roty*~+gt?n;+8#UxT6)rNB3J0lBQI|!Bx*v7_(a=;wJx27g<a7OV
z+pNW}Ay%)$pZ`4az3<(-j7r|8#bss4<b52UoQk_QZ%15O7PQs1$S5gCDuv9Zlq9UH
zNZx|7#7!to*o2b!O(>4rsH8S!J1SFmK$p4=<+QpLNgJV0-v(pm4j41Gpf+zCY6&aJ
zy1Xs0NreRGt1{QaL?tN7td=0F&R7p)@@HX4x&!7^+FydrlyWEG_E{KH?xv!nT~6CT
zi+wj0<{c>Bu?_{>*73Mopv&O#v*Mx6NrEOf4Z6Z?=y}r`#M0_@sHByZaq*01lQOV?
zOm(!OVoRk$LaeLW>*Dr?$nu_2c|mKNm63_u>>L#2<=+BcSxG5$TCHMh1*B5bD9><^
z2g~&O{h0Q6aIh<YO9KOVd|@6B&Q9S1q4wy&2mv^VHiH&5`U*7g58<e>qOGY3gWcU2
z4|QX)yAQ|4CUI_f3|GUw_{!`~yx!=-hpsSw(AkeKG`8V60`%cPCstf-nD96daJL{B
za3kz?!Qa}hK+j8M81w|u)!~J|sfFNcgRk0%$r=kD<T`J)c=5f~AYLi8;1pHgzOo|B
z>q@YXfRmx71_)GNLnAy+2dqssXl<^=Sc4tcXLgIoi(FW%6Jm^0LQAeONiK{Aj?72H
zNR8V86%7k{PkldndGvbGDo}-{MD)BGDsbezX!ywf)b*uiRqZ*UE>n*cecb5tk@e&}
z<^AZ8bBkUU{aviZuQ$}MI5Pbup|=M{Qzf(%8pUNLNalULjbbN;@GGd$AtkQ>nY^bH
za&rj1ymXWkC}r7_>9?{XHxs)2EM_Jua#B&AolNkgLX($*itGfK3R6&3oPwItG$k!X
z$uQ&*P+2>m$=nV@P8>{mT#u>Gj#bi_OIym+XT?C5zJ&@d237g-Fz3cXo4ysL1Zyz?
zTTF|s%T9zYHwC(a3}^{EO;H|nylIV8cV<H+OoW{ZJV}|CEZ0;hDo<>yl-b=rT$C)6
z6yT{%RAf+4RpqI`Q&pZcp^!FN(xKf?*ac`~dkMSApdTxP-FRTnBAz>X01qxN;LK1D
z)!Q)kQjv`Wy$A+9=;`ufpr;$X{k<3+9l=;{KW2Idu+%?<S+@_n9PN00=U$}-_9>eK
z51MLmiFSQbTY|af8tn5ru`lF@uhs-pK^DKS95fNcUPlL&9u;4!3q39#%T0CZbE2Xw
z9W^-#I8{@FZw31Cx0CzvfB8r7WMvgra&xg*T8Ja9jqv4UBFxX|E772xs<5q{%CDml
z{Wcp;^pEn6=C#E8Ual#*{l(@g(31tL&{KgG4Ir5pV90yX_EwffLq^?JLWP8YDtceH
zm*=VLsc?}{;im2{`n~8-x0NO)>a%nu)aB9V6Rr5<eRUlPSr!dMHDn$2+@sfDi(gOt
z>R0?rEHCdS^h5+1lyd!is=9P}$X9wMN-GV>*Jx2{GNFX;uQTekFp!Q0+AqCUb1Svw
zWvC=T^qO+$%1fXnaI|HG(3BORyeJ3dg_#5qrDL%GDGTL<Q&~O%%4r4PD<eoN@-v_*
zAT&9Ro-Y@e5uV1vG?<Gsl&OYlt}H77rOdLN6ll1fwkQ{sJf>b#20dYC&{rxDl4c{y
z$j4N7D)`jbWfkSf%E?yB>jZQv^aOY+^rCxS+!{Sa8X*_Z{}-2_e6@Ls*jlN)kijkm
zc+zQHM*A2h#C8q!;_U1=o<6!l=&cZb^Efloi>ni~tUV!2_I6=%YyguZg9`M5-68Z1
z_G5@Pc(kuy8E{~Z;A`ivP_Rrx#n*>N`-gGf-lhyVFkEfGNV63?-3}ZIbz-8v7Vd&P
zv}b0Zi7K|LtQ;ZQ*|5ilA)Zsf*MSaq6O7hU6s2uNS7sU>Yih$gEdhMc+J$GT8*sd&
z3@0pRtnhOk(VH=sS%hPzW{g#s(Le?0Y&2uO!-)$M)1(QnDsn(>Z^24rd=<h#F1FQ-
z289~Z{!QA$$*3c0`!*R2Kwu+n)1;BH3Pb72B{k#Ho=@JFr=&@I=}TWy+Q~^fJE=sH
z*8&o0<0fib)|JM<(wjj>N|BHO1mw8VU|3RA*ntBFZmlbA{-kZ6jO-#kJ!D(;JsF)v
z_Adhq$a=C|Qti?4qaXc9QHb*BNh#it2F_po>Q|NPP+eEr0!l<fdo6x_p+Zmnf!VWX
zHw>#h(2)@u3WHp_wuIDll<F!`Naz(xQB|!K>kPDRl6rF_D`j+Hrk-gqOPZ7i=Tu6Y
zH)ExeI)c(tWrW#60GaiQtTY&P%2d)M#N|3hgFKgl%Q*d&c|9RpQ4EtNl7@;R!mR+>
zQprNXwU}7~9Ra4VjHoR=FEBYTX@gg(r16%$Re>k31$x<(umU)NUbM=KhMv^es(ovt
zyPC;OD3upV&kKGpO|+$6uNQs18TtsS{@y;!3=ZPN*eIS}*^4inIe{1UFXQIaD9&_u
z;RcsKxU`4^!vokiF@~j~VT^b8U}T^l0|R}$;RY}`GKBE(Ag3dk?Hxq0z6s6zZXEn>
z=AG@>*Wt!Fs=?Dd)}DY@=?lA$YV$%vGp^M%U?#s9)!7+vwcF9z<w2-7054TuhpPon
zR}+7b_=AxYi!g2eQgH=tHahWKXEz>hb1Sy@QiTbhGq&PQZ7aUg(2EDGE{yQI>#a3n
zuD%Xer)GHfM*263;1j*|cLbhPY21!Exye;!B%qN#uF{cTdOZBepZtl^NLY#pq#K&_
za**#;fGD9pCQ(A#_Q~THWq$%Yv7oX)Y2+(Vl;Nf1wfbCJnHMmgJb6-iEhCwzjh>}}
zv}`Ycl6^|ZC@%tF0k1Temgnt?N|c9jt6jtdgaU2pJ)-s%mSL&nVc{|_WyjL{Lf|Ps
zi0bE%8eKW23cP<xZLY=tx%dVDUUGf@{FjmIbkE-92n&oRT3m90EGr}EQZp%P=-x7T
zk7x28udJ!UIxWGbriNfql7C#C07g=vXC`3GgbGt{Az+v~1%eSNDN`AwQ+=|SmWoUu
zq^7{=SCa2rxL-4+rZgZ{Qs%jRB&pnZ@{$5y0i9M!3p57F2=rv%s=bx{>bT!hD%8yE
ztbYVNft^g%8e6rAiV8fniHZPEZc@3x8yjf@eLkhd{OIrqh6jf*G%$!cLhtP8D4yT9
z7jK?8ikA=U#Y0nLxZD%Q^}zu=wR<<N&CcS?<Rtclx-sSPW7zFMJAvY8sfWAM1%F>R
zLIVRBCfvsQ2N3EB5?T&~+8kKw4B=$YFi!OhVrQoxQ;qdFZfnG&%}%^z@4!W46-Efj
zuKH>W`r9!;@OArK2zPqn^SV%1TMdg&0}Yj|H7OO{MP)cfJAAXN8~3(*@MN<K-)!o_
zTV+kSH-+HQ)FI3tfthL}t_HjC_~H_278y!PZeLM)QjaS&xXSM;|4|E&Zx0y-SRE=!
zl%VuG?e6Zr1qA86AWBQXAY<E!LX&oC60*K@_Y%NJUsnN}^qP>@-}%mWZvCpuda}LR
zKUd%-Ju;*Zt862oz9;L-cB&1QeMJujppNDuA@9k+2C|J<Sa~g7%YNr~en(M+YQJH5
zUuyORkm|Xs=k`g~cP;+UhoDZbz@OcM-riMakQcPGmC{jx;49~kYyt1<tb%;x78jx1
zP>FRq1!nRr*a)x$Vk*d@)t#P7PE7-qoQ|p_^1e`24zQYpm!8Ya0yXBxAfpWxNHLXl
zM7i-Kbp|7}OlefCs!U0PrJBmKiu0CL;QcD}l<j$*0={UaS5#7r<kZNpQ7ZJ*ay&_;
z@?y2;#qA@gNSizXo<LV=kLMvwc*Dv~JV-E!$`iFW)zgEc%=^a2@Wk>GUO2oDH^&BX
ztuKrleZ6>kaRE>BhIn9h4#xvw>}hjhv9%2@O$BWFQrN5pIPA>`27>6}`r+OlrL1s>
z#~A79$7HA%yF&w5^3f){+}PRCfqS{`BizrE%`LdM+KO|wdYpASamMGz$zV6;I=tw2
zy3y(OQ~h<LsnG$O#fBDREgV`C<_O+<39}0X_}NwmUiNh1C8rO2@=LK$R)GbB7KaGY
z3;f)V3=T_SnF2S(-zt1tosnhono|Lf-0aboP9P%6NgyNdiSiR*sDpjVR2@}BY^yW@
zl4a6WOYEhnDp_CFl@vQF?@K>bQI4{{C^|{$k|xCs60&{t&`~nLfSiYfs=P$$$v)Kq
z5v0kAKv&&P-c$Fj&I=#~dZG|zd_c9FQLMO>^U8YinPu=)^?O(Y->*0168xDwr-gI!
zGc1?m&Ru&5J90n-k=}%|a*d++gcGH7f5jE0DAtsrfDUq<UI9=9M3EGbQ0WkcGB2YI
zi-lEetSpxlJFC1GNm*YWxFf08iyf7+w||`So}^SQD^}H1rGQL;rja3FR^g^2=;XD!
zOu(n6qWZX8thXpT0i8fk)SiH^pr{b>i61|#`*!Gw$`jz-D$857wW9K*{74j@C=e+-
zk{e3Ocw`z3bRpOoK#0>}TB0MNFwXUK<J!y^9^N&pJcD9iZ4FMe*zq9M-pf>hPcJOu
zDgkt<qZ5x2V#k|WFk;ceK`U>k^!C|X(M`n{9O&Vc@C$|!qRJbg!W;5;W1_1Uqh3G8
z9c?)5cHu;011{O>aJkBY$J^WSDwX2JW;^bsnmpDS#B_%TLq47-f!65_qpPzQes2(d
zmk(i&7o&b3RtVU8M@R5BZ`3c%&*5=e?F02zTyiwyg0%{lv<5suP?HFR8!s=pk0q&Y
zlvbc}g{c`80y(+q_wpB4ekjyX$BX;tKp+E(%HWy;4;i39Kq4LeB}7%JD(&FGg9^yR
zUP_1^RYzix!2?vqCa_&zURKUSJ+2Bqfv5~@puQGGDCZ}y1-4SjA;*jkfu5X`+E-T%
z+2()zkN=@GgOTH}#s51Y#`b5wjPw^$E=69Z3yZr^UZICpCoMQ+EX^9uM?_HtU5|(}
z9a&YZDaX1}DZ^T=sVt{8)XE5>VvV#iqKGui)j(gVRZ^OXNGjl3g`5I9HDx*1lT<-x
zWZtr{tEsfBQxu&jxrmyJ0F98N>eJ{&Wk#%RL=g(X)r=@R6?Q84O!Dw<&P#coKu=(&
zrfP4aRi0E_<gG$aDaYe?pwVejRa1>-yB)!B2!6s%M%V51`{AQKRh~!D;ld!N2MEA}
zt`3|T?!l#zKAh=r;uKZV(V7}u^?GoFAbVlwB3_-G!J9p!c){w#>x~|z;qg&(HG0dn
z@K@ImrXF<mcER7%g+O-*p{_7S2gfilG=^|@A40xP4AJ6_`#Z4cZpG;i2QD<#V{cgz
zE;ZNTyrTgZXmJnmy*=(WEP7lRB|wJ#+#l6lCvSWgEvuLN=nZsYUWT9#^V}vU@U`V#
zczmcI4+lH(GHv`9s1)CI2l1ObSAf5zq7e%#z!N?Q@Z=I(oygB^w6ak_5v{N!1R_$7
zr;b`86%SIrCh_AR|5%wy2(Sb|0yI%rYKML4Wg!C&NJx2|l(9+(hy;}C=*I#b6@UUg
zDbthdOv(!7Jk_#5ftNb&orKtJfu4l?z{zW|tn$7LlpvKZ0%B2>VsBMgs^=u3ZYLpS
zeo{fR7Qem_=tbI&{v1F3MWo1OVqzX8#X4v;<cf~msAN(c2^&<$swJN)Ds@=5bz2O!
zZI5NjGzM|;35cOWN=!;-Mn)e=Ns*C+(~y>yPH1ExjhRJYWDzjgdAZ2V&qE$PbADlg
zqO=O>oyBGrt#;N|`>%?9lrppO3JpriD^OCQMVYp8H3?l^B~Oa9CiF~Fu_CsYc2iH7
zi9MEbK!K(}RV=g$KNWfcEh*27OZd1v?~|Y>Dlac5S81jqDo;5tskP>>f~}z*?SxsV
zJFGxYM#u2d=DL(;&UT=KHnp4c3++zqao7ovPCPI?fEz7sxLMnbr=4Cr*w%p?p#UDG
zih6Zu6z}#=;RpUv{Q2-MyyptxQn?<d?M^IveVFMF)7}!QJwAj6!swxb><bfieS-+l
zPWyvF0=SC+^dju2$5fRWXB(UF7_HT1DzlyCxw!0a#>tL4Tn%^Nw6h7PJ?%Km;}5yo
z5b7bgNBhtj4xpc(sk^Nmdpf&tttX7<Cr0onmFB&zt$2VxFkh!4{C;Q<KiIWjx>H0z
zw+gt3iCBHT>U5)Rt%TUcef#z)5Kti~&^mJD$gLV$sT7i7p(I2Fit>`>Qj{RIq!Lm~
zD(XqzlS&bRp(v_9_=7)CYSCqVQE^g!sDkcxdn+}pQkf#_ZrHHlmI_oYs~WPMRAxws
zMHOhvHj)BEQFm&&qFO^M%2RByx}Mlvfv^Bk$`<AGuf?w~BGCJ}qV@!Mk~34gcu!ZL
ztc0SeTsTof(M4BCF{ja0kP#GYW)WLPtlP9b5gWF~<L*s6aL?u#Y>7$a{C3>6VT+P?
zZ`_KzH*Cc{8@FRU*V(uwmg_UOF$v4{TVwb-lFA@{+vAkn5|e<fv5D9omxLX0$%v8R
zoDxzImzak5<TNCuWgwaGOv%VpQk_rB%t8jUpfCa>0iINNm_;G#wJ;fEP*BQNf=;!x
zk}@xC^wKjk5F7t7@KoqUS74~nlO`&+YHRrmAS$oL(F!kZV1Vk#Ckl@WsDp~4ofgw0
z&$J*k<hc}MjwTG5bhz3b#8bg;JX_<ycRWM*QD_QZ>m0;mfgo;n1@Kbu5WYDygCBQK
z<4=c{@T2ZgJZrP#WK|tjS{>Nc<;8H&1%H<lT?AWis7q<fH`v{eo}NB<X`!9n0eJfZ
z=#cJZg=M&0=fHbCllUTS^8-|Sj|UyN>8i&2dq(ixseXK=w-e9ycj83QiSck7f*~jR
z2|s_k1LM3QXZhLA4EEsRu|eD*g!WMFUZo;`s@94p_+#^?Jd*II|64}QgETIYlzAyd
zQGh36C&Dx(r1C(8Ty)nib(u_6H5lD@RfVRgDpf&6Z!fA(Rb&z}uiDj5vdgj$)e5T-
ztwy7dCn4*r`;l|Iy<On#P?u0YlZ1M%(dTvhKGx#b5rRFr8Y3!Cew&vT4k9O$LaKn!
zE2ChltYVT6da2;lDvg!}dPa()b$4$`!d;saaL0xiCGXx6k9Bu#!5!<jDfyXuwkmnY
z`tA7a`WW1~Ar^OSVs4GYodn<=1mPV7;%7E+T{XGgotwBW^RCsrdvolq6u55KN`P+N
zuH>d|JFuB5Q*w(;xhy6&P63ptL}|7n5A-oo;R*CCQj1I&sudV&mXwtuRR#);i#+!t
zTIJndYbz>GX_L1)pnz;&S8r3EEaoBf1biL>L7F`{+uGnHpxgwW+u=l@xfy<48TyO!
z@DT0i8~z@=Rp-X<1}5>t&T+id<iWiJ%Z-r%yi6tW#>f=D+%<%+d;9QSco=VW_TWl=
z3(gRt$Ee6w20~cu3t%$r!)O<+EG^q;sGEwf8-am7wA1=_QU&^)4!8=-vDaY3Q!N3!
z>FdF}3$u7-z>hDCc<?tjjsiDM;WtM^cq!C|bG`=b9dKb`BmjRy6}&Z7XrXeR^%Hcy
z4x9~j;{E{7v#}XB`1vnVsb489#Fu7g73j&315$b>`4bg-a?kVr{>4utkcy7mL8QLE
z9Uih=g`5fz6-3ccQA5^O_w`ADxgB=$zPwh~lTd4QZ$GX~)%PUSb5+-i-iI1;4!57D
z?BkOlxfZ{U5Tz$7kJpS`HvCg8?>a%?Sx{PNfu^(yx(W+)+DJ<f6M3P4Pm1P5>3wz+
zmEG19+_No((<E%zk%~Jv$14E*%=#TzcMpN4<PLo1Zi4Y1f{x((?B+P!!Ia@!)Rbj+
z5}J2yN#MR@zf75D-o<@N-nl6*0zpxe8>u8W$pd;tVJ2W3RiR9`?1;nG9kJLR6NjYa
z6s6ovPwQK4Hp3znFH&xJ8}!8DimgsbOZ^q4r$X=cwt3R~LWQ1`XIiSO(9~jAhHvWd
zxRrE!JxaMzYfB4So9$?8Zb2IX;WAfYn5yy$ZRV48cDzfNy;awNH|t&aJVCUV_S~V(
z$3(ve7k19zbYCxS_VnY)&;VZR8^ue(0ld&Nj7NJ0aDhru+V@>2;4XLfU|%qRVYd^5
z0Uv^X5Bdgr5vCO#=<mfaVc1bs1DB=}U1eHK>5X`DpdT;vcHy1*A^hhH2l1B|58?B@
z?RaL$gZsz3aIoKvNp~amaGwi}jW|w7PuEy6VXMJmuS*$f{z<nVuM+03bN=z7a(rWS
zN{FIdUUKWptt|a91ba%UvE1)6;f=4}{Lyd_1r*&IBRZm$kibLWq5@As-A^>!B%<eU
z2abdauG^3MPk~7GuO9cGgO#i!0F6FJIljD>X*6ur^IC)7uR8>Ha%J*wAQOJ=`*xp#
zLDPh?!fKQkSHYyOgQ3zwi%S<sXDIeoLvCoP{MHHZKD$wXl)z~$p+tCXj8XEA4ODSd
zcy|zfpShdhyIZDA0#IPMj*3pn4V-6wX5$WJJK4uw0zgjHRQGx3YTiw?cb9;VN^w0^
zqcSQ&Y>G16^TzG*x8S!UJ_%_V*(fQY;^PUMj7EMCehAL1&=cT^J(V)Wq?BZ(Oz)q9
zo;sR}3O!MI@@rtLuZN@EsX))^>QLrgRC&?>w$)D9wX~w6#et6IR=5o&9JJNp)y^(_
zzTQc7H;gx|4m@SG<7RUkMk%Ems>{*Y(~iNQ2QG^d-Bv3WxR3h=NAc#~19)xsUc9z%
zA0C^T!hHnVW8q#rJ2Hj``1<frFD7YwyS?1LCxijQZkSM<8yUp_6{M@V2|i|TQzL;H
z!V}}ecx84N@2o7~gM&->+{^%;UmV5#ll@rnIkDJKhm(zVd|_%HPjz+UaJ3b))*1}+
zyFB4&$AeA}ZdO#{b7m`k(;LRuq#N4*CY<CxUro8p#8U8Xk1S(EgXoiPbOD2^n4)c~
z3Xo`c$$FopuA=89)OF-|@}BHNT}ORSLY7A>I9W#pRP=hXJo<dpb5{4ShI+2*RNY2G
z)|J=M>#xPHGvvFM@0#Da{7E^xd*L{$3{L3E8c<5bS6AHvYgN6%45^e=T9_E6zJSn6
z%PGNzZ3$Fc@rvyf1$OtQZA#v?F_L!^Xm=5AcPqIacizM0gw-ddlnq9w?DLaSvA@bx
z1tS$8m7v(`d$uM}dBzijiP#>Wf}}KpDz6Zw<+Py&=?Ox=PS{B~o?4k96%jdkxkyM#
z{6y$QS7Jz06{S3H6{NMbwMto@z)l5TOB(^_aw#LL2=K((`r2LaJDEKGZvJ{a-q3*G
zu(jcjdM5Eobu*r6cH*R^2BU^b^!Xg<>knXDR2pq^m%9VrmS#+Nyf`{OhGX;NxUy#s
zH>bvMlNR<$(2q-9KHQ+PyU^c*m7XBxXnm*px-r(*gW*sZW4(Qt7#YSe)!i`F<MiMF
z7P;=>u>q{~`f+i508cK?;l(}6c=gCJJhgit9vYp+YvVid!SGIewW9}L3-sVAeulH{
zZj2f&SZZj*l17II^%i`kz7;<V^yB^MMM|fiD3_J;k213gJ(YM2^~R3|Ks1D+At3WA
zFa#1Rcm#&g5Ry;<DpPfvXtfo6JT;=1-437V*XnxeG1dJ`$a~SR)sS<P_tcb#-luxL
zYTW*L*5dzk$lpM|fBrrARsWJ-|IWE%gkC$UjUA{cu7%ZNr#ov`Mn|g9lqpJ2n&wIO
zv~@)lW-6;B0xANd2&i`8&JA0ml8{@!Re|1J_wcnOmwlFb=f;Ry6XkZ#mRPJ`&AVjz
zYTiu+sOCMZ$N4PhMGcA-7CZgfO=61^6??rkArp5I!gp<!3X=rHr(`0psEle(DlVi}
zS9)QXl^R=to-|ca``k)%mDt;AZlyd=g`PT4XfdsD1-Fr|W{pivXmdK@>=59!6L@V(
zx(O(03@b{{Mf)d1IQgYAgU8-gWx|8Bi%;sR@SEWg+{bgfx4r?_x&4aMfl(^F-tJC}
zP^FD@_oAEN3K3ufp&<HbUBe+4M!P-OPtaW(9mb{6K^&#JyATfI3L$txU^di;10w@C
z!1tGeK@2+Es2qJ5A0hn4hA}rYi7~3eg~=f-jrHR&0s6q6y?EisDZFs#6kc9AiI=AL
z;q8%K_^5Xte=xL!-yWXE7dyjvxU(C}HalkO>ad?G_cZPDi+uf<z6M`kTIQWB&%EWu
zB@B>zSZpm{L>h$hk|P{c0H`;8^v(WBaFAuuucPNxu&B$_>Foefm&>}hA74UUU!6w3
zr>-9zvcKqk3G`I(MKAxPh(3RbXl1z;zs``q3I9fNP4fSr(v&RC9)qQ_9Zhvz&{s6V
zpsj<aqYIAKc2rhIDl??nN`+2~bqVPuR8}bj4#BrInE*;u;B%*x!>yLbiTaXqxH}0d
zNvX*Aj8tsUrV6+eWyX~CqEmsIs5r&W#?i)dJua8mGL?+1w_4Umi+wkbea}`&DF>8o
zcVIIWUs^^ktaXj3w$@XwN_pPLHMf#db1O<Oi8l83kyRwc-YO%iD1B{nQBqb6gV}(F
zrUtY+98B6+g3Uoi*V-nE&qDxqprzHJfUkoP?34z}9yf+Kf5hd+#g;Z4rAm6z?ZE?{
zb{wrS;c?p0vxM4ow-3F&0rUg{=<n%;m(wuSSsxW!x1Z1pc+ul`W8CM(6t~|+C>|IZ
z#Kp-;T$!50;Xdw%3hvTyFD~^4aoXp?TzwU~n@s3zucg}S!s7e{rY8omI5~n{6C*e}
zH;ao4JMqx&{rJ*_>v-$P8N4yS4=;zu@OEGr-|8O6S3M!T*xZ5VyZUjOR(z$!jsvZB
zES6W`sHqw!Y8vp^^gOR5@eRDNR<AF$##Sb)@2tgIe5xVO$55_M{`Sm_uRvGY3|qAi
zwU!QON^8(u??z8}6jc@*1(TK{Nt7O;w<#tE8)CAtB_R*@?8wAtx29r!Y$op9mV!IC
zCMhWp6c9x!B&4j4fZ}T<`JPmOs41_bQy`|oZ2h)m?uYAedx4-pPXVElk#!@v3Q$5%
zV7rl4Hz_5H>duP#20IL*_KXo*t3prgt+W?Zp?ABz73hi5Q+i<(7AT`DS6QpjOv~vI
z=n35HGIbJu?XYv&+~R<p)>g{(yaZm5HnrR5!7wfG9;&_-D#SS+_oSl<2b!!n&{&NJ
zySwnn_z0Fd2}fTCM!EtR>FrfUS#f#Y=p-~lVs|+WNP9r;cd#>n;jSR&`}%QeavBdU
z@5Qr64&lMsNjx$(gsbj$JjQLVI9jmZ*?<{O6J}|h7YMxNxd|Lvn!};_8SEb$!&Omw
zr_SN+OV{xBiL-cRVGmv&ox!ugK0M}dE8WjN$N2}_{kY=t;bLz$4tYCp!t24Ma1WMj
zjd<<+Mbb?a(kj~I!V=o?;*yEznKkIG#it#&L^!9@WBXBFSP!e&P4Kx<UR;g3>NX0F
z893YgWJM)(Mok2I8)NebyaH@bF2<(#Jlsw2t&h!8z<2kKG~B&CMad1E-x!mIdk8Lp
z-Flg)6%}<TDYd_9%4-#dvc7;$QiYxZwN>bStP&$&jE3HNX+KEtiCvaQHx?FGpt0Ej
zOO^FkpeGhr8YW9chEjX`&)8e(Yb!lZs%k8#Z?q9|EpQNQt<6$H%c+2m3eG{**U~D}
zHl+qvdSrCb?h3F52&-kPv_l>z4zxAnT*!wb-Zq>LxN$WYz>D+q_}sn~T<jmfB0<;J
z?%*$uA09$5<frZR`!O*%h_QhI^wR$J_t5&fy%_a%VJEHU+5Qo{ao{Mvu=fC7oSDT-
zL!)@p*N6YGd<>r(9K-#+gx20=9GjiOAsNbQY!t_*r*MSOyLazCJg{#+o<DICPpurl
z<-sA`J2Zsr-90!%3;$quHy$BepPya87mgmsy>nAkl!LfLyS&fs!bo)uuI}1Rz(t^^
zTvw~VrfW)XE&esdt)hoj=oMyH!(7>h`Wg=^D{L_88W8RpM}O}GEY%ih4HZgd#=0HJ
zW!Mm#kBxB!*pgU;jS2a<J0=U867#Sjo+>adOUX^ExiKbPf!&7K3{KMsKmi?J??_j2
z{dNIk1bhO#^#rDpG9Lv%6>O?<jHJ9)RbRYPo_Eg{!f|s9Hf@cg+RKEEO0TNAmcWbD
z+N#i#ay{t<l9rLKG+7bYiN#f|t=dLUdSJ+)0(JFuXl!a+ZI04{mWE~}Tbrb&)&VD>
z*VZQ423JQ1m7a9w7lr0UzsH4z&UPH`_TuzF2-hZtadBt>=fWYJ_xkbl$OPWrwGYot
z&ft7+4`$tM7$6iwZU_3P@Orqf5Fr}w>Vlt2u#4OEg}SjYJcix9RB!$u?j4`Ta|^rh
z_Pzsnd2k$Wb&cS=lY8;b#4fxtI)`WX?585!he!7xP^LEts)zUQ$8#r7;@+K0cyvX;
zcN~we?876=yKrNA5*J4X@$k|-9$8w%rIAq_<GCH^@5W9-dpYRG-cSHz{1Ldke;=t9
z@w2O!RfKYUA?%ZPZY^JnwfL0ccIf405NxH5RC^t$F*{IJWI;z;4<^T#l-3!QCauyy
zd0kA3hA=B4!1Ay$z6hHWi#aVI^wiA9CR*Q(3IMZdg$co!OhS@sFoxjUPSEYhP?kx`
z{N040fN%YFfnl1GGOsGZ^#a9JMfb67j?M@SH%f02f=`s*27+&UOd^SGLv3B7Qi~f6
zJ(-tbr82TI|55F&3O%XBkRBLjvzcE)0~#8e(A?OJ=7uIT)it1{z7cIrk@k4fF0Vz}
z<_Yv@L46({{LC(IC;I(fjC)(Kx5temgx=W^s;P+?obiTn!{3MJhNkiCz!;t#9mlnv
z5capXV#?ltT>(Fq0znM6yU^R|M^~r|fxd3|!a-#um;O*#vAd3%DzsYknC@`ksmWP<
zdFOt7ZTBHO*EfOZ2d41S#3J54cnV)Wae;YWv9T}hKZLJbxP*6Z+=~|u9l`^P3wY|#
z0laYXIG#9s05|s#e7l!$d0`Hx3BWURv)J9=gT12z*g4pZSt`-Rz96PToj7}Fg%?t!
z@u#pZTGgqq*Pyo+pLRqlnq&r;n>>WvbPG!IYGKs2pw`lk!dwFy>O7d6*^4%3J9LIs
z=*6a1B0j@_t%>Edv{ZWW#n?*qw<#e4z|Bdty@`cXeR)_PmrL+5V{>jnFk*#84Myhg
zA^7g5+FQS>`b7P0Bpf9pDljzyK&m}?HswaD#!a-!GL>Zly-20T$I#ofJwbuq=56t)
z&>9tsTPcGNtd{2q^kk|G6)yvbCMEtOdn;}9#M(;d_o}LDg0r6BYea*M$@#`wn*u!t
zA?P6Z>;$4fPqDT>A3A-V1fQSa^P`7qZN~4wzK|1#!akfI8pNY>J8{Mx!b729JU=vr
z7se*=^xy#Qb+zM6V?Fj(SK%C0-AQS0*WtosXD9l6RE^ys^n`-wrRr;K;5pS=P*YV2
zYqbV`n;AQ;b$GOU5T9FE#w*NMm-gdJhfm|H$1dPYQc<z*2;QU>er9nu-a2_2@7}n9
zw=bT@(*)O(hgb0Y*^_wT;yFBb={z1ga|#!hcjFkrw`XDu2Nq_P3XtWgQ7nxTy8aG~
z^@VWsz+PTRqUk<{-Wu%I;$LGZl^MJ+=ckTP=~XEfS6kWui@p^FIR?~NoS2<nMxZMQ
zO=X4BD`Q=JnhD7{HHb;oV^e%NHYZeIOHu`4$N8kPNG6wJlfaL;krsGELII~tT490V
z#(06@uR?HByr{y6@{{Q%E)(zx1l96E$xX2WMNxuOfr{<@xJ*yAw-M;YD*b-5a*NU2
z;#5XoQK6@%l<P@lMq+Z3QkfyZlS&LpDbJHJ^Q48gh0v=eOsrIQHfBA6C(yIo8VH_7
z+FU9=+TJ!ttJ0ZVpy%`X;dOKB=~UX?9q4l5gvX9E9w+V_=)+^PGdSh(;4&@n{eb`;
z9t`6?pA)ar4!_+Q#A5{LHG31zd)jfRqYbCSy*M1Cz3lG8&R_`rJXXE@5*bQit<|Bu
z$&8TAghgAeG9=X-3%l|1+%7ykDQ)}a@W}iwJWqA^)|vD8(#1=7{@_8pbnFPee*X=;
z`{)Ds+?8|q{QcMQ_G1s?3y(jHr>|eZ)#FET;qYOcKYA4VXn}Xn&EVwzz1TN9iKWS5
zOpWxRw>yASCy&X`s6se_oWO3i9)lqZW$9Y1#itj)(!P=xra&)0vl>NYnXar6=E@cn
z=UJfFHezUC7K1~>${3(Zla;ts3u05v*p^g@9jQjdrke?@3T&eS+>)#z^a#L|3T#c&
zU|YHtTT;uhi4fed3O^;|^CK!y07xsmIjMjuF;9V@)Cg~k&8CIUxdlEc7u=edh0XCY
zzgjs$*xk8Wp0|Eml2WUyRAj8y-bw@M^_)*g&O~EN+b!rtTU_atA)~!WuMB~n3Os?I
zKu@){0=+6iuZ9Y*P88oN^qOt;u+#Q7*W2J|wWF2U*5**`t%vaP5PHEt2$Qs-XNP)m
z&g;bGj&?joYx*!1)9HW*#~pTD@VoHX{3uSg*5eg!^ZoHTyf;3DXQ=KT?g`==E#%Ff
zKHMA_#`)ep91R9BUR#ThQ4gn~909AIugzGYLOf5^`G_}wr~8NT_~-=oQh8pUp2i!O
zF5`>WZ{VFr9>G^1yooR0ypAv3yoL{7coOeE{Rn>e)@%6jSKq=nUwZ*xdj1(aee*tC
zJboM(j-SMl{VUkNcQ1A=%wsQMy?1&F)5HDf3kNVhF^mHT_D4v!D(z$<)fxi62n^St
zw-%pnM7NQY;%9+gL6(*FwhsCVyHc)KR#=0Q!fFKkL%go1V5%}j8Yd^DTM?I5jpS?_
zcBGgIJ~QGd6XP<gur;NUijK-JRY&0Iu{l+XO$4Ci`ot1~kIGLJU~&o7U<s9Akz$2K
z0ZMLWN{#UP9aMkJ2>i0KF^<YFK9e?CRece)cMoBws64ef3ss)lXj<Cp$#h3T8mgte
zUTvMC?4*IRs`8|VMsaCLq$^rnWDrpmdeY}sMpIFhp1GzL7J^R{UtOJz;H#tJt3#uW
z@M~&7OKTGqWg}c3dEicm0zNOn*Xi|Rs55|l-C<nl?!lws0X!1u#?8Tg93cdcxZ7};
zHgS))4TF^$oT_fb6Rt2G3-;muE=~uB@W?<v9v>RO&HgaX4+L>M;KoHN%9m-0&)KOS
ztF3sVt_5Fg_2X-fE_|WA3lICdaMI_&_4!FWbZ8%5zH$j~+;=a&^vHww+LMpsJFh&4
z-~Elx<Bz}jW&Gj0U&il$^DFqBcfNw(dgrV7;O)2Y+EY*Cp{qA=@#I+?KXeTHmiJ+K
zX%|*z=5cU#9(!h{F*Q1b?(QHC9Xt?`b&@2WAWmv*Z$VFOAGH>1@vk9xjV{feKzU&k
z%(@OV)C4GEI#FwJp*X)9uJ&$BOwLf~R7aqflwm_$sul6+wMfZnKvGs6lCqh(HpFMu
zU`K`-+a)ti*p^|$=Cn#|PS;^8b8~71Hl~yjfW;92GQ|RKCj2C~r<GATmScMgmk9u=
z1_gW@Vx=O4)>mqRw|@-14LhWggG!GoZv&?rq@shiR#Jw*x@+SOY>7!mrP+#FDl)0D
zReNQK$`k0RBdthzo<L8^^8|b<^pyU)W(%Qbg{is*7EWudwMtjCI%We^UK62bC-hnz
z%>-UMJYE;vu6CuZo-|MnQ*BMr^6qlB<Gw%-uD7}HNM8>XUI%8)IvneCW7gS>prI0h
zoMQA9XfR*fi2JGbUR>Hk@D1ack$${5GmMAFdvI^BACGqW@M^dluhR;@6B@t|{e$?^
z?rHqry>s}Zv7LC8+dMiufyWQ+!Q-b6<MGoc@XC#A`0^8v;=Pxi!}s6%9RB!QU&CMi
z@I(C7@BAkI_}lN|x8MCLe)P`Q@V&2p4PSZf4LtqOW4LkQ8qS_Li$ezvV`cYV0&fuq
zr{}SIatafJ0|<w@aA;-a<7(O{=qY_0)}Xf*|B6B}9k-y@3`0d5YRx{Sk#dcxgV3u%
zhf|<8OJ`Rb>3f@3;zC-ko#3lQdY&B#8MR6#5`GDUUo645jh1&?h6!7lo70WhNbqf-
z+S`y)j*SH0CZ+&zGhc5?q5_mk5GuczOf3>~^@z`;HI{aOakRg&w7Ft+rOYojLjj*C
zzD;qW`XbO%fVT?0^_!(na4aouf&x96mS_y9Z{i0e>;!ZGPeHK0lA`uxs&+>cbr%gi
z>2s?-^v7T{QtDPIPl1!Eg(*YC*VNR&My1ze7vRy}wza_FY@x#IfVZO!ogGf}cwFe^
zd|#U#GkouGYYPt1YCp#154X2sXK_AGQX$SY*)YOsC%@=mV-p5iT5xh?3{NdB<E{M%
z@P(ymJU7sZCpz14&0dH5J*{|@D)G^}I=t1|f!_)b;lEDq!vC0E!5@w;;9Cnj@%-F0
zu1=5Q+MWeGb^0W}_~1kM?rX2%cfax_{L#B#$6x;7BmC_j{T_byXa5)e<A?84<$Vp`
zC-8pw-na0rFMkcMJ@+ymy73?`p1FvlhmPXFo_$!ETEPB^SuBl?VYIggU49=9(E8pE
zy)}Eg7XP|JuEEvX+g0e56*NIx(gKsV9n}Vx(igWl-wKC4h{>rr3MHHJOrCX_d2L9|
zwj(jUp3^3X@=MC3`b(=JtZEUPX(jY5*phC-_N*#wV{R73mtnw`4E<_Wt|~tbtu8@G
z>$`;ta7Ts)aalU-NGp%D0VMP`#ph#dQsHWupHzBeDA3y&E0r12j!(++sPJfS@7WZm
zKu>ZbAt+ti1ctdq6=-O-!(3%iz!#lTgDd5F0y@>+3izbCia<|&fVVskM0u!@fRoUY
zvOWP{odUgPrKg6o-GO#zD?*(f1l>+5yjJuQcKyvZj5SqZ*<r<4aT<=9O7K><8*lj?
zc&yfl>&(4o4QA`jT-S=lzAhY{9>b~eQJkhFz7`JQslhJ1J`%)hK@Yw$F@%?UJ8`|C
z8ZS{<zD*nbdt=l1{lQ86<@`SU{>(0XmA_Qa(}Ie<J-;}OXD^(`S7}*){AF6%ci+K(
zc=sDrci+N){=xU~7r*l({Pz3r;(LVNNAG+C-+TXky#LNO@r5_v!lREofeYs^<1nGO
zpGt4<)I65Qr!g;2d!uFV@_Dhcy!?;stpHDfp1fLv-dcR>5rLjsd%JV)Buev|P@3Ne
zO)=S^YlpegiIRdkG&gh-dh@U~HWGR!tSi-YA-B+lf)Woha$1m@)u@0^)Lv3{J#B6S
z!A1CG(*oxchRhu~HOSDmBDu5yvANX*qMkNa%KNm~k|yPUIzq1;TT{!ijquwlwZ934
z1fDd2E~M%!!j`0bY^2)zm!PMpJ*6f$QCTL_^qeBpH8d(C9jlcXl4{wW3|TLr6X=QJ
z6Y#0fi#)`;QW<hy8FpT5F(GI&nOB>#*w9E7DT5HWsO&`H4Td@~;Auy<y&fKu7JY^i
z>^J7(s;vO~3OC_7w+=s;cH_sxZB%H@_=d9zZ+A7}*{)_Bswe!cWjNl|hLb&B91DAK
z#BIk(yBSXgTJc9G4g$|UiT`-yAigy<j<?4KsOqNi<M{>r{@hM{I5>v4yZiC-@E9H(
z9mRp(5Uw2Dix(fbiLbx*Dt_y|ckw&lejh*hCgJw(*YMr<zK)N+^$mRcy>H+Hg71Ux
zP~}needGQ2@Wrox6;HhQ0?uB!g2Ts-WB=a$*fYC;T~jld<GzP_x)r6jB1(@Jl9IwW
zP6T+uz{mnwvKDLcX@*>bA46|x_9TjP8c{@MmF73VsBtRbD=U)L8D31$(KIwS)810)
zW#u}Nl-7vCQZI4}oJwZowIC(C5y?`<N9e_8+Yp~i;N{gJCbt&bvulv95$H7{Hjhd$
zleRZauax&~O|K;MDwP`FEmVEd=1&EmsJ%!=lwLISRC~K&yNsuoq`*y4cv73If^aqC
zQnFBGtwl|(^%m^ZsoFC`8Y2tzq-;-xUb#9-GC^0VEk~uM9C|{~q?h671$s3~`@KeK
zrzaz;IGu2}+R^Q5Lr+TsLbYZ%OLDPf)!@x=1HKuq!Hdm#c&FQfKi%C4`cC|E-w=M^
zUx#-bdVFuxi#NMlajss6BlarnZn5HU$cNK?ow&Eph1X|#@%?>E`1#|H;J5eg!RPzJ
zgj+Wr3%KxkTFs{&ZFnIV#7(ywPfkqW{J;S2TiJ()&!52?Pd|xozWo+{{NcCpyTADn
ze)Qo7_};hQ!-oXj``>y8?|<-3eEXws<2%3kA-=`wH>d)?^4>dm?)BGj@xB{4eDWmr
zuB>2*aGjf)!o<jsGE}|a>&Ei#-Rd=^q)b*t-|7NcvKDLcX-4GYyk&1^P7-(xD9o~<
zIJX}9atADxE@(=dVXf*=px4~m8c}*hB|c<iwNf3qkzeFOc0TQHW>W-wBxfSYnUGVj
zKrbeTfXkw?%dlX3PBpXYR_@5Q5Pn8Q>1|2XVrv>zp8~ydY)`F-fG>%*m(bgq#7r)r
z`ipdP+Z3-hUrEKr9m&|l6z~c3Hmx>a`M95MBDTe+!C<aJt*!1>?X3zuNvXk=E@z_h
zqM@fgoJgnDKwDOf%JLEzId9hMQDu@w!B*7OHlV4|j#hgs+FM)D-e5y#lMUVVHRvbw
zyrsFg(rL$Eo!f=K+!@Au4jn%9RpY-M9s}Nf0yw(|f8edh54|@0&E6^e@%$)0-{r;?
zuM>xS9$cKB$AeT-=LdbbOeOeqR|j702@q^8cqrt-vm?Fu;;vadLjXR}AI2jSgE$fL
z;`w8Tae87L*9ode&Yj2WPd$lmeBn)e|6A|jN8kB2zW?nH@SDu<edhyw_`&=5kjp;$
z=tF$>yC31(-}w%{MT`5+yYJ$)&wUQ}-hV$1o;Z%31nA`4G)Biq(BI#Ku7FQbw>x((
z-MXw)!6z9(UF2e0i?#T4BXS+8&|927LEzOZ&@0JnKxJ73dfKuUSd0z@dNN#GCB@CU
z!creW*N&{bHssUdW(xFZb8|{uNH3t8%eNz?um#CQcElGn(%OpBvvO+Xv|1_S+nQ-%
znz1#5(4*D8WpPtA5u2;1zA~kUhk!38tqj}KO0bz0cT++xHWPl*ghgqnOtmK|N>7@u
zsJ2$Mw(AMKt+6T4P^Q*3G{90#iz{Vx%81JHFma>uT#d}EEF?&|o;(gRDN%V!TFL5D
z*)#;7wzL>J8D2_T0keV7Gh6tJP=f|*q-RE3eFHqrP4G9@qldP3tfd~44K?U0%EsgU
zKK#zE0sPUJ8}HPY<Hwy1_>0*d;NoHY!^|-LiWc_If*ruQWB9v+`|!c=ATE1dIN<NZ
zN}vzBJ$@VvI&mV<jEn7!xYA+ArH~uvyS=#5-H98tuqWJYIM)}#ZbI?M!~o7N&Eg`}
z;Pt~t@c5Ohc<sq2@zpoqz<1w$2j3^;zDub6CgJu2Lhk#_?|t+UKK%B#mHf^J-^K^u
zdLQq8;~l*I>TAl=zz&}j=<UVC>?DT92GLExb_Tq(^X*vPxl_5U1axXyo~rbuYnuGG
z7Hje8M%0flCyO&G^k{GMnxHFfQS7a*%+C9}71Ps8%CK>jqV%$gI*=i@w#<v{QV-G!
z+mTUB;0g3fI*?l2jyyvb3QR#H6}8gd))RJOb8Xn3Rl}@S04NG@b9$t9S5)8jEF-pM
z(CRYPn%(UgRDUU@*p^Dwmr{b*3=Lv4E2#RUyieNjQT2)4Rl27!MeR}D3H+jmqLLci
zqB2dS2A8lh7)>zo!|1h@q`C<egqn0mizV=4k`fS`l7zIJEEJcO5{OiLC8d<TCG`6R
z&}!sazvZ;HX4Ep<tX4QJ7WiqMyPN9KYqw#<LFm<2VY#&d2b$_|s?La~+p6$}!+@_j
zP58)V#b5S!;U_}__@A9Y{O9^+{7pv({yH4Q@Ar1&6_*o79d=CA+c0jf;eOh1gn&EI
z>%rNe7Z<t%I5*OV6T`jA;GyToMsaa=3dg6%v3GhD`xmBga?ehjU)hf<gw=yrF5|@~
zp1{}NdJ7-D_Z~j_*0&<i`|vyXO{%-^efL9rMB6J$??W#84prZG2*h{Zc?Yk)@(OO=
zybq_&oW#<uMNCf0Gj{uw$4_>JI^m_Ym$rM-EJZFUX{sVaNvTa%1b`o_zO`73PbWSu
zVd2%td8xfE&S@g_8Wr%#*n0YkW;(f6I=TkTPVYfSyO*q>(lgW!p{TM8Sp;5Ish6O0
zBb6XbCHT@fpThaHGA=9aKzx1+;_{lQ=xPy{+kn^{8+K$-@d^C0YZUnH$gROPLT^W|
zs6RbnsOMD5`m}_ePU+1dHM>&Qw<E0rG3jD`r5&FPag~cL(v(F8DcCNfvP1?GRh6Dv
zlPisuGxLg2+t36vEvbRPGwMyK)aXccBeWG7<QEhmAvp;#iSdX_PE-aWkZ16g6_=vC
zu$aFF`Op*<Qt6fRODTrcpoQI3iIB~T5yEbqxy##*c~=YOsK%DvtvD0(;hd)fhs>3D
z!cmJac<S-4&xUW=&G_TCR{THy5dMcZj6bby!JiR)KdQFkOH`uwnoOA1S0YeRj=8#e
zJUBCpV*^xkBLv(CRouuRj*SoF;OGzzjf~*f)D(`+&SL-U3>N37uzTk`j_=!xvxg4i
z{K;duapfXje&%U><4a${_dbxayl*SO6NM+x`|d~Ip{*tOIQ{mw-^cs!zsLL*zVVH3
z;KdhTz`fV6<JiFiSYBMf+~hch2K(R*cwld=M^l+REiFuGCnK!<QURUXFj-*tvEo~c
zwfOWxg`V<&AO(8ISD`1_K<G6QdQA%SbY=CJnc4%7I{=f(f^}v3Fp4w*WEHuPMSGiD
z(TVIbKQd{1v&wx61XGII5uYm-*NKdBH(}R^g#2b^lL9`0UQBKs5{nuUUuaXH7n4_w
zEtv*_(4;_bI{~;u;FqSs4l2HFRDJ?IY0k2ps!wWkw^Q+LrPWm%D=Rivt<)g=1b9-T
zE4?)0QZi6QfJj#|T_x=;Q(&i5Qs}fODl9-^TpVJyZbj^ttw@hcKuLBkD)OlA@(4Rl
zErms>DJwyJMG3lVt1w(wjVUU+g~l2zHCb_@%Z*drKAi0K<IG?;PW6Uy%;Uv=LhOkq
z8~(%eFmT}*{y6NzpSnHxb5|#RKDCJd+SQ9cv^((KIvc*;*o@~}X=Ry{w9PYAQpX1R
z3Aa8RpBkp3o5UG{XAi-%(ASM!0|VGKIE(`X<G%TMEG^Ar&)!`)LExP|b`<BR(5_v*
zgr^^W3}1TlO?>d~yZDIqR$%vo?|oNMc}jloBL#ZjA@sie!MBv@H@^NgJpbIYxPI{>
zj_=uvBa6E*KQ@lR-X3(cIZ&(9qcy(*i>@Fk7eU)CzzOXn*J!sE{|e%E=<S?4fs)*2
z6lF^`D$p}%=ma$_3iKw&ccPO&<_h$(3p!9-(S>}1F1yH$oDv^$N;{EL9*Dq?AWScI
zBQei`^inqp3?V|$iKN0-Bo^2e#TP@n8<*ce=rtj}pk7gX@r7JgU`0x4J(VF9A8qb-
z<_^Lymg+Bt$}c8egYBsTy;7yuMoelkwk73Zb6h4BUnZyN*c9`z%~gQ6F*2I+miQzX
zYHF47^)y;NVW(G`qi9!~qLdWmBV|Vn64q}(!kzaZW7AfYC8t4~l>=jbA!<v?P*+rp
z`oerTOA0YwW5T4d0&~VP?6p?nsJ#aJXk&NR8ga0#2}gWx93a>h8(VRd`J&s6|8ig-
z@Wey-ZwoW{zr+3bi$Fhrat`2sj?Li@+CBJfw-@hrc<~Tb-*SBuMhU=hjTQcyYD`dB
z&G!bW?uM~4GK`tN9?T5(W6#7Sc8^bDFD>Stg+(mw-ihV?dvW~85u84L9H-Bm#-+=b
z@E|SfE6+cVuYT!my!XyG@X@zFP!yhGYncK$HKk(XgZI9PFMshZJn`rwxN!0$PVU}|
z6FZkNH#&yl-d?z!ZD^=8!dGs={=gui7b&|F=qaERsi(@lwOET!IphJ~%JdfWq@7+1
z)t>zp^h~-I1$x?28^%T!&>bFFg<gI;iV3*FvQFd``;c4gM@|u^1Ybs>2N?vRz%PYx
zOrzz^rxi{s?ohy&kZ(tPUX!Bu5(=9MK54>IuYfPHxB=-rV1dbn9oc5uTiRUa4yr$?
z;D{9f${+@`x&pl&silZbE8(;NTa$7k0}`x4?^b07p{IaPx~0V>q1<ReLsKg>8fkY&
z=;<QQwp5;RSyF_&ghb?T-j1RT+fce`2lNT4sLaSB^zvb*wY5?WItWL9K`ur#g%~bO
z!;$I=Jlp5RS9ea}LQ@sy%k!~pG2(bj8!mSBW3izf$J_jPArQv<LqqrhZRtnB0Dj9A
zz@K%E;IF%9@aO&^d|;`^H|m=3R8=h&sVv*6EbGfFP^+mxquGewP7j7dU6>yn#NzM>
z#;Ne8#zwJgW)4eJvsj#-#nQqO=6CMI&gEqsI(P`jjvT}BQ>SqD!g<_x;|894>PdY5
z_0QpJU->fLC)89`_u+Rw!293&W<=r9)_&{VcktCOe+jR>^dj!RaUG`#&?7r{<Iv(x
zObriXxW5mbUJqJp>fqN~u`4j3z)cl*N<=BW)gRflSc^|DRxiR|$`C7@-!*?4W%;dy
zo`6q!W;DX0cPRE&pf@xygMt20n9WsKmtX8fUXcfRg<ceuc2V_(kyjER{0O}Qs=NZK
zKSD8s>k0TWOT7y8QkhAG4n^@L6*-Vt*g}ijK<G6fso2hG9n#8q0#!c5=hxB(S1J8>
zV+4R{I_yXn=tXR<fKO_4W7A8i{)!1c>8C3eH#JhON8l+H8BC>2Z)-d@#3UfEOoJwS
z8||zSdOhu}z7iD#p4OnHpD#msS~`sJDX5M~M)i&)R1tc*6lPW?stG+?ULJe}g&3gP
zn=Z-2WOfox6MC->`0?eL0bI3L<FG}CWA$~o=ndeZ$px(V`>@mQ!hVAV*J^9=L`x&C
zmzCl-8l3n+O9=nTJC5J7`SFDkBc3kP;Y4vM`gxq@l2TNblt5EljMk<`4EFV6gf@4I
zz>^#q8phbz7^bQ8X2z$mFiUGo`?<Jl7nb(y!OF@396n4~iP}4N4p*;S#-k5CgcqKB
z4xfMX4SfA;U&S{Gw{N}ot^&Ug-hWR4-Ur`&7hfar-gxyDJpJgSxcAZ}oI7?L`=}y!
zPEKK@yGMC=k=yN3hSl;A*1P&fl?&-t#GPDZs~4KQUY%WwwfJ;m^#W9(w|o8!$_v_P
zaRq!$s3>Ycm7$Gl&!IrCuV(_IBU2IRl~x3iPi0p~(3O;jsrZ5l@bXH!kju>EdpTud
zbvuz-=tgFlk2aSuq}5F;aUr$Tg_M$ZLeD|C*{S&KgkFoH_!0^mk!NrtF0U4e`E^uz
z7TV!z#1ecvsQR`G{0P2?)vX}-XmeAGuq`?77WAY{PaT9n;I~1__T=%9TVs%#laE$M
z2h7GQRO*e;==CTwQso)7FzPCx%gBT^F&&L@X-e9XGoVjPgMrX%rM+#*%t3(rm?$j9
zN@*!}XJuf2Nj8p|N^zg70hilsxYlmR;rd!EH8o(er3K-JCKzdx8{^_JQC5so^)*<|
z&cbW;?RdAL6My8M!1wFB@TRVoDz6TEwHkC#N!C<UM7qIgwD8g<_tH}K_YYv8ZvexC
z1m5@rCJ3RKnTWz$oLj^ms<GX>_h1j9cX0m-4pZ$NIeHW)PE+ljJ&S8sui$|P9>9}N
zK7m(Wc^PlK@jAZnh0o(FUw#{3{?Zo}JNrcf@wFFTz|)UBg8Q#t#if&{aPrV0EY412
za%2c2T_FTJ+;BKrVYl1iuo8L`Gs=Y|!cL&4*4nNLJ@tB9i?#Ul;uD}(QRt+_btup)
zE3hd84#+bz1bW?}QA~`_!ECCQ9vL0T$@ieNq8r6!A><bN72xF*b)tX}%pw4@N;;8U
z-i3@3KQhay0L%TzDDxsy!)b*V$;A=qB^5i7RMLioq86l<)BYAVAyeC~G-^)Fx6#%b
z74XI9RAWbm5iuDC#L@1?NjYDJR;kd~l9-Rp@j2Kc4VL3FusJp@VsWD?G-P1W&u-d|
zgw#y5Ib5irg%s#jQ0bMLbf_?C2|^tz3AY+5y(R*$IWYrGnYpm!<)XT{5Vcu3sEbdA
zFFgm-#T7VZvEsDJj1$#Hte7>}SFOcydo9j7?U*%IV!*72m;0RT^21tE2wmKEv}dQo
zpPGO@r6qX0z76kq2JwxCAU^CEz!TakyyW%cM0+b*iu0i@DpJZGE#_*3y22PF=!S+z
z(LXSR!6Dk)sVR(8<xNdZWA_pj-po9fX+QVv+@<89z58&O;5%~gAdVkDhEue;=cxED
zQMp~eejN`{^*utcJ@(k6c;@LR@zj%#<LM_J$CF(4@V)oq+WGT1d*mpN>{-UX#RW{#
zmXG%JU^d)~fetU6t*vNkZblt{38#i4N>8q<+rg*A8uixV(~jGqw`cL}E$B&wMk#q?
zHM?kWorGRJ@3~=2PRvK3ms{vT0l_AjlkY`Vo)_5!Uv6<1vI@D3$}fY^%hz>N+4%@L
z0#P3(@H&xJ=0TQLrtS#%ikyh2-Ayg0^(}EAxuliLnw1KU#6nu%>>9*mRU@Ua9vP)g
zRE~9s%Q7LB;FDe(QnnYL$!T&SfhW-WxWRI?;)^^bEe0|1sjybpqqW%?Dc7@@QEIM4
zIW4JyYR^o#RVAgOF)0I$$(b;7UXz`H((E*ta&yp9P=runIYx@Ln5#5nCxNtVwJL*m
zp7FbJs?&`FE(a#9X3RF!Vvtk6)d+{V0-Dq~H05Q&k+=g#Ypr;tXBe+|`|ws^Sb1{U
zft&&ysMKPTu(ai7!&qDny~c>H&Je<(UJMNkQ|S$2fbbg`A6I}kF)@MZ$w@5C%wX^Q
z0xldmhD(Q!;xLup0ou`{d-vk_ffXD(Z~(`S9wGc_bx)thc>+z~clq)q#r9sk!s(@p
zxW>FpMR<<uojH6Mr?}l=+T~qSlb9VF!9;f$%iaB$^#ssPYg^CH-dt0M8G)YaVOKA%
zTb5R;&7{BM>hiT%i%%ug>rzG#<^1x}IcSR7QI;<jw^<oXwASKQMq80uT%V^8Gt*13
z*4VJFsI(JBr2*s>(#8^cg{2{A4MQj>3nP#7*`-0MydVk<eaO*;kzO7^nkYRDr>kl&
zmGF~Hr|L^Aq1r2TBDu7U0BpvNT(P-L+>Q_={Nkwk5^`&4lkG?^mVUh!1$ts}<1=+g
zD=;B3s{&gSbFnQ^6rX@kny*B++mrIWSnP;Rfw8h0o^~H}Is<fOlLEa`lLnOr4Ge^y
zJ~0JVRCE@mCO!eh87U~pPDKg9SCN(i3qfel$VGcz5nKf&2y;KutYq2k#!9CT2Le9q
zY-_>(jyCLeN}IhV47b)HP-j7(tr`n;RXEYoj1%>ZxYE*rm-@zVFClz}s_}@$grzzw
zoK%7hgsP*toxtluZ{Hv-?Fhj~l{Ye?Ku>@-#p&$i1QrRUmHBxbU)qW52an*|fx|dL
z=$)X=mEqoxD0X*04pQ|UIdTNYjvZ3~c;duyoH}^|r#U^t=}7^jC_{ndzI`~fi^_3s
z7RxiVwgWww>+Qxae-I;04z$<R!B$lTo6(HLiRq7_CevuZ3G_a08M+p0@#)0sCHRk^
zmluJaD7|W9$13#faJ7fAxUh_x>N>0|s|cc`jL<9aAUoGZ#TP)it{>U?Qj<$dOVyVp
z6&som^7TEmwgNjp0VfL2OXU}VUM3Y^t|^2xjfdcCLuy3_Qp&{Y+7-2zT-=QKJR6}`
zi{t`PdUh(vMkM9cQ0>v`u0k(0--!5(GNl%GdrBU*C1n$QVsoR*_oOB_7F)L`qNGrR
zaG(bUgBd!Lkt$D*5~Bt>Dm{H!3H0fiFr}nJpOlQ!gm@HXrXVjT6*;+S$Rhl*(~?n`
zMgXQ}pfW8RwIvns)iq$G!-EB1Cyvto9r1VKh|hx)onBn+3*+)oFIGYUEE0r=J6f@;
z#*9h576+S}@YK*4E;ZP3)7F491{3yHSHoLciWYN?0=!V~0D1^J0p92kE$-+DhQ~%R
zF*b%d0&ihz0!tHP*h>f=nViPWl|y*^#A#e5xGwD9hl_;Wk)=hfEbqZSf^21F1qb+^
zOeGIe2_9M@5V`Jwy~{YTdpA}HPigZfqa*L)F?LD2z^PG82K|_5b7G*W8BW5vRUV6J
zslsqiuVQP((#li`xyY0Y?bgg%ti`7ntC!&~ei5;^0zGYU2g>u?m8L6YWQ>&O3H0RY
zXsz~6>|H*LDvK5CN=p4Gl8Ou}y?iRYT-w`QPSdkGkWc7|)s=F+RNCOI@-C!v{mhC0
zRUcKLt_!*P5P?T%8p9~98$_Ntgmf*HAEB3_bs=5TK|neQ$2LXjCFa*7h0`RGIIXBb
zfnGw6MJd;d&#FX99<6Rl5q6|8Q}bzcbEx>TuyqxDqWF}jq-}}ECh3`xo`-O-7xDmb
zt-*i_+Q~AiJOz51GF0Z}!JL~1lZ>*GmWtx6G~{F^BReMz1todReB|ZjqBu7n+QL$p
zWq?uws@`CMn^teyNsHSR#L-X?XS##9+8x3<zZVBxE-c#Wu!m}{kE(IZP=THGR@@AR
z@iZ;$3r%gfS8v9$Mhkyok<tdPo4^b94<OXnuT)?t&?E2$M~5-V=>iqr&Y8&wl!k|K
zj<)t;Lhorp?<rc?d#T(`EiB;h+?-O*x4gWJy|lanKbh{8*8)Q-!96>duzwGMM?1S`
zb{0Eno0lfWu{1u0g^@vwg?(7)4`RaUK#2Cd-Bv@TXN0b}2;(Ee_<LSSKjC#H;Ja0O
z%aDE=KWDCCZ!JF6xE*?n=LtR6E$CGgH7d<jtR@dkx>nTJcHq##ldzZxy&}S_h$^py
za4RF^iVFP5%k>ZhUX+!0)A|N!b%RuVUSv}d3jESbeG2%bTra0GNW06|RC;;lFfuEt
z{3={XDQ{OQGbD5L9+cJuQDpHWt+Yh}Up#GYa(*45XJu9+F~^MLJTuaZs*sQ+?f6O&
zn_h&N)I4mb+S|SgzDQ%`M5X_&JRvPEE)_19A9c1yWhiKkQ4g)LlJi>VwB@KM$b-J1
z00!xHmYIdJ%uM8^BqKjP73Br_(3O-xS4uc)D`BEqtTI%g)>w@y+T1DvtC^O$kM?$v
zD(^&pFV6IYaop`v+U*^)*5fRd+*DRN7OVBxS!==lUO&D<i~EtU7oV?d#c}Q@R9uXz
zigI{^UFaDeLU(^Z2KxswGBkq0aYAo=1e2U9DsPtHn;gTz=_y><y$6r29LBT9PvhB(
zm+&Ah>-k-~add70d*&Chb8#njE$v#(oqWBd<gUd<EK%9XgMXGK=jX6%Y6^3N*&L5Q
zPw0(=T<CSyQ|Z|;?QTb_)dZW>2#Y~SIEM+mU*d0hT}i)NslZsh#yBs{LD!(S7N2sg
zUXaRD(Ky|^bdf*UZr<-5N_#y`aij}coyDg>ug21fqesrr-rBIPyx51LJQtOYo08B^
zLUy5ySzOSGvXT%AivkMx3d=(%&`51At!=UQHt2bgQtGC{^C7v6wziz=PU}{bo<J`{
z(~bg@PielAR@Q>#!bZj7rWV#K&`Zv@D$q;JF%o*!grBIsa>Qj6BQ~u7JCbv;LmDiv
zLQftIDd3ZbfN$NAh??34v^XLoC<yo}m7%5#(CIWNEzIW^LAd1?K$nw;N@jT)tt_X?
z>w+SfDzucn7FuK@zbFHL9jee!Q;Qk`Pat5gtwL9W4ReIu!C(*<g59|4aO1)HR=n8k
z#Hqq!?5V84vc-V=+S>7$wF!UFJAwDxI&s5l!-!Ukn({KZydDe^Is*fu@<uQ^I)*_4
zZg5hn(?@A<$1p!Nj>U;l93`wyPEF(CJu7(r_!&HM`~)62c@hsEKZ&z@_7P-DSY*!6
zE->e@Fc-=B**VNj&tQIL2GgSIXlM5>%wabnI?>mQvF<SDhx-Y>AbMJA;4UeEyR;al
zf-D#`MbMNL@*MjWrT2Hdt^|0Z^wboU7wMI|7Hjb-#wSA0qd>2`z;O$D_0^rUxei#2
z%{X%SH0o{5SZ62?LPLA2p}nmv>4Ls2jLOn5O7i?DW)|lAkxv*(lNJShlG<+M>$?fD
zFx6Wxat$G5RniI@x{#ssaVq6|w7jNHE|&_9h`miOZ$(N;6VghWkzQg)n!t~0Pik=m
zdU2VR1fGfWx_<<{?S$U8giHl`>j}N}+Y@llwm58wNkmbZ79L+0j3x^-RA(~8lu5?d
z)9a{Si%?!z2puh^A+HeC`6aMq6~LOuX?`(PAeWbyD`V=(fTH>e4XX48R8pB$6Pz{G
zCbTr#&}OZHx4H&F+TM9nEgo(4<2PJA_z%HRyl%ANvdN4KjrDlk@5e=g^PN@?UazXh
zgN@CYGjSi4l?ZnRFiLAXHZX{Z;V}%4jG~Xg8<-x$IDt1s@J&<oElg195n2a_M{(c6
z9z1j86mA|mf-46ODis^&_O9R%!MAH^H|A#MF+Dkr83J;au$!Ko;(j8PA_BcRTG|EL
z<r%_um==AY(~qHm7k#b<xGD>=NNanV=ha+V2vb=;FOYKdc6afL62$RpTD`b##aczi
zTKsE>)$8(?%Jmq5US)}w9HYf0^vVkx6zIv5(NyRiK6DDLEge`_ONiB$1kq651AFBV
z8nuJ4mh}<_UC`zSxD(paTpvofPHq8Jp|%GV)g#EF(o3hU&6W{bbOFN7uK-W1ZMNQr
z!m1$h%>lmeroDBeq`C`*Vs8n)1ggBW;%21K<|gOYBD1uC0JI_@+rVWeB<Jc8mtIC|
zOM9Cl(94T}Pr9Zl6&i^O@YZjQ!`9d&q-EuyE7S|Kr3U5Y70U4Mm6bZ^E43&uDMkfT
zOX%qZdIhDZ&MAT=GY_=|B`{N^X=!~8w7FF}Dn6z~(ohL&wHX$R5seLXu*<VE2vsw;
zvnOU^f2j$dZwui+geLK(xf$mwjW|+e#<g}QP7s>cD)cy+SB#4-t(YP(Jyq41B=9DC
z`Y<&xgoz>A+M!|ejtygQY7Aq9-sISb(v)NufwgaJ9LEW>tMj|?=%Hh{|L8HCpshVJ
zJdU$-yKrXTL9FcDgT*O=Zj^xI`ZE&~+y?<VGJ^5pVeFin#je>IOb+&AnAUofDz)F^
zK_}ttG*!Tsoq|ci`ebJi_7Z+Sg*k+NG5UIfypW`<gjbU)@zjv!DnE~0Xlt<+pIXTE
zCzs*sH7d~4mw0%eyHJtupv7%~N#j71RiM|3Dx)0-S57L!ORej#oyBPFJjSgHm}}mL
zsh0ids+vKoW(f7AeX#4tpwAD|`gRd|RBmNKD!hK=R(2y>8zk_k?zEkXwawD|sqO-b
zy35pd(E7SiU?c!derOwdQAF^i>)H`73Xhpy)JWLL<0h++RcceLZA@A@QV6|-Obsn<
z5q2aNAU3&>z{{o5lXAUuY>r7%o`Wm)c6)3J(y83qI=ra0H7bpib-GGrv=zNp1HGmK
zg)(Y#ZXPNKH%loMTu~{3R|IQrK5DqEzN`!_RC@JXzq-5>HMGSRJppJopw3o<YHJnh
z>+4ZpYePBJp(!H|u9O@s=2qZUXEz?KYQcdTE9MEwi&Rn*rDd4a=rC=zAWVxKu+?Ic
zz?tdo$9OP|>Ha}Xiqccs=#62Bpc|#7oubNHB&<%(FXAlo_{1bm(XQS&co0_)9>4)Y
z^4i!OZcgvSdBX7g-u*beXBo@VlVflYQ-ed?@35jI$7rD^xJ;gXG1E`z`Ft4j`7zw?
zg3D-vpLV_`BLfXtIcVZ{QB_pPU&=z17Z;$nFU;%d7yR!NZ0srofBEx|N%#-{AcIeF
zar9H?qtxKfekLY4644K7R@YIti{4%$`n7rt3E7AGp6pxp6aBur?597KYM4l<^U=%J
z;@^0vml!XpL(A7-EcHTP?4nq6Fk4_KZAGgs0IR8$BF2v8T}KcO^<dp{$6*}zox*AF
zX<Q6n#>wzS>~tN$aMK=iR!zfgnnGP!KTPF)w2pnq)AXXiG>9BsH`2>GmG*ksdfHok
z0GX9ketIuK=t92HOE9{TNAP8qJCJAePzkmpRm-e!C_OZ!OfMzhO6XaTU1md8iItXC
zL+BZhoK=a~lv2bHe6cd@RDue<GzEHFb|hib)_81-Nk(!SZ`@WV8tkpmRvMtI)DwJK
z7-=buoaSd{B0D7oS&2ys{PYEdsLC%uZGOIzjZ}Fp8V#B%%3v!mflXI|I)e^1RC~2m
zQovt>x|$laHrvru--wcoY*;h1;i2l=tF6X#ZYkz$RvZj?aJ9b=vsG1?CL|}@+R)>4
zAnfkIcy~A9GoZj}Ngy>giLpT{GFsfx(Gg5g?afV%V>iJi3h(ID6!s1c;n?&vuC1)#
z#^FOaF+Pd&{bM-SGlHAT2e|woF0UNGsoi_9m+Q@m!XxZP`uZ@);|+$x81L=HSSW-c
zj~C+tJ!c18dJ~!oOJU*XXei-lrlmI()9RO%qP(;SVJbahMX9h5L<wGQAxvhNzaV&`
z!KQ}5N1&v_B^q3xv`psJ{ruhENhx9^REWuYx34cD$GH7H**^NUz)wA<3f@mT=e78^
z7q|Y5G8O3Uxdv024~8N_FHfM?N(eg8THlE}ivu-A2X-wSMsH6))}08S$F+eQxPRy-
z9vpoT*G6yRSkDFQ3!TA)a|OK(OYm9e&}^ANjb$9AmHj9+52MI9h|CI6dLi1_KIE9f
z$f)!ad|m~7VrvC@`Fg58LNANJODt(de2HC|CaqRvNO@jLK@DNrhyraRQu2&Qqs2|l
zF(96*FE*tFaVf=Ae0fxR*#uuYw!|i5YfK8ZY)`^g!Y?)<4OO-ERC{h{bb6{isl<?{
zoatfUv>+!N>4}L*iH${GY8pzjvQU|mL-6IpT2O#`-ta9I<y5y7u*;OTxQXCvrM0!2
zjcBc|LX*V|S3@I0?H#Drlye`csL9Vlkihh2Wn#R^ipw*jc$}8<8g1;ct|0aiIy2!Q
zMwuf$-5BcY#h_TtVJfnbag6p2VO-Q6K{z@(q*&X11H(8nHi6~oam)?%;TQpVWpNj-
z@7arsRDTy{<~d!$>4hbw7Wc^f0uC=OVmF~D!0RXA26(K#a7Y<QV1Np9fYU)Lz41U7
zhFl)FEj5H)IT}kV&{0)`W`hx?(o$4X@s^bqBN*;dRGt9upMstm(JD?N8amOcE;?>s
zE?~NSKN5fc_kVv2QnHRZuO8zkKaoZ?kx*f)Ml>`fZinGo{M(L?f4extKl{hfb3>Q!
zWVXRr(gsI;0FBlT)SBBdKfND=10z^>s^=0O7=H*)&OU)>m!8IxyPn2_JD<SCnFnxe
z<SKTDPh%l;3gNcBXlq=6(L4fu^*A)l0zGZ6zL(JJC-l0JWsu5@0CEh3pT<S#c~C@a
zE8r_Id681qio{Yol9*y|1$t>k(q69?1-fRG8e5TBT#fVsGg1h>g!Br;$&gbr)?QM+
zQiHoAiE1w{mC$4ENL~eBA~aMl9bVdKsf18c553gnYPBdWEI@8%1~TONw^V$&%u<4?
zGA|dF0;)Y)I{}`Hmbin;P^O(06M7qM=&Y%NhicDnt%lR6M-$ahRbCFEnFd=?4!UWD
z!(}BHwOMd!Vh9h?u0AwAfujW8VxSWfT>*>`Ktnw}7?H{Z+SvY~0SpfeQIQQQ&=Wg4
zN((wWG>Da<Q5+f_$7EkO?P?EB?_9zaTG(?7w7Uf03YFl2(J?GgHBNT-V1hZxZAYo@
z`hp>Z3Ai4DubZ$7b#@}`XA*clUN43Ty?!b_uRH_9%+Evkc{pEhFrl)z1lp2P6!W|2
z3JUZhuv7EaHMSZmL<AtwV3PRrKmT+5_{TrS@BQBIDetQ=5=edj``^cZ{jdL3Std|Z
z<B$LNkMW0p_=n0eng8&^5AoZ-{oBg=fBn~gt*oOSOQugf^^`Iyjil6u4^b75Q0dA*
z4t@MC-gx5;<rwO@MFVjSdjIxAdGw<azY0BLv4_BGM`c0#E$G>8Uf8N!m>AiG@v&*F
zJ3DY456?c1=XO1V7nh&IbNimfGY6i-Ge=*-BL|<wrG<xZhPgCw9{v8qXla~>)iwiT
z-89MxzS8PZ6ju+Sz|xPx>ON*K@=QS#nu91ZccH}6r9iLP5>V7$mfl4udXxsrsf7(l
zFBY|DRg_+dp@kN=MzOb&@fj6}PcI|*N@#Hl2){gL4z|UoE704vs`fT*jY9#Ii$G6r
zwm@fyJPJ}C7$g-M73HPK&(1<-Y6>z^l98R7iu{apl;mVVOBHC&&qH%Lx2Gy}5q=)3
zy@6UQ25W0LtwC?K1)~i%4AoiDUQ~dl^fdTskwf`~N`Kvb);b()YsEvfr;m<}<9<SK
zUw23uoM23p9u?jYZ^TitpOX_9Al!xsx)CXRBlM<*MleT(w`X_^D+JzBPannzsv*A@
zlLX&*e^{w;oep(jiC{ZO=uHr+gP|b$2vt#ZJ%n7C>Qc=Bt*oS9AlK0WuhWTus{?_K
zcDS3H(N=3iBhS6TQVnCd1{FocC@1&|seHR>jpdTM9eV0zCZU2ug^Gj<JsA~d-MV$y
zx^*jl>$iSOStjux|KoqeXFvN{6c!dL;8WxCpZ~n_#JU0=Th@{CQ&~^;anC*X;4`23
zjPfA%AN=44%KCE5|MuVh8yXrKlyxNUyz@?Fde>cdDeE6Pbm$i7R6s|c>stKVj(-Vy
z`a&1oOS@90XSewXJvRn>=P)<BgmowTuj2lhNATS8vv_{*GkA92Q+V#sb9m+K=kVmQ
z=W%t{L%6W>Asn5(iG`sH=<Pg=Hped1HZ8(J<!5M^MP<V@H1!jxY?@+Dpwv15UBd`8
zb^XxP_M+GvKv^}du(lm(8V549PGnH^WfV6ftE^d3dO79wD528JDXT>?ftQpMsmu_y
z7oS#2@D(x(n0dFLrz*XTTVj!sQ-BsHt!JH0>6xLCiVP#6XQ+gRb~Gz99Vv+kO6E}Y
zWfOV@*_kMz;?w2kpq3W5iAtxHcGX3j*G*d+;CACSD~7Ah=r`!F)ZB>0hI-8EjW}$m
z!->jT+}rHJ6?;4G4FvJn$S58o^e$7;Ep&Hbv^R)Js=5iPkb$0F^fQO4)COpOM>rkg
zbWmz^hexn9JdV9%lUS4*-L^J#Rn;O;TL-_(f$knZZL1Gcy<yA|bPH6CgM?ilf!ae=
z7NnvJQN@Kt=`jOTcQSmHkHB+xxX|usQ$|sdM_M{tn&D_}LUUa`s;TfyIz2Qck%uLf
z5qf#Kxd>4y$|dzrr5dV*lu&^pDlIlP7HMf|3WQFaIH4>TAPTfLZQ6u8?zjW;#Jb=A
z{oluL{Kjt}m&cTKRp_<yTw`KlaO1`ey#D&@O5fsb+qUrstP=m}KmDg$5R-Y?|J2l!
zl5=x&%CnATUjn_`Lm;@O^8UXOAOC)FCJK5HdmDkCv7{aL+D<gpdTDQ6=;@lm@}2`&
zx9C2C(*u|B=+4LSi~#T8GkEp%D|q(Uvv}<AQ@FPK0YdKqoZbBpR;I6GVdMfW(_y&W
z2hreIMvZ+57W*QM&2uod%){)MhtWO-lYJ7#rg4~>Mxm|kgSM_0S(<jFl(sPK$Rz-?
z3BFtn;a5`6`3Us#G*o{1rd8-^k(eRiD@SZ<5#d)zrI$;4n}%(%X$tTn(2G-snW}BH
zqoKtCtx3k#t5g~(>-0LN7TE+`T52+q65|yBW)Xb3Sy?DxmgVF?FEzXQ1+=V1ipA}u
zl?|Kpm};uW6u~z@08X0>IAm|ex%w7-wQCX|bj{$Mo*6vl=)|i-<9LQj?WxHbTo@U~
zK){7i*oT4sZYsI~3{t%f_ezcGFx49spA0(C+fO^zLu*Ex%XOsob*Ke^W=jA7|MW>j
zK~%pN?e#W{@)#pFJKQ{eFyKaaryIke04;3@<Fv~ms=%&*Ujd#dx=vAcgk7gkfais)
z-35=!gAQjq+FBiO*jv!j)PzPV)<%N6uC`Wrz_~nIq>Rv$EGa5MR#r9w1ZZ@qu!~NK
z=yEc3{>qgrit5_EdpC0Uy~-mw1x^xTSGRB9uBb6tCoeBg$@S~kE1;6cbN<Pn{E4FA
z<eA04^E<zzY$tK^=1tsv_uaQ3DggV@kA8#?KKMX6w!lmvY%-Y?l_>p@Wx2psJvW)E
z^K0>MJ3dzJoC@@ouM&D*1$x>%Dbs6*q1cJWYEgL|s4;XP+&PKEhfZQ$xONUR&iy#i
ze-0PMF5&9b6+E=(K|FQzDLi%ZSv+#=X<R?>7_J<69EazwV`b(##(GYnCwv;M?JKCE
z+OyE|s*2CtGKcE61ynocU}>2~b?X%08-~7q5CsMwa%pc1&72Z?Sqk(Vw7528meiq`
z_BI-NiCH=%W$F~@#ibQ9BhcGSTe~AZ<5!>;lb8mxJhPU{Mn+rF5l{lX3Y|vjol#s|
zL`9K<l;k9&r==2lnaJS{o}ZnA;@muxOE)ztj_T49v{#wotu|q#wF%P>I}Q*qN4;+B
zBP>o)H9g`E;*0GAc*i}87n=PzT&TrUp*}oJh&?hkfnx;MP){d<{as4)kAYskCh(*T
zZ=`1cL*ZV|^Y!2$`Ua`?m;<z=Lm^t${vLERHNt1CMyR?DPFpQ{gH(E*J`8ttVuTjA
zm-8V4FVGo*ohUjVm7SN6^K`i3Zg-=jt%Flhc}@j*t<0u|M#9bttF;DZOBKHt3-nZb
zg@j#UVX=}~Svmg*dJ@sVlZ>{vDoFn3Z~lh+wkj%5Z0+I0hZSY__~VZ&@Bi-a{;pzI
zT`remfdzJAfu--TJcL~opD4wa7XC1%q$o;I_ASTy=%bGm^%thCm)GyV|30d!s_@)%
z&nf$s_2gVE7K;K<Id-(2jV{xRo?nZ9>%qVAD)fw{o?Fn9)EBj(-r|AP<V3Z;okC#(
zCyt%Rx+ZNuIt+stZC%3d&I4Ep9mJ!A-1BE&#0wW+!IS4+#G{;l@brtgeCP?B+WjEr
zht6Ye;$HZCN72x{Qvsi$ZW@)gDO9xz@D@<pzJ!{#d9(%h6MT~}H;*D$=R!_p2f^2Y
ztco_|Xxflf-h%A%CQj>7Xt1Nq+=jFQ3sUk-NXUplFEOJ6JCbQ_3B9CDX~L4NC_SYH
zR|X*17Oyl?)*3Bnb-7`zspfp7J6Z)*Tctry=oM4#WmD-TA|ovwx!HNh&dj0B%|af*
zS4ikdg@?98x}}xEsw+olV;y>#dkL5$ejoPO>T#~agPZOyJl;+_>kQypO9u|s)Z#dA
z)F)=9aixC%hx&TZPn8$!4^qi>6MB8Lu~K<4jFEonhBk=MU@rnxfjyi`P3u@7gy|qv
zUuPEq*9kXmsJqn;Uq>5yq--x3RNC3~dE5&0x_pEj=RN$qUZ#uI-s5z^+fH@Y+76ea
zLz$Olt<5cH<hj?=`rGPiQN!;?Per6N82SBFqL9`$zo1CT+}wQlsTgmMXyB<R5vT}2
zROr3??z>1$O;uEyKuszb#LCvx)F?m{=<V3C175Ee|NDRcZ)K2#haY|zfA9x?pg=D_
zKVPx8O;n@;JyC{g2slOQ$xvFdA6Z|3`O-@--Kr3g^AV*d$CP2W)bmr{`y|*!FOzkm
zVg64=^n3EzqxYeH7i;nVPW(~^9N<^JZ|MSbh3ynBZMV`~=0cTDnya*-*5pQCXckv4
z-Nd@ed`iN6H|*Ll=KT9{YT^Q(IQ9%)IR6S>xbzyHyZSji&3x?QtGNI43%IoMI1WtT
zz{1#d1bxS7ot9xE^vrcLsA`%=m3@Jhb{3}Q8QA^%;2%5zW5WpawgKpDy|lSuWNX`$
z_IvpRU=EdDuBL_4MikNBmYJPMFRVgxt}z0=tV)7Ti<lIuywn0(+<a_H$hcLmw<RVS
zo3<w?&@0jC;h^F%@+PaOtmKVtRGyn*pv97#B%4+?IWY-o$y9o@!#U~M$jiz@K5ukU
z8OmrYC56yx%b?Sgz*t^{dQB;M>+7)W^-%5kaoXL9%ibWa2f{c<Wp$w|fTM0FmL1Kw
zJ~4tDlM`5`8XN5HM7S@AKH67-o{Xw8D8S>ir@tRvLxkQC;W)zO-L$~|E{yww=;twd
z0|9gfyzn!_gzYe2PZNft{A?j-8@k+->IAXREvl}As?G_g-GTO&Hn<2pXOjc2mUeiZ
ze4mQ1Rct?jS64%oM|EVDE_wu>^iwX^YLQP%pGRw-OYmp&2f#<T-X77wlN2i{H@|w5
z|Ih#VKNU->R#XVUqzo=OIayiu+H0>VHM#;Wzdy2n6&6z4D?doGjZ|>hY&Hd)Z@>Mv
zVs~G8<rM{{QrRKpf}$ejwb*1Se>{5hC@x&Mpw#pVXhrGCGTFa+esZoVtYq2kP>Tka
z3O+T|&l$Z<^!gI=T3vSgzSrW{75tmZczXO3@0mY~^4wOGW;Qb^p0ZjLrB|bOqLxgl
zGkY*Gyo@KGcpmFY@@WT)JkV1Pj<|Q@>hi;Q{`_lr;Q~Q-@;N+l;blB?{Y^YZ06t6*
zUOhmyxAYJehAv?+c#8BqjdtfgSgcd1ZkR`neGzre-LN#zak>+2ffX2RBQVh3YOUQU
zuj!)VYiAO8TBoA)atOU#Z8M5Z4rr=f$S#%23?q_rBDJ{5Ih6!p8RFB6sP-bz+mV<V
zf!>Z3LN5{9;*yb-Ukr!KOPgCq=;>iFo2h0vCG_M5%uLTfCRJVrp_fnK6=dX~AS)O7
zIk`&tQ%-&^3Q7x6rYnO+Uyg!;EP}5XjkL%^&COWp4C0tSgd1a%I5#wa0|5_?_Yiut
zvU|H+IMN@&nc+b!QiYB7bP;$V^b%-&J-rI-<e3<Kef{X}>qUq*wwo$%Xk-xmJyd*D
zcai~Lq^wR-fY<N!VU!@8^aU_T>)O@chJZ)PCwx?L1YT<!I$9iPYqB$2lx(SQR#JT}
zl@|5YR@ka)P-8N~tkh~(LPOw{iNY!%@CbeR4l-%`MTz}O5v^#XDa!x)U;j%1m+bd1
z|MD*tCHLR|`+rv|BBWx&O+bq!73fKm5;X*#fAJT8p*+4*%J)R+$+J2HTv_~)5=$#7
zwYQ)9+~*XaNI717e7v%bnoCPdisJjzKmAjsB1Dv)8Y-Y~5BUr#;G&z)Nc>CdOUO2|
z-tFrO{61;9y6pDlYw`bY@OzZ*lDrgm&7MMOHW`vGod=pwp4|d%z8!U@4y9pooyCon
z<+FJ8l{c`iyu1?zT?oZlb~vjCabfpEc<S`4c=FWCxOwPFJaYCWJbUk3c;d<%c<RcV
zc<99QxOw<F?4#0~>OGIC!OQ4$AA+@Z2GzD%R5vZ4$+Zvl?Ym)Wm_nUn9@Wj0sJ4%x
z*yyLaa}jP{<m=spUOR1WD^;I^*@9AYJ9JhL@|8x)rqwhm-O`e>Dv+32in#OwrLA6!
zwAYgwT&4=W#I!8fY3HmBO@x*KMvFzUw$fhD$k#cUSt!iSLtaLv(j~1pU51%TMGjSi
zRBGfCdWD2uQAG(#3BJ6-Tq?d&n9EDyG+Qv@aAMBs!KvXfoERU%>B&JH@9n}Kw*xDE
z0UR3Y#>wFU?56sg><(hECxkwMp3-(qDlPiZ)6;`+PZ;6eurh?xKz}cKLqT*Cc-`F2
zODpRm(0ncz0)$!@Rb1HFjy?jfyS*J<GX9;%h0cz4!mgE2Ylf5d)zR3jWNSk+8fyu;
znmROb+EhzROW4(#t6(MYs;Tg*qzxk>qapChN=s0XpNHI>Yz27f>FMxzJgXPgKOeU%
zo9}$*JNPgE<-h#OO)a)o+Utor`>VhDE5(}1YZVL<fBUz8t1Oc;Jb|T@>xrThW%$D%
z{!oFafJ+6FtSfMpvcrsw3`|Z=-s;UEP!iSo&2N75)@wE7cxof(XlP5QujM$B60)x3
z?aEIL_47w>EBlWA{GSy6()(-iFGu7%l>8WaWjS_~Gb?ghAdQoC1p>Wxn01Z(;qS!N
zi;v)`C!fW-GHn;iD>_j`58P}SzzqWJxr=Y%;iE6$>GN;k$;)rzL4xnxzK3xCiD&T`
zq4(&q7jR+Elh`?Q857-S5Of_tbHf7aMCmomp|)iS4&MRPx9ubVccHeGKqT~Z)*gb-
zhZ3`&*|`e7HWV0~oNq;m$q9WO->1z@EwCsySD=@aEjG7;YOjPAR~jj26MESMUZfUR
zpeOcrM?wnfXyuxyex$t0$eUamDa#FF=Ia6~x}uys6j1S1WM`o)Jr#w?i6~4>Mo~sO
zigU71T$qOvf=^qiLAeY<s;hw2VuFiGucyO>fp#}0JYMVy_;6u#5SOXab~f0s;A+82
ze-LNJMzE9iZH~)_y9qV`yg)<0+Ajf}fUmck>Mk5YI2=T%y9-@iex;3FFYRQ%uM_>j
zkODoI$4zV60YB5()=K5lj1ET&Jgs&FcplvZq2EDu*Vsr!*PuMQvDwyuX3jS;>uc+7
zrHu-&#$-`Oj<HbvT68*?2|PW4r!A4Td4>ET$RYGIk&&K8+nAy}4pP0OekE>OsNVGd
zKla`NPO|el^KQ*dGgVg?Gj>&17d108<1jO7NYZG;Xj!(*6vvTc#}HG%F+1znjxjL@
zyoS&w+2Ce3$!@YqOwac`xB8As?U9q%`4VS;b4Ev(p{now*K=Mt?{iu4b`uO+VIwmk
zW16{-U}&1TPUqCTU?g>E!fq?bwgO4u(lMqL@2hQFKZ~{(?<3PbI)~O5gKO*k%e1|C
zA9H!@`djwHS>bP^@38>AJ1#tqTzuN-e%6+lHM}D=jgj7YKKo<XSUQR~KK(8{{LrIR
zdL6qFZ(c+?wSt+!lX&5^H{vCCz5(|ieGy*w$UE_ZYft0cz6Y>t@-*%}@q9e}(3|n*
z$KQq5Jn%L=c<wd0{oo65bl0Ppn<U%@&k{NpF*1D>^BWIfYWZ&ZT~vGL(K~*EKs$hz
zu60B4HL1H=uAubd?XwJbC%Jx>>TrW_)L6U2h-F6*NcI}xttXN<l%CZSFEmniM+rK=
zQMM=ODaBM>qZ4giy_jEGL2ElhTE6H?Gu6SDy^HhdL^GmPcM+EhDX$YbzYFbQKiVTf
z<iiAED8Mf+ineqM_itlJ)q|d%ZiZVESZAoYlQ7%6PDl|7$LFSS*TO7b!;tl5d-meu
z{3On=%;MhNJ8^*u?W&RsaJ_m*?A}B8QSt5NboZ{E*v)+eyp5G5>|b8R{)J^6SYE?{
z^_^H|7&u22H&0b3pPpuTIx&Xnsd3CsjAD6g6dO~MSecr_^oW$)AjXG>jX`}>W~Zho
zgLKuR4)ph<j|#Py;k{C4brX1kUN527L50^!;ANU(NbvJV0zRWjkDEUNi%au-ob<2A
zv{2-FaZQK_8hQu?1<ji;wFwBB2^Dj@T=QGaTqngR(>b;R$+Y5gnRYV(nfFk<ug<CU
z<~Esm|1xvG;&b23bWHI%p9_Gu?BCW3->Li==(Q4b?a4x|t(vQJXDH+bmoPpwkF&?`
z!s}l1W+UP%$#(BSyk!Z^%}eOX?Z%<GYdB1R9U4=nkJsY%eUD(@?Cn?^IgaZ`AIBRW
zek<Pf)O+yCyMG2xQSIG(`jxo-$cv~<uJJ`XjhXqoF|%|JmUcdbsikX}Sh$K|La%4+
zIAOLQiMC|}da?Wxfj5U>X37X}Bl!s=J7&-`z6;%B8%VZK8r{;OnIZU_x)5mUAo$cn
zBT2=VgjK1g96_UOPYFe98XN?lg8)xsh45(a>_)z|&1fgqN$7RvIiG7q*zH5i?LyMk
zfL6Z^eK9`<n!=oh(ajLAC((p~Y?j|e&H%~C$S^<C9G0ZO=qo$dv9Y#<gA0>5J2i@*
zICvOuJbeattS#W=`~)t|&EoF04XVDQIJZ}!>u&6#tEt7#ojb9yvCgT^v5egW+_B|#
z9G+XjE{5eBi_2J~N}ZaWLGRoI1{bC=LuEHNG=f<IZfazRYHon*MleHlIX=*jG2Y|Y
z@DK($FF^P7^`e{J$@z}1PC~yO9R#=%FLV)hUAZi}vl%`VNy>j^o{1nq*oXZdc-;=T
zoHi^h&hl|m@7to6)qKV_T?q$dg2c_BRJ_hqV`g|-yiMhSdVu97fJ~U0;iAmkM`prh
zE6mJgZ7;sB;_^D}XP&?KK4d2F%;RO6n&&sy6~o-L;%jW(rekl}{~yNh^?UrA@aw+o
z;uGj>Q7;Vn0(vq_7-(ChFj&F-)Ee%*^bqd9=W&dWj}dyEdysD5g_iUh+FCX+-**xR
zCa&P}`XhMFJwJ;FPrd|?o_hrzI`v{adEu3K&9&F#wb!Wz?|2$7rYgMW^viMi*o(1i
z`5uP27qGVe5M~zc#uAm@{QA9^UA@~-dR-$&(K&hq>7Jd0+bUXmHjwC4+1>(@T?<rt
zQ%DecU1K{9@Ckb9t{JMmVM4DHfkY>~(VPK2XRz65uh*b3*BvgTkGB+{=L^TMx^oX-
z`~gCbu<PtXyHZW%^5`Q7!(M+OlyzH?^)#R(;6zWvZ$M9dYWouj45rf<p)#Bv8ph)6
zEY?-tc3?kt9oUCODzv5LIjoHI;@r$Qo?vKt-}DR)4fWy#L3MIs9QQJey-!`s_8u@o
z);&^tJ9l9xRidF73BJ{pMeJE!!KwA#I5NM2^{E*wO-^Huq3Sfl*v{z@^v!a9bQE*_
zLsa2|813&v|3D80hWe=LdKq?iGlU($1pkmjLxboR=(_oNRXa;J8fUk)8X0F=^Eu?G
zU~^P>c>=F3nLsWUMVg>*3i=V0$|LZcjz-K=!G0g~WLn(x?-jk+CWKz~s#oEYpZuh;
zPL&f{e%ssLW`wCGFicZTtH$Xq?xv<%T=l7a&1+tRm%Z#|hQIc;uQhsR=v-=`{0D#V
z2l(5+{ad{0O>Z)qyJ+}JO|?zq|LR!ntEnnHbls1A>|@3{l_$#76h_;B<yU^ixOYLS
z7?A1~r+ZVSh)mE@Utryb?nl?sR0FNQ^rbH~mf!N0w-{xXx<9Sg^<MePR~oM4G&S!_
z+i%&wh4DeubIAX$Uw%G%vy14@Eu)Xqk&ZPCcP>#JtYB$&19x3{5Qh$)L|0EIN_qy4
zBiptY>FiF1K>IN_d<F++@5C$adJ}%(MIXi+?|TQHzV;S8ef_O?l#1@Yeb2)q$6t)+
zU*aGC(Rbj{t3QboyB@;E{GAMOFJk|`$FOV9BiPvcFcvrN#q`RZ7@50(?(t)29o%1l
zT?f@(_ZnJy*Nrm0+1;0s>0jmiB-)18F}{2ZS;pdt)-hzdr_q!ff-l}q__Y&wDY!x@
zG`I`N2jn%4N+EAY4MSY@%$Q@SI6OXue49dC)!eqDy{*j%bDI-M#0b4~z=N#MiL}RI
zc$P}1BTB`WYQ_+i-WZ{`J~fTQgw@GIhjHZKLF{5^y0pH6xv4QMcIC0VBZrd%y|}>8
zbECTry9vJ23=OXxI*4n>jxqE<gd>6;!MJOs;5&J5YDhf4IFIFJf^U&NGl%u@2`me6
zRC7~fLl_#P%A6SDuhayl1iU_~y1`-e_VyVKi`6i<x3>^}cXxLi?bD=cjsC_AW%CSU
zRXdxbV$JgNsJBCkpG}RlQv`cHl|m{SF}@2wLt3xPY4lccak{|pTEN?ayc@tXHxw%x
zRamIctJ>bl1RV9cP@6gRj*vPs6^{U;!O2zYs-`9~wbj#ontg6{uHxERovW&<$^eq!
zr6w!I`{+1btN55pmo6F5Gn=s}34u&7)v@Z@_Q4N+&{!wa{VBPD8b?=FR^k(%_=Ew$
z_rCYNMh&ugJ>83@x}Let<#OS7e&=_L`zyW=bKX2gUQ<)EdBt1y!yBI^Jx=<q;_vD8
zi;q&>O<^E6kD=BD4CWQyjxxNR$Dw`42)yGMpO{9jy%i;WgQwBjc@UY_J;)J!J^4Kt
z?>LA%_C61<A^2W<<&F61yWWa7-TyAU@bqhN|DhM+enRis@fYH*lTYE~?gz0tc>y!S
zr?Jj(_U^l0kIMw%o&%3!V*WBl=PzP%<thed)Gh4@I>rtm)w51l_LW7#Zxz{rorty1
z5rWIyXE!F-P9Q(9h<x81n)1U4r+N@fcEKIV5qN1TzASA1IIJEG7!*XU&5P;=myvQx
zJu^lo3DU(C<l5Btw#&$`r^*b~;wF<xhH^e63Awz=^WqV-QRQXB0c05FwZ&uTAsl81
zmwoe#MyBJ-Cr;tq@#986*V)-=jC6HiDcgemtyvtW;yO1uZX_i*I68#m^Rqa&YXhfu
z?=srg?OR<X?9`aJ(D{F9VF|Ny^BA9<Vi-GxrRgccbqX8f<5*^xIy2ad@j)uM{w_?7
z4jK)VC)JCBinW)YrITv4r?;P>D;1s5w^*sP3ZGBuwAA!N@KC!%xnQS$uF2+RG*R8f
znl$=b3@rqoy5Q-%5b!)sCsm%^XdgE>H&@KIXUtr=x#0&v?+ah}g7Jb2M(TN?c5`MZ
zE0gNcHWPYk$EW>O2K)HqkK?h&9y7X}X}hL6Uh7n|`!|2{H-@^>e$RjY^Ktd+Rin+G
z+5~Ffr=EHWFMjchjeflXtcn(NZZ)6L^`*Q_2<gW{HNED2NtLR}jGES%s#VQlw6EZ)
z>j~nzwgy_4Ys_JN9<?!4ZL{`~Yny(YbgY`}Xj#We9p17Z&5TCL`jc%!@4hRKQQ=LX
zkKpS~PoleJ6#cE^jFc8BGIsI3T`*d6q}y^R85li__Fjga?R${P@5ErwA*@YZ#Pcry
z6yEaW`|*~?ehE+C`wqP7&Nt!y!!N=e`=7v_M_!1_N1l)KhaSg?Jr7}d@^%vSBo6O?
z3@>`>y?F59w_(r0$1pf?)~LCiTD=4P)2GqF@V0&Q5X0J4hO{ea>R96ZGGc9Nw>OJ)
z-wN_WRA~!`80M~{e{v_{RC%#XKa#BqbK4P2v?JWyVf4(fd6hyw2#dpSq?@X@d5l`z
z)~-Hdd|5$HU9D81A>heVR5(GO7h$)Z;Bzy?3!}9uy6G`=G5i}OICe9vJ-)FEXZIh(
z?F?1VQ0*y2(=6dMs_p44&b4>pj^Qy}qf%4S&|||RMti>FON%(Xx{ST_HLAH~PUq><
z3~9%y@J8llFiPN!PN?>E1Pddh1l};_2l_D9-HxHIR!lIAo#^kuR9_#43GIG{==}s=
zhdu`towCT}8M;c%33R5S%L;Za1YL6qE&QzNi6YReGk=t@3sc>xJ)N59g!n!9-7dHZ
zI=Pds_HVQN>_u#Q#>|zQ8-5V<q>=;{DJCf!f$!~af4i|wfO-1qrwtegYEo10eCIoj
zE@}s<sLe7rsWda}G}p<@$^vt{6qXt;4-E|&+q6#m3$mJ5C6W}^Fa6Rl8TTyc=-Osd
z1bugE1g&EQRAqTl_qmUL^rOc9Cr_R<psL2wTTSQR8oJB$8TGjYWPN^xv^tm87k_WJ
z?1!`OeQ)zS)Z;<F_v)hzZzs@`o<LV>44tVF40q2^I4ojpcp9Buy~t7&WINhW(mQ$@
z?E?%ad-kKXdmqNfPh<c3y|{4j3B36FoAJtf--Z`ldK&i}eF;vi+>fIx_Zi{rNrtbx
z7w^FO+!f4^pU33L8LUw0ojvz*>^t~8hJp8DVEhchbrz!wm(V+T60Jj2lzn>%yA?y}
zwT<jUFf)zT;XO$AswvA7=hrd4af*=KgU;b)1A6i70228js=ZDGn%dxx=iv#b;0zSn
z>@_$8g&ccSdes&?V$B&GI(8Cm>QKK4Ju|<a)K4(rgWqLC(Ah}%IgkkYkR|lGb7_or
zcTsH(;K0fXjx#hpv3oC0?cI-~J9lF@mEY3X1ZEjBAMNPD9Rnk{Z+ag0FRb9w<P44s
z4B^1=C{8Rd6M8Fzq%xf>V2+@hpaLA3pT+=T*FQah5h}PrD!XAS$_avPnm*CpiIL7$
z3=?`Ioo(n#x1dL3dxWFtXiA_vn?pWFu(eXn<q5e>(3z?(BULBZC7T(-CXG-&DMd%%
zN!<x{fnd;R+vcMx@=)1%2s#%*=W<BtIexIxll=>!r!qSMMWLe9ml`MwzVCnk`;FtI
zj1;a){m9<@<~JMr>u;p4W>R&^t|FH@l5$fwHxqcK>0DBON@Sq!Zn9zs2}%N~?oH5>
zDJg=KoX)$-A2MxIx$R&6<zE`m(>)8oYD_ItPZQnShd=ybW4Sm~H>c)zQ4DH9PbyLH
zFD}Qs1-&2DRP<8#4%If<DAT+0FnV(13~>p)mN9fOv>oW=JJvIf&UOMX$4E#;KYYhZ
z+6RxKeds7UM~)MIhtNUz4fGwxNH>+u+?_Z=$Q@j`j`hjQSQ@{GRj%8;b}tU@ydTrU
zr!d-o42v_Dv1^0NRESfYpPapdL4t5->H-F)&Y@q42c}M=ZTOG@z1-klBdk@IG(j)h
zzky`uBI2zxXd7C`#?d=4yYm!U`{$AFnnbF76p@x*#L~S8B|D7p)*ESoJEU6NFdE$<
zhPEEmT3rV8{E;T?J$MAYgwPG8#}6!*DpEt-fX4-&)5e#+k@HTY*+yF?g^}KFLVOf!
zvokobrjf&T8Q?p*wiCN&=CL|DfrZW<9PJyzsop`H=kk@gMO>U;!l98d9GsZK>5ZL6
zrsBPX-`c_g=4WOw!H{Wqd<^}g!{{3sLf_D!5&HBoyc#BG$9lRk+C`Puna6Mk)tj2o
zcsyuzdC_7em~9R!Q$L#GQ7ZB*QVdz8TBYO+Cs2|EV=~@k^dpuxMb*Vl4UZ#6?*_q6
zWnt>qs&Lj#*cl4X{*20#3wr+&`$5oC!N0;+HM`L^rFxR;(>*H$6a)m4C!c)MXrA)N
zfBeS=6m<>(N1!n0wV%$V?P|a*Gr^{NP#K@@Tg`N|ED)K>P*bx^Pdz$h3U38HDLyGY
zm5FM*5|XM+Pg!0BWGOw_`Sa(EdwJjc-e)YEW?oBH3|e!YK9`wTL1maGu+8na?1!@g
z@(LAg#v>@`-SG%|@?#VRgj{-*K8()nAlkFN$Yt6Y5veaeBc#p_LT~s8!_H&qo;XQm
zbrfyA`_VzgH!^sfA?qnj(#QHv5C+FFHgtwBFGJbM3mED@iV*^Dbm$~T=(B`f-@plU
z^&Ugt@M&~2+#Q*@#PIe4dZ-@zr_T_2htblrn+oqB!`oE@e4XP5(WDw&LNC>|i2T4R
z)(&04-V@ghrPn^VfLzZMqJ&-|H-LCX;A@93nkM*CMp99mCwv2XYPzDH8FNdkMk8f4
zT~V2y%D?0);BRV*!%OI?iH6VXLcs4qm};+?;cahcJEjS}g{es+>E`aGWvae4>|0pE
z+SnvkhsUwfH;A1*eOSunvEJ2<!xNJ@yS##<bMrW|xP)WMR?L@uH-Wd#bxYGTn5IG;
z+wA|^ugZwc$_KTf>*sw9X}HPuJb$5@(VmDR<#HfpwIXk|qovM@Sgi$Nn+<-S8%?Pu
zG$rX&aO$txL>Jr)$b}<_g&4jDLx==|2>Am>f(0L8r!p@UxGU*`ptn`wDHokw;qAY~
zeh~Cj9;T2|<zq_tsj{-;$B!HP%<xA*wXOn<%HK>=7c|v!3PuWhrN}g|iU+kl)cO9)
zfB7%pr>+D--M<OFVpDS&K~L8$F4NPsRg<b^rLfYEq2P4!;zi>;W_T+zq4$xGe8dQu
zO)Cz$Wdh&Ub%LG%{)RWaVROwb`%z7>`wqUn2|i;5?z!_3bmvFWLD;o551=j8hqg2Y
zPqquKdBRR(=XA6qPcP{jJ7%PQ>Yg}3HFgR^gwy!!c?=8^UcE;!N`OsJwT;n7#xJ0E
z<TS(JLp;eIjB<W%_A>W9fdPWAvxflf<+%vFA%Ss9eQ~Aq&hgx5kRLpR=B{1H^fR38
z+llPJE(|Z8ruti_+FPf}+lj8xU0B(F5xc4OddJt1>7GKSYl5)rM<mrv_;n+gV2B&b
z8sHQ396sgPd!|g!>hv08W~ja1jfxC)M-%kgTG33kp;}6}3h+H{`29X3$DUMFM|-Q0
zb8c>W3QKddSYQ~sNB}PI*MFA4S{fRm4`aHw2P18H%rg|;IWvtT>uWg3kob@)E(ncf
z)y9rb8sYH#@EE43497ZqG2GgPfwn^1LDeX$J6H!p)=qggjZ8d(CIT<*bRcK5qnDqh
z%kD<9-im;w9(G$jd|@BL(XbJ&M(AMyZWD50LN28K#U8KWez#Pe%Sf~!*s1JGu4O4a
ztuqy#fTyX{*}ueo5cJIORt5NK${|ye6or$5fWlF!Ey3@(%Ee^ndcjC#W2Rb@nab&F
zU;CPI?+R}PxndP42&u2Hl%i@@wN4<_vfyW$ny*L&3#`iCqB1<Cv(o-&&z?2zS-{;I
zUdwbnvn`@&TBl=GgRJjCO<-hNcT4I0s7C00?_2!;=JUs|^V-!1sp<yN)-r&6b1&L6
zeO&HAJKwi90gvxmp6_<Pvz5>rJz``)>7Qn(x^Mx*GiNa{PL(!shT-8+42_=Q{CV^e
zetlyMQztn;b_Ro^RCB$DF+}i<jGv>bx}Cr~f?UUbv=M%ji+4ihg}qdHBlDMd{<CNu
zI!w6jN4j^9q4Wg36xCi!&niRPSu}SpP!;aM^4<$r-G4hqX7{3{W86@BS%NRx(u1aK
zAK}-DKs*P(pvUmm;ftZc;ivnIKDhNZ7c#9KMrB4@M|WX_G{y4-U#>NeRJz6JV`{1%
zsh~(WWQ-4?v2q6n`Y@?VhDjye9Ks}hVzA!`L1*}48Rc)>XPmxsX%TzZR<VDB>TYG(
zsI8qNAg2ZfF-b^H^bTN@;qXAd1HI`ydQv%bwd9ab7s}w&d?ZEXnQv)EN2Ueo7(r%l
zL{mcpI$a*jHl;8Yj3HTPfvt20YHQ2k3A*7^4+<Y$U^AV7^SRvcDwE9S)I%lbrn>|<
zP4$`SMmW0(IaA@u1w`e$`<IwrVAHlj?=zqIi~$g-I`xG$Uv!1BQf-2cU|`nXs^UZ0
zR|G}VmH0sSvTN5aBN>81U6t#ZWo9y+SI{#nGE5~_d~as$t(u<5q#$*Dwd*rYFjJZ;
z!C3n$6qj1nwUqBq`>VWB_o91J|77zXHC1R_+^$b1;7aK!R90>CE&I2!g4W|-ncuZB
z!Ig{G`407LdN(@q1RsHy&$l9-<Ga?HMz$k^mi88u^o$-x@5C{}=@<qk3A<SWZ0-Wp
z-UalJoW$VxSxhcmH6S*rE@g9<(J@R_M<DidIzX@u5`shAua|1Ci(zrk$SK2nM^B+o
z@SGqZ2~vg8`N4z84G@eYg5P=pdfh8%YMV34^t#7(V{!L6%xy?@?nZ0htN}cEOZ$ic
zy(pm<Zfb=;l7>5&fWsSw%^iZ(5irKVt+hB1OQel*y^fxKqe4UAQ>~}eUOJn(0X(Uo
zFcp$iQkDv^C7Cq9*VDt$v%iO6SKGH7I@+=r=<Z;s--GeqUQF`^TVm+8PyJ>$c4Ftk
zA{AkwYuIp)`nt9oo%_|_wLP2XA0kr#U8=>XXfVp(=*<jKo8vKh6wL%)jG=bKVMDaO
z9=S#bm7WhBRwp7A)o9$Y9d)%8aCmHRD+{zf13{+G;3Mz^J%P?mz{v%?V(3ZPDGZc1
z5^!=sPn8S;p3bLoKlL2>mzW730Y)LLl$!t{6T}1xh04mdVwx0?+VJT|M{s%3i(X``
zSEFP>Qm*wnm!<-#xlUz`DsvN5iq)LLTiv^AU5mp{neI{0lTuSfhL+9j&H|AMD4kCg
z4_YThSzM+kg(v9hK1^84bq;f#zBjYH(FCrhQm6vxtp>^emd3xCdSZxn`t#ART)Ydp
zG~d-6)m>f+ud~p6t1W|cJKyV$B;xr9d`UM-`o<1X!5zi`!8bB<8sk)Z)617JF@F)m
z3`>WnFA#WlV0z_R0eZ8S7=jXJ1YQ3a)gR~k3BF+}Ks8a(x<Q7tBQv)fQ?=6@pvvnW
zKXC(k-IK>Te}M2~s7vsf(2M1#k?&ujGCYK-6+w6>T6<@Z=^Q8UM$z0lgc#%WNOK2*
zF)F=q3*3G}&qL_B!f12`(V$fFR)(yhCWg0r2<9=gb&>w<3~dP_xte*XHdD|O93s8|
zngU@o8wm-bXk)0E&u5X%r_qv2p*fvEi;@szQs|>Z9--o!A05NO*f^Far?ALSc7m#I
zgs>YRSk>LDHIwH48KVh_3hxtyX<Y5(5>1E@az=R^U!+(lL=U1V5~2qY^bk~*Is|HK
z(Nt?ey4H$hWgWcROVO}>8!UB|2J{>ZfenBWS^}S!%1+QT!RKOVY=TeFGb<}h=ruH2
zVYM~FzPYT*4OMoijIcQM({ovI*_Ao32F7M6D^pXF;$@i>m0UeGREDS8Qkl-7>?z99
zA~mRE)w@CSN;fYR^(TMwC&s=4mkC^RUguIfITK<sb02|H+XXdkR|SJAFp3izn5K1q
z^hbYWY!?Vkxaz)i54soKlU(Q2zM5AJvd(Q@-vpmju0DsZZCdencgubl<M*sQdQy5C
zbe>=NwL9-YXL}Di=qkTUF>1<n@ICI$AlVs%yV-%-U?nQNJ5Vw*bBq8xNI(gCr!Y2m
z7BidB8)sNLHg^eAOLrRG$ppWF&G5FL%5I2IQ?nEkd;*@<jn7{;j_nyefq|(r=omSI
zR>H2G)8_77XdOPtP<Jo#o6t*jE#FXjnVv;-k8bd|J?I)<My`8m6L`ZgDl!PYXmh6l
zy+9-bPap}WHx9cyVnEMo_Zw=@=?P(cY95ob3kB#=b>*nK)K*7jd~t$Dh4}%(BIflY
z8;K(wBJ_eGqzR;SDnU>YPV{6d&d@i86ul#xGAbRW1{k{b^f7epMt`mq?ae8)Qo*$n
za2dia5sM<BhQRbN=L4|_{B$4HoR5GDgaQZ#{6<?jjm{<yQN4Mc1R9~&$WYE*RssKx
z3i!5{!%67bxUR9G7EZSs5f{qg<bs*X=mb79tTol12|U43YEG)oO1D!P$=zN;l3}ux
zj6f_vtKk2+t?<*Tms{`>SoDGys~hvhR~8i0On8`P?qg16CJc(#$%>CH21c=}Gp}!&
zc@Ev9Ij?)Y*?kn>^A9rf+9piRV@(s-w%)6`ulCz|?OXQ48vj;mrzgJwy}Qw&^81bs
zhOv3H^|z5LNd&SUG$!g$-&6%_x(=Q$2TJDFsn8}5Q_&sA=*&s_X-q9%B<!T{t{4ED
zShzyyT{C)E2y#+%LsRFC^^=QNF+w$Hw8<m%#%3>Kap%35U%#7h6!cD`ZRj9+Cr@Bv
z^)hlp2ap}ui){Zc<ob6a-Lr;7`vO(ojG^?B?K5cUoI$o{4w)_iZw$@(5hU|NNM!pF
zXUrZ?bs-ebBN)xX7kWnR6+_SA4x)Ep6iaJ6k!z>a74)d~GK}%nghkL(Bjp&wtU#j;
z%^p9}z93TG0Ad73b0mx;0VUYAq?*vuOeiK|Xm3fPzcq(Gs<|$Ps~rqi^9)m4s1Vap
z?n?!!o(>vkH%tW=qWkGSx`&{1%Y%MHsrh{#W51~K!YS0YGi+5e8kZf`>MGbOD&a1#
zg0Hd${@QvfKnonU2G|@@Xa(R2Vse$wIjHWOo33zHz!MDJgqvA`A$QQd497h5fGR@>
zW%bxl?+vy0JAC-?=EJqs1P0TedHH1mzz<R|#b7Chn|TfspfYXOwqi(_+f21qd~MTi
z1{L#~#n(48xM=ygfT#1DpxU}!+sy0hzPEmE9jEm&9b0^_TbFOy4{7?F$Tx?Ayn4r-
z3}ssx73L|Va)@^|!IN{NzPSc<=_)jK)WbE%cZ6QDwC55=W{+ZY_5>yuRDO33vnyAO
z5O;R%P6K@MndLhOG={7M+wf)>J4(Pw2}<4dk1^yO6*Mnke(f6O*6zX}fj2UD&PX&c
zvTz<NhwexB)QM+7ucdpHu$x7=Wdfnp7(y*02sRHOni)nsOVH&6yFnx}RCz7ERC!&9
zB-(E%JrjCPZ;T;s2#t;a!RMoj381aB54-jqG@vKowRUzP(^_bxn8~&<yktm8wdJX)
zMWo(}M1vhks{^qH8<K7>(&3PidMQPPmWqavBPhF?lju&TkWW$#QpL$zR3=AA8la?N
ziV~y|f=pe!LIN69oGKj5CL>OIqrK5+HlzJQT<=wxn$=3xrpg2=IRem0n0Xr-;phE?
zY)*unZh}nZafPtdwyE^w3Q-lV7nkwrngQO6l7kvGy}CC-Im&yEFpLew;)o`wN~!MD
z*jY7P>dL03CEss)xKw~lNbsK-u9+`zF>o|rT$^cvLohI{7<^lyVw%>Q`<W2gY695S
znp5+9x;I_h1l!i@-0ZqHTPM?c-HZ7dwt~xCHjm%B-GuecAbQLG^;YPHR){e47rA=(
zU1*`gOLVj#)|EhAQ$6ZhYGLW9gSD>?4MTOP9jromUl~efHqK#~&>LSkiJ6rPRCJf9
z(yn4@_uW|7c?~nGcQCvq1nCoVw_{X2EU5AXy#c~cl@&e1RCKDPojOP7_8FD$oubk^
zhT*w0gx+EFPoKop#uc=W9Y#y<PGtHvkfo=(RuIb-pcif#M=&)Ef1=MwAR1~W<XTj5
z(Thm38{tGJ!$ehJWZ@<3yulRJN;6!(CPV4j-C@|M_H51o-H*<mK^#AQ4qbf%1=taK
zf}h6C$hBtB6bm6i=p_h=oXd-BqYEt-JJO9#r0iT52qH&Nv{0p`363@bqlb{_re`)Q
z9^}br2=RaqO;m02u!jmx(Bm{n!1;N8kDWlG0&`GR8nvbZ8v*Cxyw^&&(KT-)(CV!g
zG*SuLoa!y%fS1RHcz;n}z-V`;N&uf+$pWa>R2FA+EsI7Gm(nE6j7pMZ(g;`8_ttPK
zIaT7Mb6K?Ha!6%!NK^S5A^&FMZS~<bpr`lp#=B`Q2zUkPJ=cp{ta=2hVi=moXrHZ6
zkO>T?3Ns;B3`&`fGl5$SLi0M>-#mBmwM{c2SbTl0E52`SFTPjJ7vG0@eRC?)`EK@^
ziq9*%1-ySN)Az_95yDU3`sI7Bp{YF$Zzcq5%7MC!1+{IpsOhOj)j$Qxhe}a4Qi}44
z3Y1K)pTXD?fwXd#kh{dt^eW~Ez198qVP4Q%yMy4nN(Fcklk>M@dj2A&<}PBK3a_B@
z&Ju8E82Vnq%qAE|rq5FKsjToQhGtKrkLu2FhR?a7{l>uGg<9KHG<PnesdXN)tOoif
z_)=7Q$pK@G4CT=aCc5B{wZj)_g*TjIXqbk}-wdZlMx#5u0$z;aY{USblwPC5Zz#Rd
ziCJ8~{~==#aE+Ov5ZCZc=rtz@v@l^tVDyEe7>y**>kgoYo_Dy>>h+@|96<+RqG>0+
zI}|||rzxtqrhpd-hN^Lf$q|<mQ7X0=K^LatQX2X&!{i|615#cNhMBg_iUoz7n+ZF;
zRA~XiFv5MqULQORd97S;BSc-iH$Q=<v1sGGzvdXj=%jjKq>OO1xrIQ>5M-(_QNLTd
z`jM*EO7Jq1ENE(EU1`l5u5N6tot?<)2f+9~3Qc-c)~REU9z9xkrVJ}QTmoJ(^kn7>
zytT$uR{U~HiEV|K%!Gud#oGm;Vqoc9CU}a^Ywn}<=6o>-bzO6RZQlwd^Sq{QT`o4Q
z-+G@W(6*kp80zM6rahNpy=DJ8(_cxhZ&rVWGk08sCly9R!inmn1@#?PRCU*)tgjq9
zhIU}v=ysHhmK1z;H`U#)+cCfU5?1!zNoQzEMYeYEe#~xA<*i)CG<{;>0;U$Y-|`jA
zE?y?|ZZ|3`MyAgZdJJ``?k46gQ0bk);P?p)sj)Mm*Qcg1RC#?<$I&rz5SjknNcV0S
zl^4xQIM6naCWf~HULY|9U#u72s4}XwaVl?xCzORNK-dLZ4B$BgI&a)?JAqfI#314A
zel*y;bPrtK5EfT9@PZe=6g&6qW2oD;`Esj5qZOUq?Pyi)EW_4jhMzgN50j}JhNDRW
zFlH#fyoYKr5<`Eo8GY&l8w?@Ous6fdG3>AtXoQuAuyVT)AiP2hF_o1$Ky?-+^g`TE
z6(vDVgC2&tT&IDFG%A}aGQtd>n+UXIFpN}KP>UOxP{ItIBbgMxmn_mMryz)ORAsI5
zTpoFWjq3$(wK6wanhWGyC(j#@E4b2X31$Y+Hfyhq!d@;LKLi3_tF{YzoTu_Te*8q?
ziMnB0csBIRm%kWJCIn1yY=wdBX6nqetzatNZeCC4E(VWj#p`t3&5kuq+lt|)?OT_(
zp09Yj?$KN>zMrkv*}806@wsl<zuxrieOuo<r+OOy5#PqXi-%DaZ$M2;BO1EhsO)M$
zX-_S-^;e=~pcEy8s<ow05Pqu%@5Iu+t5_!R)(&3B>VfN6+;bP^cil<9%LsGl)}{7T
znQ;a4RDC-SJYaxubpAZX=+n!WFvs-^RE5)v7cnw<5|eZ1Fg8OK$?$f0_njD;AqZzr
zAWe0bXIPu=T0(}NYM(<)<K8w8A<)!E$aTU?;JHIND!e>`YL6GA!ecn=U>Ixn6kuob
zL{M+{8Oa9hUP8~M(b3$fYj83=bRyE!jFpYu_?b8VEcP8x#udHng>haBSy!ln2#7W*
zBm%94N-9nG<ijy^#1m*G>@tL0ma49up<<qDuZc=7O1P<_!w6fs98d_WnpcLi5r&#k
zhQVrvqAVsFPuJ)kwi#}w2v21`QOKDkz;Xmwd%g`Fxq`Rl3RzP!ZS+nR$|@@a*U?+s
zrQYbR1fRTvkn3zWJWEe+dJETQ2sQ&~ZR&pWjB7csN}Wx>wGwu1t=yLHKxccm;hH~s
zWD|PNdN;LDc+AWV#eh;@TZPvOy;XLp=`H_{b2VI7v4?t@$Tv1N{-05qdiwO_>1XY`
z-}^TH3IBjS_Z>iWFV$U-4W(UHZ0ok-wmuuS4cc(qkd4rjTTn90@OJ0nYuI`CZtOkr
z01lpcl+%X{@GTHj3%l>a@}9dfyLJUr%M5FmZZ~{&?{!QpQu(c1!X%fC_I<Nwv9jxK
ztS}@Vrpmi;<>feV^bvya3T9Srr{X(=!Re!D?cYHAzy|WYD}1RJkjzgZqH?@=FWljF
z!YvOC0;Dijp!0+a?hY10TDv<&z(oq6^Jst|rIsT6jNTX?0?%bYuQ`*)?U%3Ojc<Aj
z);D$$be-tv>?TxNp)p?-@+nP|T<R&;!Z5Lk@QXJgEhns+slrkOSBB8a#JH?p8yZk3
zPKePH>fBE^%FhT+LCpl4!bG_$28!KWpH3?zm2%_#5I%CLG9x#hU?$iRdUAE0$+vUg
zwyZHwkvuCEn9orO=Fvh=wYM6;lcx!Em0{)xRJq`0syRdL^1Cpu#cgeQZp(6?9JdpA
zc}|ZVJ^38alRpDJnfW4Vs-6+*YNqydYR`7d|6^V41J%AzZ4Nb+ZB@bl8BI^0pjY_G
z{Vt*R_t^E&UQ`TQP(i?z4%ksb;Fa`q+TXZ2brq(AH7Gf7@;)3mc|Q)Fc?d_(KaK;Z
z9>&I@d$4|xe&AmEeOTCeC*}-zUBdLzMFV;&_gmV14GRo`1-!}m3)s2uKJ46kA64FY
zEKwaw?Tyc#ArwzyaO$W5y^f*X$aXJN<t-9;(+D>YZz?>7w4uVNXYOzo&Oi!wL5}Lq
z=}$1MO&Vo*HdmBz3lMY#g=eIdYV;V;vpN;tdSG+<(AhhHCtmm>yzCXP!uZ6L@zM)+
z8HR@Hj;5MQ72In~TqEPKLc$DTl4?Ol03+y{sa)hO1e{#$_p-?(^63_2H=B19m!k=4
zW?5SCe6brfq;xaui>|H2nJRnJF*&NZEImWO<&<rOz-y(q5rXYXNWf_~!KOaK8v8<u
zwT;W_S13?sxlfJ|)YurE`g{i5cn;p5?o|~TEm>7)Ftk@HDtfXxh2~Tmsg?}VnH-KD
zqtd&nnc#Z{dS(b$9PZr;-~QuGe-~4Uo71h8D*stcPuGo~UOsD5dOIK3hq6H{q31x^
zkPF*K+*EZAlnfDi@?k=6#7VD5$*GHv;Q0B6aQMuFICT0!0`NiXId(sGAEh68fc~HX
zx#gWyee{L3t61NA4_2rE1;ROoxpONFdzY_Zag`zQ@>NXEp2x)OIgC!7!p!pR7??PU
z-qC~T9NtUcV<^3rj(Nlx)&>&;^nL{5-G#6=oP#f#C+w0`a&cqSGaJ>NDl6=qw>mW5
ztq(O;FQKQlc_CPAe(n=0fY0uQKNQEz!ZKd?l9%D~)jQGI)nh0;qg;z2t;)bOeuUgC
z6EiXp3ryUvP0EijXd}#&5Ww^fLP_4%`b<TET);9$UE7412{IFA2JCp8z^EK}nH=Fm
zS2GcXvnC)-fVFkD8SpY)5X;jQ*0xdMwe#36ehv-x)4_A()weLyLTDw?+^kw#sm~~q
zv53*rBNkGvuh+;Z<Mud=#?6+-dMe_2PA#Z!;MCfH2DSHdXf(8&jC-pm%<#gDUaFgg
zOaIwb#Q$q%{wO_H`Qm?8Gn(hg$@6dV-`aiqvAxfV(m^M740}*E8N{|JAGS>se$yV5
z%!KG%H^b#i*Pf4aS09JqckKMbICA!3!tdb%_>RyI-;cGucN28iu_5?v+==x)*RgZ|
zeORQLTOkN{A9w(}sPLB8?!w&SWlYaq!16i)xq8JoZg~0xdPWYQwU?po;4ZZGts&hx
zk3>#M1cnh!4I-TECF~g9232X%VgS!Z&^1!oH98r_5_S!CDm((NzR}NVzyP6*p{};6
z8(N*!13SarRHhvVkDkEY_uh}Cl{Eu)tx{5iN@urPZ4*S@okn?Bh6*Vwm81#)L5KSZ
zR05Qyh6>zVX9CQ~RxF4al?Kmtvxd}ktuvIJQI5Fj3a@!>!v#6IQN~DD*_~8dYgggk
zQp%l2sHHMgWi2Tr6AY<Uz8DM|t~P;224s)RXw>YsH^ODJ!fvUBqrMjQx*FK(s$s3I
zf~BUy=(}4}TaFq_C93Ny`JL6E&QeXUA^hr$_JbND<5_kCc$>b(%vw@Y;pqi4mv8xh
zj7#0=@32+f{Xg}iq{odc|0m9@-g^Yw`W@IY<i)mOFLum?QMwRD*=h)->k({UPh#6z
z65ChfD7o|g7vjqGCvoxGlQ@6J^Ks(BBZlJJcl>_rF`#!JHuhh~o`d&c-{A+b#*lVx
z_uT~D{e<8>*n8+f96t6K4jy?FyZ7CX)s1Te-zAJpoW#)haZE2>z#x@gAC+F4(n)o%
zAd#Oj%JQ1BV~Dj35qN!6dY$lw^Kkf5RCi4U*wIa>X;AMvsyj^uz6N`c>%*KX(Ll&h
zd<}L2Aliby!ExTxDeT^R0NuU)2K1!l1fZ^-ZUb)lu8u+_0D&XV(Qnj%5>`@VCftl=
zsWO2~+vz6c1X43?Rk3UR4L6h?&z~=NR!Wk}Et6-cNrje7DV=*9(P#w0a1h>LfFZq?
z;CI8~a-iNpWhS@TP*-n3ZFManR*%NoI#}!KV5zM|eGS1?O^8)hpuVac4RmXfH&mCw
zLa(bSMQv><YU;~TrD<I`&&lKJs|Z6LTVI2j>B-F}YICyHX+KCg6jyNEvi~?Ue|-Mm
zGkqpLf12=7Q)|i9`%mDuK@Y=PKT5{vgx<F0D9U!nP`*Ej9s6_Gwyzc2_U2LY#EX6s
zk39d?xc8Bl<H|ix;mqa7H`U%l*nj*%>^({qckmt@JVq6G@-gh(cMn!~-G!C)JE`pM
zz{c+DgdM@i=?c}|{L*DiOrOE%<SFzGAI9L=QFQk2MQhInGMy`EZk;FW1iW!1a|P%H
z;$85B@>F>lLa&*yi^1wtGZaBBSO8vw$8Zbh8wfoqJ&R4<&ZzRZpN-IS`(kLxc41_E
z77I&j=<Ode%JS57L+}x_1SA7osw|)jeuitA>&;Z`+Rvy|pfZySoH@cvO+J)?)lh|I
zSiI@FhN<?<>!}T$GX6%RF$4l3csxG1TyBPv4Gbw6T3Q%p)-$}cXr4f$S97`AYDE<x
zS821~@G5$BqaC$YE?3o}s<MXDYE)GhyqeIfsivAE<SbQXXs9&3)bRSs?WnEVjv9JB
zp;uG019f#}sHgH%B7-_AK}%f~rYFYuK<T04Q>6#%Io_M%mr9^BT~@3*Zux(Vo3`~w
zsJKz}e@@eLNyVxkUJ6<D$60;o9Bvy6VcTSw;VeVh#TZJ~n^1ZniSlDvl%DLy_LBqH
zc6<mWZ+Ppw@#?4FjwfFHG_F7NGTeUGlLX)MaFpOXc!G*gWqXJ2#o-eSe@{P;&{K$e
zm4Lg0itZv7R~W+Xy4wKY%=`sR&YZ>g)M-QI^$s1x@PslcGqff2((OxV%1k4knLw^<
zj_c;p(l&umQ<o9enhlQyy+%UM1YYeX@an1bOz0KVT^LS=xGuHXb4TD0B?+w_^bL#=
zc*97wWC^PRuuLc!Q@WI$3B6W9Pr%xQmXz6zAr$jebcCkVS*j(CWJ?R0RrsH3Mxr@^
zcp`2Lw&SNdQ|EoV)4>qYhU(fHRM*rPX^IM7ODJ*Orq@uJRaaLTX@{yRt58LF31Ai6
zR!ZMdSC7($MwHu~D78DVov_<Z*p=CwsIt3IT5CZW)m|yRjEb+E*Q?+;t7;11v(Ssx
zUSl<<HH4d-kh9hkj@3J;5D7#oJS(BsP+JKbmEPRc1Rts*=oOh3%~fXTR2)JHgg;0H
z-m-rmGeKvDvx4CND-{`qCy<{@&nG{o`tYnh?;>{0#j$;f;VTth$wm`O_9n3HC{^B>
zJa*hZfbEwiaNC6$l)U=`AH!RJ?t^&wPyQ?(dGfWm^S&43+|?%ty@#ms?!lpx58>RE
z7hx~M-1R+oVR7ve=9VvDVdWwg)IM+VcB;FJSW-K^WsQ$Pl{cl(_86xp2)(1|8rXwO
z$1;`OEP*#^)YuAmt=+Ro<wp^WcNmS2oxUV&?)Z(6)hNRg^ek?=i{PU|BJdg=>Uj|~
z)ScZGCiG$mMp8(mT9M9nAd@4++PbLFlr@>kPL&u0S|Rk5Vk6ABtbF+bpsDUu3#;y0
zO1~b9Mq!LC5e&lb^BJA?m8GPip`oD2>If|Yri$UD;8g%vh2h2XRfJk)Ip?=}1-DgH
zqPl{|RuES7it=idQoZe{u0yG%5x3P_QSSDkiVCrWpu5fCMwvH&+iWhB5P;il4(uQh
zcksSS8?C6YHlVV;7L`<dwN;f=b^>1ofmaDDy@B^@F`SSi;1u!-gpGury}lZDekPRz
zE=&@70@oJkX;wIsYkuoXWG?H4Qx3f^ed$a1{O3P!G@meELQ@@Ts$n9PVYxW*r>RhM
zE{%Pm<4tAqm9Koo$Y!jm1_@W9Ql)k>yRMn0YpMB6PfyQ{c4~6vyi*@#V}K+jTcE2i
z^w0g=&ut#5k6iSDZ`qHj>2LIHeD^z2dw_EJ?0)GTDBszPihW6x9co62{8$PlrxGZ+
zJ&SF34PwW=v-CBTeBdLW!h1gaN&L(^-jAnV`7;c2Ux7>4UVsB9?lIKfm3v;!bw5S;
zJ&cuISFo^pJC)uQED?UQi{~*$u#HcjBm^&Eo<7F;0VO6KI>6~6^p6}wd#_a9BB3{f
zSb7}c<RIb<Yh$T?L*=;w8Q47uy1JCb4Fy-Iu<%mdN#zN8PSiKJ7#4b|r2JG>QFs{6
zdi`<u0!>s~DXK4qlUYJ6S8z4RRr@?aPf(MJ6ZG1t>{N3r*d?ggf+2;S9wYryL%oFx
zh)Rs=%k*jkPI3XOoWLrhml9IjQC_y4(;bFuzPxOQvAwFYyx=@W$5s(^TV3mG%6SdK
zuBuGn6R<2)Z3I}O6Sr9iGnW^o1f8H);`CyNj{tOf3J`RAvEA##4!+Q3ZWk)3GAjwc
zDuPQ&(n4=w*xSgdRr9q4x71V_02B~yT$URZ9-KPr>uw4?O-$e!UIYhI-4z2v8I+Y^
zQ>oXLDOuZP=2ZFe)PPpwQmC(Qamb~4jfSR>U)wc?g}|s<?%LW~BbB?7d}^Civ~mF6
zci(*m>`Z_Pnu4N!q_nS@_*2T-oR=$E>6_m4rW;U}70=(YA4k)3CZ(rm4p2tM-LJla
zs>2CX9E)Jbi7>XGP2#rm&DeICYVTSG+aByg>Ekmfd)|5hdLQ_e&*07P`XHYF(%0k4
z^%vme1*yDi^t%o4ow@uZb{)JMOFJ)NY2zYRc3vUiZpYO88H`LG!^G?<uDgoGjmsFE
zIAVmd?R~q@Nrl(Zx04~{3R+s{(43nl=tdDr^i$n+(+iasc25#krz$N9HMLTBg55Lj
zhK1h1c`Ko3BlKLJFo73EFq}XnmO@i9i<XSSQuRJ4s63VDWmOqrz^+iiphmq4OWi&%
zRicwh(n{4<XS_TG<;53?Ua1#rN0FBjWM!NhzJtD<``l)@<_pW)jBVBB+~<au8Rsl^
zoxi4>YRqs#uS{Oe^|}wMF``?Y+mAB48|7X<%KZc#;a9?S+q^*n&s|ui0xS(jQ67$<
zoT{b5?M4-qjLHM`nN-1{$_}H#!>sh6*Vn1yqsG{7BNS|wdNdMt0;1;Ur>Beu%6OtS
zO>;MZNB_<yytXPMEh}-Ta`P2~<=yXo_YL65q|gK~Wk%LM8qC|Y*S+p_Mpk027vQ7_
zW$J05bsED%18!@0NTs9KK04Q#GiSc98I9DYdca84>3H=P(Y(@enfIqbdx~o{Z>hZ>
zchlcTT>%RjR{om*-}CeZ)Sd~W`ho`)7v0!#C4?P!MX}=^0`S2&wmpv_?o&f3d&vSy
zKK!xIQt5pHuYKds;l9USj<c5^!Qs>QV*l~GvApLZ<~PoBdI^iW2)o^vvAX9f72jpd
zE}zBJg8JMZ$He?8%&n=n##wX^?=vba<T=7F+qr~v+Z@B#86>llh%&Sd#(D_6E(Buj
z@B}jqX`76YR%Ljm!jrla?23VBv-#k32KnHI;PwQm&|-);rI2dLA(L&RqGMRwYC6MM
zDm+ytn3W6apzja*VYOL#ehaGk@>fv-sq9UoUsdqstl$e&O;FWvzFPC8gxHSlhF5JT
z)V33H+e=VeMioQX^cml7cpX(>4V7QD;hX2HD1VmMQt1_-SIP5K(W_BcS%<os2GrI!
zqK=_#rNf0PLa!!7s1bbfGE?ahe%rz^RK%00iYHJR3Zuf~HK13+7rNfk04r5%qsG>t
zH*#Lfg$k1fetx+X4GfJfoN9f8;KvYoc6wTG{^r9~FmuU-p0Uh*idB&b4h@1Ml_%I~
zupXsTmr5(nlB}%50+qID><fWQ!z5~e9xa<`*)=b4>D*?@b~E+5t}Pe*w66fF@V9uI
zAXuDwO4rtXYm5tXcu9@9d&~YanfWC0`Q!ipC;kEZe&!?^t}-gT%YvHg^{BkpiHZk3
zD1XF-(kDFF{-P*KU)hSX*9=MNeHw3i$9wU}3to+LS0BdVGuN^I#5JrPV7R*T9A?+f
zVtVB?rj}1(V&Md)7EfVvL4ML$9-Lw*JGS5O&Y|7N_pTw=vx;VduDNX%&8>yzDaq^v
z66sMy5(Dr@RB@4mJCHWQ+6H@=prg7|`@2S^doP5vbqr^>0?*5^HV}*>5>;m77Nj$3
zULxQLa&lEVsBABLqZZW|2q+jZQl!@qbb?)jt&t&fJ!+_MYUwJ6E2oMp-F6$w3AYNu
zt?~xsZX@)z(QiWym0fLV2`puFO}Sjhsp<8CAJ^4T_0>}K34S#MUoAskc^yMud5tk&
z0Sm)tO9ewtdVM9;S2e*$NLAN2pu)m1mmzmGVOHx8pe`6joj;690<Sbc5b`^yA@u8-
zlBlKHt73>=&Hd}Fjj+<~gqefqklQ%lNCnuy^>qYYt&6IVUPI+mZFiv7re-k?G}5PM
zW{t<{W@f@qwl$>HvRvV;V50t>pZ)A-4b>)B`JHL%t|sVdKec}pAmz#psHw{16ut^_
z=5b2#ul+S7q|Cgw$_&kLSEG*=t2PZ1{?@m?^@hqcA*}(xb#9$c;jL-KYV?-<xElZN
z%BQypz5l@g_u$(P!FfNUmWOu0`q&OMJdaV$3n*@0REMgU)}i7x4pctPcjpZml)UZd
zKY$m!>~#!b)h6$DtnELK#ocEyvvv~WOUE&~a18yE2QWbI9@&ka(Y>7R;dBrB#`mFf
zcsE-6*OBd6L8@Z`NvS$cW4TGhsqB<YAV%25Q^SZR2N0sl^M^a&@~7eOCK=AgsP5Fu
z!jIYpmEoD?c&^Pdyk~&t^J$d+X2jyk4W|+Q3$?4J8;y|Zs%<rzi)1s1Cz{}4C|gIs
zRa8}?l5i93stLNvx*Akc=~ZxDMO6iV`Abnr7mz9mxN3sSfS15VnAMkWBc!&Yp<)}X
zgr2SZHcoFt17T-4_mS6gU48j>`VQ3beDZoh&H!D(Yc*BPDz9x-xm#5=r!{Dx*Vi!o
zRI?R&0R2D$zop8x2rC0jPJSm;c{MH{s)A8eMVe3-OQL~pA@u6J{O$xUf<t-xT-ImY
z$m7TLf?7R6S3_kebyyV)5u!fS&@FT;0Xwy@U_4BwqPv+D!@&fJOygWgH4A8}gpkS_
z9v(K<X@DM$K_OUu^PAr^Dux7|_rL%B*u8tVF)ui3fFF%Hq3=Kn@N=L0oKemv=t-TK
zBa7)g0-VlihSRs-e!H<viq0Ix?Co!V`;C1~AZqX<?Jxi7pZ@7vu=~$u`oW{~XO~Lv
zf8gYM4j}qe141v|j=(E!gXiUyaJ{kyj-TK=_LF=E-@s_<t&OO8s~;s#z5I2!eD@>R
zd-QfJ?K*+Eb%v`;M=-Ih1fYk}KXm{dLpu%d33{EJZ|z$(x{<YXE}*${9;vPcWO|p4
zX-o+OsOo~L5rmqR8*d1q<RHRL{Rl?8;S06H8*GKsmx7&Pt<@3PgkGU(ih!qd?@AnC
zwW$e+pP_8XP<PQ-3dv-iz-u+sTyf}XG&NC;EM3b|cfoMT82m=fM-;}IZh($zuUuYR
zjU7~8+o{S*8Jd<6a^(bG6{ppNT{U4=vqRzRGcHAED=&eQ&~p%kHiFKkX~lNfDofE=
z!C#w7{@NJr<H`uif~yIMg>Y4KlDe8g`#9C28kGq(Rfen5p^?Ba%GwmBG7PWb%dc>>
zo+{1KU^9letnr3W<tOL}KP$sr8&#ly)#xmwg!d4VzD@VijS<mYE;u|?&H+C|;yB6^
z&8Tipp*r4#nrIaDVV)xt!u0B@k)y2eTUumV*!1rdsVP%&2`E>tTrm`s03`sbmR0Z(
zTvUmmvOJlR5|~O(qoZjENrk{FZxitT?9cwpP;UaC8M3Oqqt2u8YKtMIfq8T;ZPT>`
z$mc!pc{eI8e)ea7_E}|py3c+4_8Hp*)z`oN^*6w6ewJJIV{G~p8NbL>df)pN{ulln
zS3h<H-EVZG{mqrgy|oOfH`gNhmO3=Otpee9l*0e+N_gH^59fOvC^>!cI(8pAi-nCN
zm|i}JvH5)rS$7+Q^Yo7Iroy9|8`_|%+sn{(+2}r&Y@J3TKZ$5&6n?6=U}^}>9W!X@
znnOH4fpB^R{zN}~@m?c)N+3>k7wt5F=LzKD@n;FU=w?`}lu%N6RCffP^5HeA53NJM
z^TN$gPGxtocr%()S!A+Oa;lNtY*R<bnQiP;8{16SN!_{Bn?X%DsHDm&%27szP^R#;
zlyIZ++Fs6Zs=5*t1Wg&CS4ve@Vb;hBb_7`s6<eL*3|k2}3qjXVL3PJ@8^Pz|dKb4l
z2t^l{-8|Mq;5i9Tr>0e<uveAAR=wFyPHpEln~l`ha=xw-);hwDZYTU4bZ1>Pob~*L
zs;`B^Y9ahAXe977{)N4fZl&|PsioKO`?0vZuzUP)_<Z~kaU0`b7y}+r6$b21_z63|
zvoK(ghf2^9^r1e&@Hmk`SvH5Nwl-8|GN?{aEfRkAa1`@v8-=tzo6nbFnlZh|Oywj9
znPH|06NRa&Akw@UO5SzXT?Sy30Yo*sQey9W-}{VnXc$Q;K$##Y;OUs(`mNtGl$$Cd
zq!?fIs#h8N2xRgnpM3JhP>~wuQKkwG9jkkI<ttxljEJVu%>Mk({~Uk&w|{F4ex&U>
zPD;<T;#%Ze_G4;#2EWB0DL!-m^xgl9zr~;8d7nOowRgm@_=|Q-{!%T*-e<$`2W%Ml
zU_E;OOD#G+UWfcA8<6_dMwG1WJ7oZGYH2@4W_Dp<VjVprD`@SVr!N?u@0mxsgP_Ze
z(Z|u;I%#Z=XNTd95oYl&L|O(AO%E7-XpJsqMozpuT!Ac{{*2*jn&LJ*O?9WfvH`+Q
z3eQV-qgH95sA3xVBD;J6_!xQxBT+&uVT7szoO(%^2{O&FwvZUJ-2k4KDn_uYrg{?G
zc2L0yc4dTJrD|RoVwO_LscA{U3(Za{3B5`}OI}6QRz--_5^e_cIIr?M8^c;VUBGh_
zeqKV*$N2y~Oi;#3w<FGT#(CWsLF(r@T|Bpgs?WaJ9!{WBuZKoLt$_+uQ(JvC99-wN
zRKwR$15bSwT=o2Avev<E<*yUL=Vo~7wJZCw8&10iPB*_Jjf3HE8DMjJ91P<*@ADwU
zFjeX0BW^$EHLPU7a191zYvPYgrU^T;Y1~HO-PYBC?d^F~r8BVdhs76c!p7>J?}M8O
zJWT{c)!r6|wX&z5e%b(uz<2%nbwh2b>4~OhIN99XY_zMBI+GGpIIGGF!OJ|KKq^R?
zHLPZtp7O&<(W$KPwXc0GUh|sQV10evSf}%7&~3BQVq#+AM$ND)X{2)1d_`ff#<5j1
z7@besHRi5NVBPweZrP8aNsWJ7&{L{`f56}2Pw?hnzY`DqQXlU4U<=NCB#1*F_hRp-
zJlOpy7uG-P#PXM%nEwqYroLiB$>Ppqm|Uc~n_NTh=rX#77SY->gIw1%G98m>X(QzF
zqll-K6rc~GL@y%E{ahZPife~6NSKAP@Wt9T)4Eg18Ts%8x)#{I&1iHd47a+P(BO!{
zVvi7Ve$-febODblk0EVcV*z+h0WTCrR6`PG(wh|$QgzBkLddDjodLc=#f5rGgc(jb
z+-ksVF+$if!fiVtBxP4YRaefJLeQ(`dNnJl<Tld_O;8HWPzuh_R<I-BRE}q5*lH)Z
z+zd%Q)n#y3?SQvx8~l~q5ad1)?%%}Yk~~KX&!3_<(L;ouhY)o#%ylr7wbfO?N|-ee
zYF4$2Bk+_Mz(sdkY6!Xl><r)$e!j+f0<acAhUh`6JddRje!JU%p9cT&Q_(e1#U-fl
zLVg!~ekXixIw2V5_ZTJU!oCoqfe7Nkh%pqUJKF?XXBu^#?WpPQK~;Y*D!V&T%kS8l
zYJrE~Ti>~_z`RY+8I;?UeFpG0R~I)5mNHY>odC4BxM-{^4k@MV6n0DXNqMRER!|eH
ze)*Sw+1N*FQQ(nDk*QozO7S;;^EZvg&H|<&C0Lr_tN^QKC?`&wxDlG_z9uIpjqL)T
zsRE_?l<ZU{xaqt)R^e^2!n;+z_hWAQJL}o{?zb6{d<*}CKf^D6^iI6&ZG(8_2Yc~~
z&vfC1Uunf-zmdiRUrOPQ&&6>5i$R?IE&A8_LpQyA07Fyj=p39!d;c8TdS{VtA4jTn
z6v^B$VlDlITrYx6T}BoZZ-iju)I+#AsMs0_H9O%Z_&I}V19XL?0Ve3;XmG|*Ptesh
zhEUTGKy|%$(>-)I!ADp%x?yFw<_UxmiZvmoWSrToq2ScM&MdDp(>bXFe=gU`7oi0K
zm1j{2RWS4^EmNq<P?jNWDPgCVLoZ3$4u-CL;nZ%fLaLAcOuM^+7n+xBHZ>v0Y6(1}
z@<Qcx1ea65BlrY8PxW^A=z*&3h!BWzhO-HV$IVscNYdl<Adm4d#C1{SISD*FVb(~n
zH8xbkYGp`EcU!99Bk+6;^@Lv?*ID3kJK$`z5`0$pYU|Ntb0Wsi7p%7+PQ{{>^a7hg
z*f;?w@P(=JLO~B=AwQDg5aJrZ+^=->5hNJ0Ch5_56rOAh&bDTF_#JxN`8VzAhJ)X+
zwIh$ZR>Clw#Ps2PMkj8=L^wsTo1`<L_Z_w4+cXmrYK!-t_q@mGNl^?AGX(zdhd+#8
z_=R6El${jWFaF{$;vMgJhXJ@^)3Jh*dP-;?RaD46^q~(K%POOL=R4nNT<>Ea`<O9s
zxCt#&=_$N5p{dFbRhp>6L}uQPOy!9m{NM*SmkMg{mi;F%JzsnVH7w9~@fUdSd#>Oo
zf1(d>d3zt;`N2NC?GxR2*B1uxp5LCryT37mxBl)pp8nk-y!v;0P%<>N!Ekno;i}Z#
z6f&I?NahK<bRWXaJqR>)7%EPXbB7CAjpcSf!RAe(L6s360*>=G)y5KZ4bC{~Y)a=I
zp|UHu#m;cHF=!|}!A>D<b-fGK7N-%`TAe<Iy<sEGdor!8zzkjaVw(M74YkMRLVBlm
zw6tUpjl>zYcu-&8fXb>Gl$BRf-IZfoDMMJoPS7hip~qkOa(Y!|g`xZ^IWMo`3t@Vp
zJ)OMJ&aRd!O|`ZbPOWr-Pl`_=u8Y8PS8p>Ep9#Jg0cSYj7$N8aoJtwG7}h!oJO=^i
zpn`KW)-Xh^g2z^c03jG)nCsK<k5ocdjgL!pYmZX720{o?r3KwSW27{{%MQQW4xh)4
z0K<5Vw-KkJ6Z9e;mjS+*--~1@fM$knsc;ynXw-0x%bSQt5NILz(sB4y(bJMeESpCt
zn}e%0gT}TN)VC)wbz&cG4E6II&?{&v{pc3Dl$lnnm;^4fhlKVkt|Zu6VPJ;0=K0KX
zYT1Ndap+mBB5&3WO(2z;RMudH(kd^Mnb$S%QvlSu;?TDE^W3r@U(;XmTYMtF_r1Ts
zN8Wu6Z+XQqe(rTW_{BH1;pg9(!+YM_j*om~1Rwv*3_kXid3@}f%lO0}ucM@6fa<Pq
z7TL~8G`EhRDLX*ebsM3p2|KAcsX8e-n^YX(rcl+wP*re~TO3it1-e=)I=LWMZwt|b
zoG+B$33yd?E~+~!JT*&Erek*i{%{;ksSHwl2?ab2USk4J-o|BRJ5h@DWHQAUKLDH6
zL5QhaQU!L<OUqR$LA6&=g&pM;gkLGP6Kn<msQ@d~vA?>85aiV8^IGhcs39C{3B}sV
zaw9#Hh0_MZ%h1TFjZ?W@;cAVP-wp$QUQPp4e<1=dOvr`kL58zlsX|VTkd?r5S*i{9
zSP3~>C4$yU#0k7eods@(8<t26RUK`p85u%Ne?RKFdf@BnM{O*QihvjPR1|e#CoEz9
zNQAsp)Lw)=UNlqHHTwfd`h7_FJcg&jel$mdXo*FP;VSd-81fpUH`Rnhwi%&x5|MNY
zE$NKW-#D2~Ba-L$+M2}5@qG*#^y2fe(qxkX=IrK{XEpkj2_B}3+NzjLm<d)URQ|a#
zw41?4`<urUgHL9{%bb_#9CGtsi=n&~mZtrnd%9&m9;UzUclf`5GO~&OH9qtHNAOFp
zUBHK5KaP*QX#gMo*)DwOZ5{ZPxAo$q@94)z-!p_y{>lVC|G7n!2zHr{2_u}1we%sF
zkg{tf^xBN6Cz2!Z1UsrYFIAip0=QxZ@J#3#fD_zQ8_Q{}RVmyHpexkOO4Ydy*j3g#
z>5d|G7eScnPOviutr6H-1vx4@s=7j2b@id`FjAR^7^XQ0h}s$pfmcWMR*mgD2snCa
zMK#ZDAxs-lR$YtjDkv8OsRBzmE#tJD>jlRu&R16!yrzm_5SMGJs!>~2g*r8rsHUrV
z2_dMqbZSmwmvi2sDb<_G>72Dxb95g87m&K+x&T92KiyMT4tG7L3}@Y{pintoLoIxb
zweV8q`5E3OEDebByKuMysBg)kVt5qWCZ|x+Ka7gO2{aB*VOuJP(p(0v(S9^^QDt{!
zV9O=pOhg&d3V1<e8RF-HAv)*7gk3a%RLqZ-rVz4C5p*=g(ACt0_Cx}$sb-|p1#hR~
zYimg%!}(-}Ak4(Ead@{8HV{*rk5>V7LP25a*=7R9w5@>o=OCi(f~C2ijuWVg0ksuy
z+P)QXCM0$JV(6J>&TD!zD9g59=a&6=n*Ln+ixr-jzrknT_bA@;nnir%r>F6Ww~XV{
zKR<>~|J(>Z`}Q&X+S|wR$+r*TV{hxjN8irywnb%kS#|#JMTnuSKiX<kLinS31mdmm
zMzV1DsqEY_!j6hg$pD-RT{S3=(z#3VQSI3UKEaNPPN1_Y>1Uzsox<3vI;SzMwYs^F
z8xC&}euk$_sWg?H!dN8%P*!7QNKrZ`jR`^MaeapBF5Z-cmx@O1)G7$7GJ>nTf)JyZ
zmJ@tjE>(^>0<YfTM0vdhWp(wa<Z_kO#^nYsTME<aI#klD={42$n_g!m=d7!#D>M$K
zn*g+O-d0;%XhTOgI_E1Z2^E~eRw+3z!&IN)nx|9Qc`T}ss3zp<;HL|C!G?PH?G`xP
z^|0A%5wp7xvHSV`MqqF2N8{WoN+*|4lBe1mT0&L#6mHA)qiT2(jkEKxO-;cuK7!c7
z1i}Lyh%_e{&PI^+1&tvqb3PyP;Q(^U5L%J}q?-xCWEfpkdVR4bdeZR0L^JxDljtVw
z+6cQWA)3lk1v2#Ab5PJDgp7YGHz~rOX6WDjb^#!!vMYwm&Fp{w@Bh6y_Y7FfV~Wez
zZe}KsG!+<4IGV>l7r=^5>rCHjy8hO4+_E2Y)1Q%Ecmn<gpMKvXc=t~%;=^y4!AE~~
z0-yNBDSYA=#_`#AP2<<!IfKu<a}u9==O{k&bE7DUxAY^N=rTI~YcOrO5_x*V83He3
zj7TQn)e>m6Hb3D;u&IwMA*ZY;X8ya{Mn9?;vP#hjZbtZO(bUCdr!iHZTAMovr!Q=%
zyJRMd9HC}_ZnGJRQDI?JR1~16VG^U!I04~gXj*5$ZhI*eTv=hdy{r=3=sN_$+5+S%
z8*R9apxQz3RXbd$b2F@_I+WMCyr>~)qzJ327Ui|-@5*Jvxm;IM4+{Y)0LmN8zN>_&
zywSpWOD(+ycDi%3_SIcqP;&yEmm#d%nAXB?t4E;Gf&hURvNj-SZ-m=tgUw+<)agOU
zL6zqY!`U?i-`ZZ(&aI=QeGKl|{n*h#rPnoqs%gS-Yz{T_>i!`FrbprJ$sv%4ArX$C
zEfg&bL>>vFlgh3&MdjB*;L+P!xQ%e^OSWL3Ig7z$7Q@LD1_{01Yzpmpg0C&b?>~iH
zHi;vL_whmcwt+j9nS6^+*A1<v8Mz*~n?OtJ<mNiT#N6*@z>=B!YMTiqb3a+}zNVSS
z6e}|wW5UZkek<s<o~!se=Dx-2ZrP8i>F-D{B)<4x_|kuQ5+C`Q-T36&R`Ka~&Ew-g
zKY>sF;v_!*?nV5@d)M%#_bhRKmd+?jx$LCw1iL~rlN6zsG)5w`dE*3L)KGZ!4u-2#
zB(*jwIKhvKPT*69V@<0q9#qyjP*v|R%IZ`>QElNomo*lx2JCTq)b1{cSTapjkt6UJ
z!gh8Vpfggl(_33L=#EDEYB$D<bx^gbqC%>yoZ)LJ0cR?^?Ofi*>1`^*Bb<!P!h~Eo
zrzN!nB&W6NuulbK2}aOBFjNpS<&6%M*EXPvu&b%H7CGTpXGL9oBd3~gKm(O!9j6wH
zmGg~8zgRl~tWNrBa$zL^jha@v8Wg*&TqXoP1fASxBkb%}gb2E@&4wuNDPp%H<Z&S6
zvLniS38~haKLXC49@I~aqk4J<r9Go)7+b=Q-YHa$FXOhc1=KI@MfKn!?1MA#^!30;
zNQDz|wD5cCpla@k#Lyj&qB9XeJE7M`;I+4e=`nO>ThNu~4@tg*Ak3jBpF?jx&2=s4
zY0sjwHI4RcGrDtWoH$76DKjnUCiH0x0-?k}hj>8$-WH{&^0C?3Sxixhs>#a2!UE>z
z=CHK1WUMcqs;k%=-}pwOE#6khY1ssm=GBl`wY6&Jr+kO%m7z3X>ba3%sIFRCrIk`&
zUfs)coky<9jYl7S)M#3x<}c>hu)$!^$i&ju*JoVk1uuBPjnG=>FwavA!{YVDQ*&OX
z{dHZ_wkq6P_QRUKQ~fOp&nWO$_{xVK$EV)155NAitN8r87V!D^&f)X#o5z=bX_>x)
zFT8smpL_QVzVMzIl=!6X)UM7*;@NDdHzo2k2IbM{V=jiX1YSLXS7-B5(Rt`zBXl*r
zhQL$!D&PrpDzB>{=xQ1qu-II%xqR>hB1X-u#-q)&5_Vl(Mn#2sQ3!Y%IjpNo<FU3P
z-I79x54CzBsI0AmP%Edh+p(h@+vTdbsI110>KbgTtU^fzm0lg8XXo`@Zd5v5*iInT
z_yZ_&QgQl&Xk=*Tpwg)i$FR-Phzf$ON}#Lbeumo&udTPE&O%t$Yu<)>eui2uTkRg>
zbJR9)dm}+C@X6J$RgHt`PF~aNbim`V!A;P4=w3%7d<0&QpbPW<B8C%s+$Z9+A;IIK
zZV$Y%FkEd}G!Ay7adZf=#brbmR?#>y2kZPAD(6>WS=kHA#5(+hVUP+rmM)B~k>O|S
zq<0g3U5O~VQuH)cAH7?^%Qf@6&ZDKL8_7Yw@FOEg_IDxMr)<(W0*?UX_uiAwqL=fh
z4<F>4$%n~k5>SAiO6d%|>3!96_8o5HOKmDSbsJL;3$txp@ziv+$x}N$&8sPq^5Cfn
zi_T+0#)O2XYCt?bK7OM&hj~8r;xO0ARWq;N8%jwfD-NYi5WeeO?=srtX@50gd9HiZ
zva-1dwlaM--HTZZEmN7H2`=6DR@JHNX<hMWDK>4tWj~7X2kqPcfbV_Bd@BDEU;p?E
z@oPW3A76au27Z&u?u#E-#Fsw2gx~(q2EO{io%pQ}uHnn?U&NQ+H-{3Bk)l0y19%Em
z8ysOn!8K?=9+lHk-3fLzR)&aFaZ+*`q(>pFV5f?S$~vkn0gsT7vNQV5Iy`WC0|-!s
z#G6};vb$V6m7U7&HpiLB7!8f7>{M|<05(yncs&e-Yif*UBx+w*UaBr+70;@r-A)j0
zuPXSq>S~lXP?=EyGz1uW(@Utp9El_<Jw8|&9$R=1)>r}!(I)J$@_q@lnoSqz<Wi+d
zRw`)MS_@OvF6*;vj9pi#33U!1sv5c7%^v}Jt-a8`&_TBmh8~xP?l!uD$-Qb*M^~G=
zklNER#13<P$Yn#!Ye%!ki6*MPK!E$ld~i~kxpE2k2p|`K3|xZ)2(GTcvv&^~H+G?J
zb{Y18F*LPgk&8DW-;_XGQxe??sx&H}u9hbBWSh{NiJ>nOM}H=XF2Xc5(1Y;g7`!V>
z@a<WHdwmM=<#FT&`q0I{$zVq}`rA6uO@)2(=utjPd{I?B#s^HVvKoyTuf103edo0J
zW&S_^=l>W`D~5+)q<M2HFNT#|05cm5n;;Psl!i)8c(lLHYaS<PnxHbzt7V->^XjEx
znhB~;e)5w>_cil)P30y$i_a<V?(V(;yJA2USF9AnPG*MH#rv41bD1D4hUYE&VNK78
zzHdF5n*KR{=QA(GXWnrDpZkTKRDC<~#g8(i{n#3Q`=fjC^<UYCZ~V$0eDfna@y!pf
z5PE?W-2P^`7}7cfJ!5PNGq;`6JxSdZnwJP_RfL1wSgxnKQ%FlULRkTiK&j)@?()OS
zkkU|h=`7(^Xm=-9?W`1@pl38Z=5neziGW4`QwEfZa-*@ZdQnuAmZ74ooWIT$2KbB$
z4648FT;5(u7x-!l(5oR>>KU@s`TW>sWw=S`)doUvO5v$vek6e!Ul?VCRuvUnjfKGD
zv{tYt>}pKl8ETeJWvh_BhTiCp!xe2oohM8s$xuH?00!d(Ekk@?giz!U1w(iT)u7YG
zaFww2cr^9G$9X?J$S^xZS2L54*NK?lft1gMq}PG4*9O1e25*?4jB<NZ7|w72o?HuD
z-R-as51^g^Y#bValObwGpd-w?G;L0ymp>}K4B2`ZqW0#S(MRwNayrOxwY{?qO_QSt
z?plNG$U)Q{+Yj5Zod_RTMP_mm?ft{(85%<;RbXbY4~Ng5DLhwtr#aQEpm(Ez2UmP=
zQ|Za{qW|Sz{-ps9xj>>WUm8AhU|_&V6D5#nUTNE<v;-XOE1->yjTz@DhRor^hi~+{
z(0&3_7o!6u8&&xE_~VZo{b!Z_-Gr4~{c{z@76Vz=FWyJ6)cu@4f8I!taPi_r9655t
z$o&_MMsfD++07#h_aso6P&2c{$V@e7!q2>S6LRMAE&I1NegFLKjfPeHyL|gk@ZW#)
zb@;7cz8zovl@s{wPaMLRKfM=U`RrcKAHdf>b^zb}_&)r>ukOPieQXa(T)qU=SRqZj
z-5sMkQ>ymBjf9>BMMp?b1ywTqD|REC74WF?Y7`a{c#TdE&*evuiYnfmMmi_J6<}wy
zyHoj{YG*lBvlNB4Dx2}B3t2URwqtuCpPi;f(9?rjsz%2GA4A&G%1V@SfB6pnitnhc
zL8(#^SsJjNDo?rMYJCBt49^}*!Wt&D5^0n%Of92|s}%HZxPYy2oK7`c!|=Vv&amA<
zmCNwHir+<@uZibqAvBv%?N7jx$f3R|i`p<1ay&yoHKWlJfrE-rWtT3WJjfrE5Imgs
z`NQz5p>ZI}&^=7|A{y`@Nzk_h{AdpOkcjvZjnU&lG$&(-MuJEt;zm0AU^Ipxm0c>=
zjuau@$xya07BfKCmukUaDvg1b7W5O6eXaC-5`*nrr@_K|x)5EQN9g2n)So+xZAT8l
zdSo90dv_u}yMWa6Jerr65no$GczqVT?z_|Y4ZTrpLda=(GXNC{jJaY{`JDbnreeAo
z$TTl#yyPV>d6wc*XsWC$rl~=(z-TH*?fbw358MEcX*%DhKJ_W1T+Rfe_BRs<2u4zj
z0<G>%=hSuqQ)VhlnUtzbp{qVi@ix=UrZ_U4+k~BLYf~59n_Q;-bUsrB79V%Zei+j;
zrRp`6IP359SNNMhcqjhkOON7@zj!yk@#|;sJHLGv-~8$s{O*^|;&*@T6n_6R$MA=r
zK8*kVtNT%+ab@M^fZ76{%IVB;X3Zui3STQ~ZJXX`cqQkn=+y+ELRy=XiYCmLm7!L1
zCP%ob`xwJmDLhqB5OPv>#)xEuo-!G$K-x-Grk)aIgq?t=E@FaSg#kS(J_C5lkU|##
z2|ObQo?P9G2)y#TI+WJd8fAOiREc4=(d{T_$Z4UvtM^io1!CAHU^g;MC!lL@ctPRa
z03Kmq;|NkY@;E}ShCc{3eyYY`3YApEwNb*ZxgFKXJgTB;*pjUV_!@k1IQ&g;QWZM=
zQMd>=xjPtxHx!32+{92nVTAiZDhdry(-aINMHQN%YD*DhafbKt1OXTDAsz~#J()y4
z97RX03GIYlC&8AB$BeSOzLdh)6#CLF7)fO?%8+*`m%%`5nxQH|*p)+XXFD=fFp>2&
z1gUIlZ@+?)BgbK32y5TH8~)Xui0#>j;GunRAKVGg!8NQtdat4EjOR)3Cz+**%JvG4
zR~Y&kNgSV1H|p_F3^L`!+p2U-XbB!>{yTw9J!VvXr}YAYx`q{p!Ge}q#%A`;D2B`v
zPdsr0wz`LDWqzjVe1e|3qG_A<QOK-&H34LT$%LF)MrkgafHIf$nM|Om>`+;q)s0PQ
zv}9Z1r+ZL(Ex}n~d@<Z_*$;2}tC_AQEC2IrZ^K`H;R*chZ#{~?`t9rZ%ip_(zxr?2
z@xOlWKK#|!uj4O%`!fFgE9dcNpF4pPQ_&UDv_Av7LX(qc%Iqp?ilA3RaMY+GKxJZf
zG}t_F5h|f*63KLKv)x@M;a1e{uC0(XAlI5R2Dpy$#dT8M>47X0%=nO(ZMzK>%2uow
zf#Hwfr><f~!T^=;$qC2uDnf5FA*gcWRTFedxLHMoSJ^;-5_sFH>rhFcRZzuMGwiJ5
zJ(e>hFH@~D=W7}nh8ymrE2I_h6w)%BH-HyHjcTq5y7~~q+HeNdAu2^duc-}HEgdLt
zZbMltgStd3tW7yX4K@ana0HWZhNz%JO>hx(Zh}tk;rHW>H5;A!y;Q;8rj(HsH5o}G
z70DpM@HrB1L6}M|-okZMaBayH+6cWaD!tZd80qE+(&-3t*(loc33Rlk(8KR@tT}_}
zmORFDISjRB2)r!%dRo!b*NrSyS9sShcut+8(z_GoCoiG#%mvi$KM4EbBM9Gq9?rAJ
zVLi4N-b1^v^57=)^hD`h=TyI|3MN#wabqRTgo4zO2`Ra<oS0UuYy=B|L+ZYGfA!Zj
zP5pNTC;?H^bUJN7YOB&SL8LJ*w4bTuRD*2ptK5A0Q8iVgLfvB9dT+(c<~fwQN(orM
z@r`fbBOm#Qk)2tgyV<W-=M}gW9;+X)QejC=>O0VJ3e(N&D}4UI2R>k&_m=%|rYBEN
zp^BW1F5v(4H~8j<UXHK5<97V{hp*u8K6f|%#}_Z*ufKX1{_YzO;O~CtA^gqn-iQD7
zyLaQSzHt>L#jq;|o>3cn19%0{J>yh)8o;K$k+5_bAzYL%a9ZP3Xncxxc_H1C!q}`r
ztd1Nq?ODWH;#56e)Y}+>Zl-@yCKP2rF%;f5Dm;4S_8rE6JX$a48D3G2^2&mnLo8~z
z!>XDpo~y=i!LOPju6kwI+&*J`ScS56RH2nrbQX62<rX`^N60bMt=;rGyBl?cVZ8<=
zVu)Yoh@#dRM`I`lTdW-o@itTwPNh_G)eLQGGM%U-_{!orL-jQ@<xxxEHHMnuh!A?w
z1Y8X9-BfUzx(GX4QwuE3S=6QSs86@Ul4^&&xsw3wLYUJi&lBbPXs!d{TpQw<R;2j7
zwly^~bj_eMnLvu)U#6o4Ep16E=@g-tCiK!6PG>QZ&0~x|DuUi{cPsjP+R!`Di!MSS
zy|#hq{-f|6zX0!<D+myJ4g09>81g3Xz61V?r{O%h7x9C8u;>2!jM~~8(4*<~7x<|0
zIQMKg2o_RLCaCmstJc-rR%`;5pqI^Njs4_OX99&x=aq8Odb4suWxmDN(>cs#olA)c
zbZzr|QkkaWQVzXB(MYLrJ~fMy72lW6r)6{9)~N;|((y`+s&ff;YD0SR<Vn2fMK3bO
z_As-$s7s$JO>`}(S!Ih>Ga2o3=bd*N3Rb|pWk0;>uVO&YT=>Vo$LD|YVSMP$MSSwX
z1^m}n?Zuz|{2Bb!hcDyrKY1Vi?-w4%|M}{}_`BbK6o2<e4^!#cyi{r$mx92Px>Lhq
zdbOZO;8oSy=>=t1R#}ho>UyJLZ%u;@c8?F?c%l%>5_Sq@)m_Y}jqUD2rY(n5K8+k-
zz;t&Oo&>?;sz!ya66N&_XRFJwy=)t{@3;-6JCw0lVJ&?-LAPxOs!PiVLRDxGf~wpg
z>?$hDP*q*d{Y%hbA>?WZz^YQz)>omys-%>+!DiDK5mv6NgUura*GS0OsPwFa8C9Lj
zhYEL?9z}&CfJy?e&f!6$%MY8BZ$l7`HjUkpg2k1E#ZRDxvZ#rrQBU<{C0y#739gn-
zR1;j~RC|>nD#CDzfNX}HVX8F}Ak1P$<KafXAC5TBmFd8?j(&JI4x)Z!3D(h7)Q+y8
zyn6yQt-Wx!_rOosg<9L;YwLnP--!eP*p}}?AE!OJcBJ_|C)*gZs<ykW4SiJAgY@1G
zsyhC#40L2M(nZDB-G<Sg4h#+SU}$6*gJV<ZnOQ`3V;3@q4x;72J|y<<M&!V5#E<Pq
z@;KKW+Kcv$bsWF*n(-_dz}pmcn&=@bV4z^da1iuN;K`+OOeH2O4hsb=6MAMCE9hw-
znF&3)U?xy)4I7J1nOe+!HB~!2^IYbrYv%dXH0ob#f}W058mn{X&KYHn`V7V3RY-2$
z!_d$WKJkf980of(O`*8}u4nU>{qUxzPT#!TP<sFHkNCzbUWShy*^7@KS;ePMt>80f
zm+<Ke^Z4RD>-e2loxq>I?=Jk0U%MZF@#T9^qS3|*<H|k*yRE=enO$W~17Sx6WoaY~
zoN)U52q&5t$}()_gCy8#d<wmYf?hh8L0dP~9@R!PBgJG#jkgZvu1f4^WVmW!s9LQI
z##9HTw=u+}!Xw}WIYF;#M;SC?m}+sQ_G+psP+e63T(wfITcq#^Lwcj5miJ$dn%Yuh
zeQj+yYU>#OR+aIy)uGl}k4Co>4NeE|UCmV(nycJS-RayR)DwbrE;j*3@VN;*wNG?5
z!QyP@HiFKVK}{fq>Tm+p@i^)eN!XHkI8vRkwsbMvZABf!Q)@J1z|GEm8sj`q#0P&o
zf=G&?tU_Ui#ib+jXuf<us+SJIckC)UpZ6-1Z5)Pg|1mh{7Gdq_M|GCx>luW7a2$dD
zQAFB$jPZOs2*oxo=lVyG@9RfbZy$y_d+FWi>hD75KnMEiBLkh780f}$e>X-4doeLQ
zh}pprPDjx@JB7}bIdrWpqK%&3xrFS#HMH*EK<BPC^e-*q+?6{DPgW81%td2HuE`I8
zo|KPif<iF>l^aiCpP6M@sz_OvO;dPlo=2`m$v0~XV%p81X97v*HOutm|5E6gYEsjC
z?zzVZU)9!6ruF*FCRk+}4@93|jj~nNx3$v4G=;#op!cJho;rQ=h3An=|M&mmk6!*0
z_`;FX_`;zh_{!14_}spo_~PMR_~P*meDdTxK5%XnAG&t}UwF+fl&Ec5jk<{$UJO0E
zq3#sMsu6E>y_I3G!)SLGWXRT(VmO=6p}niK(Cmb;)32DIXLKK9n3bl=i>KnS`xzor
z-Br7&%sjQ&R<{jh1XNko4(upXNOT*@jf9?skXBw(R*t#~f{x*@#*eM5sYDGGo}gEy
zV{1wnhEn}mwo!F%M}vd=a^C8$W|&$5ht~qT%L0e9fr_x!XbPht5FL$nxEo#Y@H6@d
zdk@1}hnwK{5Qbjfr_YVbfFIStFsl6#)cWJ7^CygHtv`m^U<{T}6YNxWu4s;cYa{G(
zXbfi=)@B*jw!j`?ILlDf6_3Faj=~p7aa$gZ*<sjb_n>aq36u=Zpk#GF><>JN`nw)N
z<i1DYI(Y#lBU7;MJOJx~6R2F-gPO?&xVSFaKa6x&51PAs(L&&5hx*Y*?;Yqz-#{Na
zhkMa6+KZmCKJ?QEhkGzQ)PvFCK1_}bV2<FM9vQ^Y<PiF2Mld`#fx-DHbT3b%dwm`~
z8;j^&oyW++EG}NT%!i4Olt5+tl&S)w@UT5I!^sbTo<ca4rz!pW*S_{OV_s@b2}9K!
zMf;c)AL<cdR#XUX%C~2pM=t0UL!{VrZ4LaR{Y>a7ye)>_kD~MhwnrX$#5h*@{A9%t
zGl4GXX`kY<OqsTsWrNzcx3|~0pIi2$nUP#Y58`+ET>Smt;Ll(4TKx98+ws-Yr}1A;
zoxpD}n)%AX{q()~&0~A;m6PlE;;BV^?({TD3e8Ol;47#-K~E}A>dtB?JgGaYgMbKx
zHX9kYA}dALUg#@pb|Etpe9F`57Pvf4DlMuoYYi%`6{vAkqt;!E(z+cClNq*FY)2`f
zr@?qC7~U43SMb{M!cd7uLeEMnxLQF^p)Hp!1fR1M{xrdrWY}#fK}BN;D(xkx_LicQ
z(;W>Z3|T9vC~Dwptb>=ouC6)@JPgadRC>|6MuZurdo32Y9F1rs@M?W_g3f{JfD3gh
zcMSL#+WOHL2s1>Dp@ClSZ$d*b$>k>4g3Yi|;WdUcgkT1)a1I`V&qc_&6I6Wsaqz@a
z2xq%tO?03tH;jhmgM{4yl+3SV`|Wq3_MsP`@v)a;+Zk0bTt><AKGdGM1Md4@fcne#
zq2bth`1Tz}YH=B9zRanC0c3}V&@$AE9D&v~Jb<2|0dx~+J>$dZnJ9QaeQ0tRW1LRX
z=O#xmGsWfEA^I?;r@1^kj`77wjI2y!SiUrcsl{pBapxsIP<*jByQ1kgRoFA1v%*XP
zIOcRK^i=j`HWSfQs!oba;jH#E!&;SpX)qr%d=(f}&~Khcu4n7Hl-|vt_k+S){UDq7
z{e!Hy(nQT=^cj={)x6KGl`5*C)n}E`lL?IGJ(#9rr1aE8=a&5_rm|&yxBMG?@BdKg
z{ZIVm)33wt+<6y%|LPU|{_VHpk1t)pADlUluOB^uuO2*v|9W5_zJ72wzIkY)7<z>^
zcf|^?nqZK+E3~_dBbm++b_`iN)rG9k>_nlg6rTFZwzm>?sTRbdQ8*oTJ_J-`<=c6l
z3hb!bhHaGulZ7vULR$i_ylk5>2u}fc+^<X-i&d5<=oN-aG!lAJ-D%{oni_6nI9p@m
zwrDkK;uVN?dtr%`5mdJ!*y%)MD1gSi1$D`4I1(0yvt@8us}Qy|BEruSusYzWtw)&3
zFHo;kPj&FvEO0v;;dI&Ha@*mh>hrp7@Nn7f;<DEVH&vgSqgaC}G(^&9h$hewr#H2r
zF_wXou=6T2O|%uhXcoRi4(=qCUvnPTOeabSzUrA3SWZ#R?LL6pj-A1_(}djld(imE
zE6{NLg(x|25!+ASgOaoNq2$_=sD1PmaNPF<>JA=<b$lEl!mXK#t$CoA0PID6tRLN@
z!{{Rn2S>&*G&YI;$tm<OwCtaq!Qk8sM(3t6MpZYyFov<EQH-pPVq$p|OEaTbpPj(c
z!X)OFCo#1=fob~u(j@M>OT#GY<(BK^RzuLu@bq5@y<#^No!ks*O;>H^)@B^V&=bh?
zd~Ge$D+bp;SEg6oJ?>ws^mKkx(J8#u@ygOHEACZc-p3cd@C9QZv$>2+_fQ<xYhT^l
zE$IDdrk~$J3q3s(gx){?75?lEKZS4Ja~*$p{TlxJU3cO4&R)PDp1+Jgyl@A8|I8))
z&XF_t`u?N%uX`yt3b1>ovZBxwrI8S{8R|~GDq=~6vV<E$mO}TjcB2`JnaNlWQirk`
zH!Ha(6;f>-VOEMA+cf|W;YPsSwxa~w%BTPcztV~l19%nmN~%1KPa)XVl$nEKYf8{n
zm7`Xb6jX(EbzEP&19en+7H<{oZ8nr8OW|m(M@5K=p=ldDeT}GSEk}7PVK^H_YC_FL
zYT>q5!B52*vRDzeyXZ~?3A>2hf-u8hrFxPYlzVJ-T&_dNQHP+t76Ciq=v2mHKj9ZM
zy6ZU@+S&-b#zYgHYA;Sz8_f`QIrt+vhO;?1!&G|&o+H(Y%481pgX3tudJVlVc{!Zd
zt|M^YLsWIUP_nQW?#Ew^=nG!Y<>OR;kHGfC>rngD8{vA@&%*osSE1qL?Xb?wB0M&N
z%=jpBqf~mMeaJIZ?U)!v=hzs!#~7+k&7*yO8JUG8v@9+mx3Y|ml|_cIbLgdt8(f{j
z@WvD-*QX5FZ7fV<h2iZ2p*Jhwtq_7Mlel(G&{H@mb!9?N5aA21cYKpK<_oXsR_Ljm
z&g@Ykmr~MH02F-Ws{K@>U@0KML6DFuj5Lq?=b)$Q&7fy0v8~Yi*VLSv>QVz6>3DN2
z57TrF6RuzW@|TUzFQsSRtNEF9kD3=mH4wV)=a&8OroX(tb*00A8dv?(U*nH|`ls;K
zyRPB)@4OR#LdEyj*YC$)-Ej~8^x`%A(YZVDho`UL4^Lgf?;Sgb5_P{ap(j_DEK8$n
zv+S<PNZl@VmnYa-J3BaSFZ8I`EWgtOscQOYIKnWIA#!yMO1JMY`pjwsGF4FQ*v^NY
zVQhIN!`jO2*v4gJ)G-3lNXstJN!eADqrS4jfSpvHhD(&ftFJGIh3ZhiYp_)^Y_y=U
z%LYrY1vS|+Y>$+{)>Vz*tOwO2weYTmP(4_W;A8;yCaO7qErJd%TN?;52j>Ysn+-9-
zEJ8)+cUs_cHy{{jM8H=Mx1$_kFOPBZoD7e{4l5%30f;z)^caHf7(Br+9FZU#Qngfl
z?q~|0Xqw?|n)7M+2|Q;L=hJPlW!h1j?}BA$0=DH1I1U~`+hfmz_4X^Mx^NAaYmec!
z)7Mb4bO<G<ufzM2HxPJlL*rAgrviKemLq53nqNeiFLQJM0MY~fNR1DmWqKId>2b7B
z*)`8Dpm}{aV*3vvaQG;k2M@x3<Ot$N4xwe=9<=RTMeq6&hIcMuV&@`e))%p`v4pve
z1<bC`Vs3o~3-sl+X<Waq@K(?>l^$PQBg8F~@9{0>v;Z7)x)pjVhf_GHnpm}M)3A+Z
zm}!Q#YX4^TgeWd2(=d+adE~0;eXc&Mg2~(7_O=`Q3T!IVyBYN4dS;8&*FR^vFNL?d
zXN{s}Laz9JWuN`*XN~ilgCgDR8d@jlX>_(*_9L1eJ>zlY-}xW^4*%`7KZP%!yNs`$
zyB&XY;S&D*{3ZO&<-72|@3<HL>*{s<$;CVH-_Bj4>bo5!b?R@b@K(Jl2t1pkFxG4=
zkwQy0ZzT3q`X_<T>^^3c<!zST33y7;Q^yC|Xk^R>-{?WH3BJ-D+YQ%ni)DnI;8$6(
z!x(Z=l@uzsQ-<RDN&=2<sjejK3R#OAYOByt&*cVsV<qYu%3yQWz|(ApwW|@9z6Mlh
zc3>N!x2>rJj<E*Vmt64fjleQ(Cj>*Ni<H9UtwhLeLDc0iMiBGZ>{NIzsyRD61fI)B
zRoCJ~vO9=Ss~wHea=2sF1Y0H59aW;6%FoqkBnmJR2DrTNd))loUSn8Ae>lppHVHq$
z=c9XLE%cPpHqW2V!=B8bA(ex*wF^}_0<dog6(i%=HZcqLasClj52EJq1vH+&8x<!m
zqxSNHu-y9;Z1=qgj*Iuhy6XsBlQT$+Q7w*7qLs=nJxSorjw3fch1B#M;)H3OVQ_?h
zSl{J4;kx4*ym#M&@U^=UyZv^w96g5o{{3ht06X{YMDKy!7&^Ec!~54Uws!^7dzP@c
zcNr_Y7I6K(t9+WC0iK?&0`zV?Y4jfqJ+qyjX?o!g62#^{f{1!Ys4UDh6I6nvsoLaf
zoUG;^&jk{to-+5<aq0qR!f9)no?yIHHQvljbtb$2{`-ylHU}0m)tk)RN1sF2HpAQE
z@5bCm^Xh-BvdvreBbfOZ@=t7h&s5~U_sZAd*N&dUSB{>-@0~h}|90#o{`~Yg{MCia
z_@7tqz+X`P{n;IN;ZL|+Vo^2}n;UkeZYS(QktQ@H)5zomw?ZRhWkN~ksQ5OM1gI=e
zzrISrqnvUorzx+fG+f=pwi9?pKiN$e3<bh+qoHvrp;w9;D!RHVg08v>mYV7tuCZVn
zYB{Z|K_f#_tCh=EZfhh6trf6)>flSc5Eu%;)?<aYrxC8U8q{UB!#CkTXjcTa(^kIZ
zIqV=1>%u$W_Eo{l<D)J+d{lSJdZKY*eT1OJT?2dCin{hXBzGkdS`EQ5Wk+hQ2{mad
z&}b>#$$EybwN!BofkQR$`s)$(*wN&2BgXTDjqznQh))#3P#nQ<6T<3s5s4#^B=}mW
z08<1ZRkJUXLxA85<htR>^}*3Kfrj20)bvlmxv&$yJx5T#wjYke1l1jnB6j==>@#Z!
zPt77VNx%({AwMz2Fn5I0adc12AvZx4IJbewp)&}byM~4{cc6@tKJesA;Cu2ZgdTeW
zP1o*&@Azp%E?hv%ombF&@hmduPonkIA@m&Ihtb2kFiik1?pek42d?v(Dg+RPC+rzF
z|H({9>7`b8ui~;hLC=JX2_W@W$Ye4%_E89|A5FDqGr=GQrBJaLdV-dELzt$0i$St?
z@7^2xE2KVp^ym#Rsi%mUlt7?W#fi>k-n)5k=Cbh;ZJIeRrKjVi^oq9^+vh&_IpZFb
zGw=tQYHx2Ty&uu^C(xgZkAU$M{L{Da`>%REK6mT_K7Z&KzO0IiLkIDVg9q?ChY#To
zPMyS`-f;zgeb06L->Y|{#4cs$2_T@dyLb{U>6|f^Y^(asDy&u69U<4z)oFN^0Bll%
z0I%1`q@k{1w{6>w5`ykFD!c7U2vAWCrGQe>Po;J*FR$doSZU<OtL3j;Egw816+5R&
zA!Xr1EpK2LVt9Q$8Z4YP(5+nGNccJGV6j!h>Z*e?W`i|lK|`hvv0e|dBOx>my3sTh
zKxiR|`jG}Cb~d9bT>)F95`jP+BCZA`eQqi~0?*?`)a`@I-T-&PhSC_rty~G}=j#zU
z8s@YX*(1#;<GO}?IZ_ir^sQ%MB?POIRFtU-M0{2>6P^jD8*!HhF^?Zn{%Az}VMGYO
zus?!Okkh6FBFYb#N^#m^q{nYcv>?UsH=bxiEH{8q`!HPXeQ@;+z&$#OhM^JICKnM|
zJH*ek3*M1wBnUi(v8j;><f-~{1YYOVH2NkM&^EP-@alf}j+}#)O0nedc~o3`2=15t
z1R7rQ3N$|dMeyG9C~Rl1!h7vrL>_(^p$G0k^1<uKUb}?0^T*M9bT1|j?Z(3XUAXJP
zdyRnh#_L`5o);9K9yM+!08D87#b5lz4e+Q<TXEA30jgNV3627u_LVCMsLo|JOHq~-
z^~pB3%LPjf`du7WsyWKFYuAig(_%=fp|ELEh~{{?<~3Cgt>!aknVs%WSz^?*MwKWk
zQ`F}&KbsmS3%puy4kly*-TV&JWJTv!c&l|L9CZy7(mF=9x3|jlenit(%STYHsg&89
zUo8IMl~3b~$1mbbhfm;3`}X6v4(`WS4;Ng`R=#xf0KR$Z6#n-7Rh0MxVMJpIB%521
z&8c#t9U6H|NdgLeW;X}vp{KJMDw!}E86Ik6u^m)#8f~nQ^`sglrR6A5lF#xgBZYgZ
zfTz*Q%Bu|k*3{JV53ZE#obn;6=O5o<0FD5&G!SyiPHZ(AZ(5a=*k+}=t0(XpU~@O3
z!DWHPS&v3v1FWHX)cVR{^H)%TRm0m<i*SbnshK$PYgq&aeXul9;e`o2uLTK)r)h?x
zQHHf~x7UDG(AfY-m<lysg3yW+fjta&ch;d|s08_a0aT|;P(#Rhy33JX^`pGA1eTd{
z#8-V(arH=eHPE&TO>TnE?Lpk@N1Q((397zSAckhDK4nz#219U%LJWVSXilW)gdTrf
z;+Z^Rt=))s_aW5VX{3Mm_UGXn>VkK02*KeAga*eD9~nbtY69u;QKUx4k)4>PkD+~P
z3Y}xKNKdaIbnqyg7cQYn^8NJfsCe}Gu)X?qsC?-wQ1$o=VZZZHG@ibj(0c^o$6o-)
zqYoqeg2#}4<UZsspF`L2gBU-$5A%oi;;x79zY*9PukI%F3eQ^M-?zV0fRFid-z>Zp
z=!!#0z4(HbK&5@nvOWF1%}N3jj!!@Rv{Cs{3`c=SwY@4wRsvEbA~3hfbv{*M2s9>)
zG_Mp==Di8X0;JCE;*XZjA;2nP(Y2%$S65e!doWG+t8E|s=tno#e7|YnaGl=_Z*LYt
z>+>tTl~}lCKfG-+5C8N%jpM@S<)8i@zt11|FC96L-#&B_zp?iKKDWM(&+T5vXZCO4
z69;zU6MU{eM+NweeaBFeP|d8ys>lm)1<*;&$))hr@K_mA;)y1>JRa24)-!~yFvgda
zvMVL<)X%lXYQuIaxe|?8A<)%Wu%oITJ1Xl?R$Y$@jb64nCPg)&S6fH$)e~ZboV<a@
z8{X)E)h@Rab__r5gdf9FyUz(LLrM$bW%V?|5wyV-u)-f~L@3+{U&uljGGq>xqaj`c
zN0LAb)gs`xP~};WbUV<(`;YS6afYQ)yA4r4L*sZgLc?~dDi>^vm8hK|<W{N>T&O`s
zwghFV5;%v-;9n+am#bhuY=v{x21lGKlghD);EQ_*Qm>%rMH3aDpqKP0drA|cA%?n9
zhRsb>da-7t!YO())m{oQhP^StkKcW~BZEjs649<CQbTRXj0_?<Fo@><L1acpkR2aF
zdW`d9gdP=NZe{`<^Roos95RcmNSr-`=)KqBeeeNTspKqA5^S%24XR)AQdB<nBrJD4
zg!=RMz<T$?us`|)Y94+N&c`1@(*t)Sf8_%DP94G2u>)8<ycgFVeZVNu6V`4(ufVn&
z&@0k`23!O^LC36}Re`-}0$s5}6A)D6UmOk#WNM&nsw++R@85p|P$nQvH<g`v4nax9
zEx-Qjziu2~Y-$pt>zU^;kJT~7=P|dd=tkF<3skCb(Rx+Nn2<M3`$(ZG+ltP&yu5tl
zvznhp%jUfD_}vO`e^gVfLEIUcld0DK@&DlWU-?>m`NTzh?eH0Vb@xGhb<ci$anEji
zeD4~5dCw~T%bpGV>aN}R{K{UGWYw_PP;`YXD4MD_MSG{(;k6p-E)t8vMgVC*ZKZ49
zM&H4ZwPZUVa00M`>P@4JDYMCTLQNg})xlpK`^)MYQEg-~QF<z+q_V<D-f6L-PQYs<
z=xl@=UESN%OvdW;pn;0aO7&!=ifSaR>{MwsLZ;E{f-~TQ-Q$4A=Yq>^gOf_o>urQL
zV1p;@K%mJ@AT}TnvLhOB8fAIS9yeONKE!$6CITuJ;IY0sL_570Iob#BJnwnB67JnL
z1P{4TF~*mxqXc!6W$^8Cz`^y7!%n!)`cXMugT^>RVct)C(<6Ql!&)Cf8bF*OZrmG1
zJV1y=GYHVbO>Jn3wGq4-v<CBNj-{xs(`aeVp(U9{Hr>pZt%=`#9BrLxwD)!+*WHVJ
z{{ZsC1IPf6KybfM{pCjc2)S|O=`FKk$SzJHGc$$e#d#!79759_7ZADU4uq)GJdZpC
z-wU6Foo;#F<7jx`VbtCC5UQVe9Cc4Uf%?ZDg6o0nh~Ie$ofpqwoY0#)vKLE-cH*AL
z1wFNeD*U3}0G~0@zrxyL00=B*c4D~{kNKj@isz-~q>QwGG0b-D+I2%A>D-!s#VcN6
zjEQ0HBPAxd$jx<9ak`%R#LCQYve;Cip`IWre-`L&22m5BQh$QDOu%~Z!3T|PrW!S&
zR;=b!rl<Q*cx!%EGki8pa8%9mt?>3oH9mLW<9GaBe!t)Odw#Khhu`_h*W(Kp?!woP
zUclEk4&!&$58!LN_v3Si_Tf`hd7mNxKefIKzrMa7B}M~df==Gnp-@(3cv{Y(DcJ-!
z0a0hEH!3N%?_jv0@U^rs0$E8ZA3%n!<%C{Yy%o1n!QEEdU?k~m48~CF4xonct8)bi
zK|d-i4g%6nRj7oWHq<rRQ7_mLbQXr24K6~?<s;<$XmAs3{-EJDUjU9kkm|1BzNp&E
z`QZ;o;AhzA^ZOZg2dStSLI%BDcEL{|MmZlNjA8^{lgEoBp_kw^&UNvK7jA+vlJlZx
zsTIw0Q3PgOi0ur(I#Y|te%|w3HC!9K=c#g7*Ba4sdj_csX?|(9!5(cum?}@_j&mCI
zGjt9H=pmy_FCJ0(T?d-`#^GuofTwpHvB7!x6Maae`VmXz3xk^{a>ylf=t!l|t<>(h
zCUoYKXy=bndq)>K`uov7MEDJLqJ6Xny%Qto7^m`^8b)r80Gt^|W^M|JjU_}W<3mUH
zBXH^{{O3<2aODyL*RLUP-@OPv@F4t;KL*dso`Uz4FGA?4=OOmMJ;*S$?LT`QvnLK<
z`S32R9$dryPux%F898a69pY}z82>hbX97TwdFxx>YGgc7lM^-M)jsBSSuyyuO*!V&
z+(fSaw(oo2`wX>b0!#Y{kgAMOS>Wqm|9ZUm#V<CLm!@h4qc(b9|N7UB{bf4Fyf>M^
zR16>G?bDz{8c#zRQeOJfml|Mw*~?yLgw!h2`-4CD1LGX#edw5XzVn?%S);kG7^?b=
zYI>v3|C!Hx##m==FaFH3kAM8*c=x;Cy}5MDemLXbSNR;j%V+C*{NZ7E``I`AG~R#p
zPW<}G+wt|oXYt$1d+{5qy9m1teCptCs=mGW;>KS58lBK9<hT>$1iMVG1&QW3Rgxc#
z3@w$nZhPqt!mjlDpjQ}Y)<z}gKpCe3-ZrYZ9U3?%pwY)-M#kbghI>_Z4=Ng*RDmwk
z*xaZk?CKb5TBy7PJ3-D$*fn|uxiCZ35S%>T5sVt{3di7zMBygzeDNea(FA<4B)rk4
z&Hk-1qt}QA0}l#3^caB{3HYhV2qYg>7^iVA3xol$4e_WKL7&wKf#XRBd`(n-T@JXq
z>*1SnAh|b&-izI6oN7QtS1FnnLq=tZGgSv)kSdZt2nni?W-5b(ac)8{5a#vc1algZ
z-ce)@9!34y2I`L-MdaFbRIlxWb7=!!hJRj$QK3W@nU*%RQuTGDn$exr0Nl;!$hV`T
zvj-i0y=WikM#oSGx(U94fnE#^4Pa<$4E^+x*=Y<f%%O+gv9f^HovX<451rn<3oZOZ
z=Z_pm{^VI?FI-0Mp8L@9*rP~2|8b-rdl>n9??TUo)0jMc1Z&6lVb75b>^{7T2cEcx
zLBqFhsJ%_rZJjZ!7!IZhG{x)m;+r7Swyj{2nPqvV{d3#5n!vVopJL$3%xlYv_tm{_
zy<eHRu6SQ<E8bVvF!wJ8oH=h^Py6Y(t)J_b{c!f}e=Ph>)NKCS{P%$m{XE|A)T4OM
zU3cKuE?mNYJ$?pX-m?!MUSGh6S7!0~g++X2aTUM5u@fbQa5ig%vYIMuubUxcb!|1u
zDhN1Ac2!uY98Wc}h47a1l{KiKx~nGOl&JGIhOH%aR+Krss3Gtw2`yWq1+G+<0F0u%
z(M{mFjWH+c?aHS|*lnu2Mi14UmkLY3^G9GK<m8S>!f-pm>5exW?h@#lQdCS?!Y@rw
zF?>v9IL~$59-^xZEkY<pB7|Tlf(Shv3?m#Mv<O9kFc_fvixPk;>mvl4JT`>f^++U~
zaD*ygYpz0Lo?&c9E$Y(cXh>9`bDZ}OsiC^GAmnG*OyCK6NdtO5G*Okt{eo@+&O{#W
z{z*jk9z)6S6iW6UK;WsD!gBY6NZk7{n)dF4Z*T~%jzL8FxL?;0;vGH6bSv>{JG$Dt
z3~%jdLt9TfdImbs*Vl%j-cF1U^<#Lb5B-C^7#kbG<kTcaCdLWCDU7ZyV30ntXB`6@
zYn<-Iz@Gi++J6vTr%oY%@tmRV+OA$eFQ;SYj$!`f0qi`!7Y9!6!QNvVxc_-S7<&3A
z6GTiiU*=*UXkGD3Z<@JJF#t?RnCCS2*$S?kL8BN-=JDqFbe>`u>3q+=?lU0LzR%p7
zF?p7m+iqsX--UV4#pf%wn|+4=*G!*Ku1_yFmregsP3O?J{5RkI7XF%7c>gzl4X=CO
z&)_Fs`Y7If?Fv45@d`eF`Yb+q@+dxYcprXaWgTCdUBD-o76iQ<L)tcE^I5_o0*kc)
zJIcxoH`SenUerirWmI?N<yC}X4Z~B1q1YPSK~zw^RWpPwX9!zKm9~u`Y86#lO{58R
zgil?FYR?lgfM;p+!fNxvM&)O7XjnxJ0Pcg0(383o>>7EjjmJ3&Hya^mBgp)jHrNR^
zXM&(hwIa}zhb!2OSWBldA4ulmCHz9^R``=?gqo?Ml3Z?TK`=%bh7tr_oZur2c|YM8
z6<NrOCMvWfL)285iqK<6bJ)wUwUG*tp>Nnu#cPGvO9kj{K-_2JdTt9i5LWo?^B5ju
zXdCnUxIKtwU!3Ppqdw7w=H-L14Xk0u;09_=-3|9+uSEUbFGS<H`w-iI0=50)@a#E>
z>|M`8$?R@ecOF5@fg^~IjH08X6aDRN=<X@_KzBQayE`!4+l}G=9t;k45qSNW933JA
z2QW7|j=9-MjLwZ<e2vqMX^icj$N0`AOi}sG9@>ZT!+SA!d^d*9?8n&YLzq5u7;~o&
zVCnQetex72UG$wN_Tc^}A2h;SJygO)@xxkZuwr8G87tnW*q>!LUHR{2&v{pyc9U&4
z-TkI3|J7!m<=^IP|G~E9gJ>eq>G{#mkzDiU^JGHsS;yVXpFLYJE@=55cosDt`rn-X
z1*dQM(=X!%zx5$J|5x9Er+)fnc*EV-@qzPK@hfL<$ES~-#OGIc<4XkJFV9XCpqFSS
zVB9WLRuO3B6-INELMW@o#T6)}vMXaqtFdP*jOHgbD6eTid7TZlRBh!NP>1lV@e!I-
zS%O`CB1Lu7f(C+1shH}$8b4Pdvk!K<Q=^8t{YG~;Ct+dthm0^*s!jv>)DwJm9`DJt
zBHlNI)~R`9#-`!Uw!=z=7tZ$~-ZlVNJcnp&AK{pWn=o`Y=iqH208=^mIPXhl;inq(
zb9*2{7)A+0&impq1fn4%!V$uXiZ2vEz~e%b2I*m#swOB&flRfsE*Dy;NSj@1=jTMo
zr?R|4ZEYk#5BU(|F$pT%lt01q=TP50gz%nYsO()r<-`F*?tdwETzUl7M_z@RbN9o!
zd;m4WbFdt^fLcOu$GLmqy7N&451m4Mau!`(-56@mqo;@7+lrx{4vh44VYIIYBUE_9
zqf~o@-qhq6rs-1?Bbc5Z#RQ=@u{es!)d@_mPh)O<7K^(Uv9f;+3kOy)b9fo^Cw5|y
z@LM^t4=bnkV)gW1Y@FSXU1tvxd<XF03m)ZzdZV}P7Jo+R72EgkSMX=qb8h(euq_w=
zzWdm+U3QaIH(BvRS@EfV^t1i@+q3TbCLcs#)AO?xcF(2yisxm;+Y9TTS<*kXBUL1#
z`rALJ|IA<gHXi-MPvEit_N#c&mp_2lzw2l4%KNV4EqC9EUpjpnpF4OQUs~IPFYMlh
zk|-6A)9JWTlO``K(@0cG<F2$ogkVJ#Dl4l|RaIknWmT<_LAjhEtx~a<)-@X0hpPy(
zT7s*ZFTW+8L<8L-05ZH)WroE;MMv;CIPW0v90u@02Jq}W#wO5F;VJvDlS<7-Rp-vN
zqiJ9iEd*Z6_zY}Re2#n<BEypi4Nbz|H-gCEIQ-oM@U(TKk?PJx=(z|zcPbB$obWT8
zYEbUgI`x!DB#cy9k!TZQi6nxN2m-!Bxm;LNhO0pjm7gj<ynaO8UV>4|E`VmI4@rK8
zFku-C6OzFYntA`paI<mE<k^#O-F_BjJNKdDz)4hGyoM42&Hvgr!12KIP%%7@`hhvP
zj$Fo$!<V`4Da0OnA-t>m5$NbcZx`X#*~XBz72W-u_IF~izZ+vzd832<g>+J*!x)@W
zSUZWq=`pIjDNHZVV{T~<iwkpDU8FC~V0Co{>w5*hMJyd$!YV`9)uX$xae}ZT^mY<@
zd#L{QpF4#8rw`(h7d~b@RE39)_RNz<d)C6g-hRZx|IO_C?n4VVG{`e{)BU#W^Fx?<
zmLL9XKl&MdSku)^sNJ~oKX)tUL7QK30bFYQREQn)Ps_!w^~LB^VM`&&0N4E!owMKh
z-gogoxaIYK@HyP`TOYvvfA}#x@<$)T<G=M@yy2@K!b7iq5U+jk3O;(`G`_NN2wz#<
zgA$v~ikg~gW4sBqY&RQ?YMfblB~@Qll^Pb;qPDgUb#-cEXEAc@sRp*1US4lSxrI}!
z1Lbx%%IH-DTD3QbT6&Gkk6MS<fS%nIG=@mDdjdwS><!?l9FJkA2|W+PRBKZjiNP^s
zrx(yXK8rx_FsxL15yCINv<Ca=1hTvL!8J67+{#Wk1U&-BL3h&KEmVF~f11}iJD2Tg
zhRJjn6{NE{Ll|a_KC%HqGMLK36G^~NwHl<_RKf+H8W&TE26>-;ekLF9InH}+=I3bU
z=g{zszDUR@H;nQAW6=~GEj-Vjb<`c&jlh|+Xu5hgs*jyU#o3E!y#Ha8oj3!_(gy6y
zd*C>I6(vW`qvqa6;JfExB=;UghA{7?(i`sDgx+8qx&}MZH$)IB{rk`$`sNqVziT(T
zHr9|^TS9v0GBUfC(YA91-7AZjT3W#heQj|bYs<4(-MwHyZ<)S+cnuq>v^c&8dl=5{
zBlr#wdWX**!XZNMd4!%GLc@w5X2bQwYUV$=nV-in&6scG=38mnvp#C^e$U$QuQnZB
z?EgA5k1D>GfBjj0SkskE6y5ZjFmK0n^EP#m{JRC@31BKj5zOS8*E($x_6o5^0eIgB
zy>IcvDn{jb|Av#-eEpL+{n@wU<X3+Yr+@Fgxbshc84vyOhwzfG{Suz|nJ4hmcU`~-
z4<Ey??>>SOOMNXWD;UP^P&O2*ywV-UAUsljrmH2q+7_uLxmvEPM@B=V9gPl`(YDS)
zS8Z&q-Gv$gui8=Y8mET}&x<-Ifkzc+CH(9jLeCc_@Pe=fRE8IZJrsq5VXc#}am7={
z__TI8)mi7lIx^EsNKP)m(=&{~z&O%tyV11w0DS8^k=%Cxo~c=6xUN3a0av~U&X#t<
zu8pB?D_nG?ce1s#p)u1AOSS`+9G&}GvxFqQp8M1{<p{-gLX-OwiUu?{f#{AT4H&wq
z<XlGgvbfO>FGkmpmqv|pR5i9uMrDe#IgRQ}7M7kq*t-Vd>YIRld>+k*Ps6>F_kH99
ztVa*Qea9tK-*E>@u3kmy!w;b9;_dJ)uA!xW1l`?T80aoQuSd`u?nL)+HwK6M&^<oF
z?_eDr=Pn?2^cY-655s%s?Qq|H0q(0O5j%ej?fds)l0OJbs|;gT*Ri^HCzcPaQRyvX
z?Z_&29bU)&qq}k7*e)DCwFgJf?8C7$2XO4vK|KEjj};y)K4$vQcwle1)@kKUOsk9i
z$Jw5BkI#A|>CbiZT5s<AgZBGD_N)v4E6hC3bmNav<mUSS|4moXH?WDHn|~AL?U??o
z+tms|tpKuOusw5DZI{|J;aBWDiFr;gDo@W#zVQiceDV!g{?ePV_S<j6(Qo`bF8|^C
zaQ#m|h{wME3wY{1ufmhp&*Gg2kD;Wdx^gr9Q(<r(0kD?JudcS{2JDpdP{6a<Y({1b
zr^^M0Qw@VXMyhp}k1%8C=U@ofxLGk_w7KItqh@x~ZQO3BQZj%?*x9I#91(&<%}xjl
zA3^6QXaZDZK7uFGIe?MfhmfA4l9^pWbZic}<y}ax??rsyA#`564Bs+e{-dXmBpjU`
z0|<2v!_(3UKi%8hPUtZlCID^e4g#(V7KX-ktv#s7bi+NefZ)^`{9{X~&GezVg#hdr
zL}k7kWrSWG@1rrH(a_RxG-c?VMyW#Sflva0a1u_2@b)H7sSMpxsD!qg`?(10K(-UU
zraXeY4}Z1?zV?3Da^0}@4#7c%mb~jK>}SuyeeD`zPd*Rs3+Lb&8ArC4p=)=iF`Aj6
z*E>My4RxYtv={xO18AEbhi7F4q4O7EJ$*YYcU*`2B`=2KWiNpHg%2YB=-o&jKZ^dN
zCox9QP8>dk$x|mWed;h47}~BMUBlkP8#u%e_vrE6IDT>uPM+R}Q-t1$QwQ+m3m)Bk
zxcFp!-$Sc)T5%II_ci^;*>2pY{<rz|m@m|G-hA=CKWM)nWX44cv;R^vb~lgv=luVF
z(^a;9W2XPh?!w%w=ty;>y)i^V;SAbRj6uV+lADcdYqx*;9{!pup8T)Bg2j)&4%46c
zDa?HC^;r7s)7bMHZ^7~Z{vKTZv-jiC-~A=L`r~iGBj-<`M9OaaHYN4kf$FLX0<Vg|
zD?qQlu9m=)!m}7{_UukOTpl;v9*^N_6zuVN`SOS1@dw}!aY~>WAtA#-O?`x$m+sNF
zaD<9UZSCky33w7oLaqfNhJz7RSmZj9Z0|v`vmebpL&%TLVCKLH4DULOA%=Y&Yx^+3
z<?Q-?WDgw2$Q8ca8~ZVN{!S#PSJ5;yi$u=^qPYRY^8-9*FX7i^K+r+AWx8O^b)mMk
z7v-I!i0nK@rFaI-yHBC6a}w2^<8aJvpnP}=+j;%k-cdB>`e2FYVM*j+B|NRoc{CDe
zjRbTf!{hpv3@i+borI$=PSDYV1f7?l^f1(oB(i9ss*J@mh!ULX_FlAf4Is#SX&IbB
zuwxWq!Z)$L53%`WB)a<1KRANk?sh`26$5?k7#Qfl07KjU5pEkFLT-K<fx`!3zjzh(
zcQ7n|@$2FHskfp2#jk|xd5<D+_Z?_b^OW<Kkh$YNr0#wYnFk&~_w}ooJ$DE@kMG1m
zD!k(iZ%>}wi!%h?S%$ag&K|@mLhpqyEP|eCnl1KcCY#13J*>sP^#Izlm;U|i`|jk%
z3jI%SQt>N%W3Ok;78bPK^k;4S7n+WF?&F?wtGS_QWAUo3{_kbFmicX%-(bN_jaG1J
zI#rIzHE(`<H|}-o3AjNMRa7XGqMDa$7jwJj1w&2Mcjl?De+;`m{xnuU@l#m-^iN^x
zW3R%<M_-2J-~1Wu{l?pIiK_3xue}Ga{@lA!Qpk6wEXIUhWjX2yy#|K70$+Vy4J?*A
zhO<<84jWY+RUV<|@_LLcE9$82Q{!LNgc5wQcoQOoS`&d5XSgSi#pqlgO(YOYH6xmC
zK{S&gkg|kRD-vxTNOg20)76JO6;Q|EI68+X(Koe#_OV&?&aGi!WiNW@tqVIbe&j4w
z%O#AQyhJ5)9>d2jpnu;<G*hi*Mi!CkokFgEhF}>ZK!*_~w0s$cyy-5gy)K5hU8v6W
zqGogn;e!_t+;axuou^RNHi_DyB_vK?fqnlmH0<05_v#+_hUU?b>Opm)4Ye&Ds7bd|
z5#~{o&7+!LmFIef%#DPq%NWC!Dl^qVr&8tma5|4jDuV<8+1lQXSh5A#JXISNX)Dj$
z*EWjoPTgxiQUqRWdlz~;JB=nPgKDZm=nW2Z8SV0hM~5&lIfnG&EW&3`pyuKwG(7MW
z(r@|z!aws~)V<)<@IC$nyjLzEdgcsLSMNgj;V0mK{)-WP@r%*=_(Pbze99=tJ9=U-
zPM_Lm0Pp<S1GsSRAkGtrr%xZm3!i$7&s?D>x#$^umYEOxGwV0kn7`O^V-I;@Fa5vR
z|Gmt--m~uJ#!Z-8%ulNMA~W`R#tPLt#r|yj!CPg|z13W6-s4U8+FW7ocQe1~asSp<
zbdlnFDSU&4s)<5)YF1JdO6$I5OgAgZ%yXD0DE<?0on}>RCl}z1{WfLax8<9Qf5{C`
zQR(gZmDge8W3R*V$6t$;U;k+=ec=rl`PElq>eqe}i(h&pj(y`7as78cR8V>viA*Do
z)ew5MRC{$w<XK;XMr%ElUagVs#O~5aXjFHh5W`uYk!?jD2!{|)CJ>B-(bSwoBAG-p
zJ>8r_M#&5~)wmgHg03Z>Lvvdm&F$^T_Vl8)ZxHSMLxy(_P~8kqFnpUr-`F&I={*y3
z=$=?W=j0;Vrk2q$yN>SVz35v%gznXY7~XRNBlOPsJ;)Mb`BCmi&r$K^>8aicG<A<6
z+Bt$i`ykx;J~$X=)^XmtxDUy5_aJrZZj=u!V|(iqDu-53x91ov$IifY`U2ckm!8pO
zSTjSYPIaS-A#pjw)9slyl()5`ytN$_tz8U<3B`Or!{$Ny04!9J4TPhIkaM=R!Nt!U
z>S{-*y%j;u$2&Wa$akQldl0P*fjb1jo<4MSb)lm*kHM}Ej4`|&;V}dKofsPK#mG=U
zhDL@kJU@f31G~_C?L4Y4pF_nR_rv|v(+IupoeXhb3+H|J!FS~XqUQ<oOPAq(=yCX8
z@=8Qs{&M6Vdl(Cs&f?(7gE&FBojr39w_iAdGlbtoPS2e^#4z_Do_gu?`EY*QXcC}b
z*uUqr@bDENX#8qZA(_9ldKPayx#|<S>EHd9`TNY{zN6oJ9?Rp*V^s5|sWH#6*@&Du
zZ%&Knzsuvazm71s$xTy(-$J9`XSS%(vbpq6jD>VvxiK<+VV9y~c<%3h8{ejzYyROM
z3-|mT9(3bA4CiMrYJe;=kI}jH9hukq*0&3v@mY<cjUR-<5rzAH=6G%SHfQ9;=h1m}
z52kC|KXSt+18(eVPPvQmoqmV+r}?dB-pX^Cx&6C5$G1h8P5aijctve1T%d3>S~UJP
z=9M`$`o<Qy2ATTqw>Xt=p2NIhx#o?~R(}Vr*ff0+<_DFl(VzC^YE86CqYFRw#rI<O
zM}88^A9*#_KK---z4_1n6lOpDI!u20Co%M?pTOv62*oe`Y(eQ6!!J_l)mE0FzPbYS
zRC`uSE$mcyjg1X(QsH_0KKKZ>Xmc~dO$<we5r(%x_^I%ksNRB9dvQW9Ni~-w^s)q9
zD?`8bOb%_iypcsinU^(4b*8J6VO}R%yZh1EKa3uQb3>yT7@ffI#I)f<Q?nSHn#TYY
zU*Gf+dZw4rHNApPhOwQ~YiOTXLHG2A;ca8f$c@aSZJf}XSVC@W4q1A7WCqPc(?|?V
zBHA~OK-UQTR9KGwN!ZsI9_~90@2*oQ?Oi}k-wGTHhu}VS88w7nNabXQPQo^{i0bA(
zlqWh-nPphq(T^P+Jt*rW{POLnYU@KK&sowwgc3sb|Ksd0pftO#EL}7iBO-&D@n>e{
zU}k1YX68(#l1Z5<W?8l@Sq90LnaX7-uCiU=F7CpvuGf95%(6Z2TstylSKm9{8{?(M
z(mC<t`On$s+jDMMdzB}Gb^aK(F*Zq-WU97ouM1gDD~baiR1;kFp%87j4<@${Zl4c!
zg3!(9-A)VP=7ry7L)_y+#3MbnZunz<L_+~a7~vEvgPtxJX2(!8J&pn@-OByvP<!?!
zYR+Cp-O(e^?%xgT-rX?o-w(~HbI@PD0rS;s@SZt^?%g{uwK$FWCBklL5_@*fQQ=Ku
z@18mA*)xwFyXW!v%}a#cFY!Z`tRJ#KNkg#=1eL!9V*<ecI_UIaeJ*n$rC|SpuTTY>
z(D()G_V-udqssMfggo~DZ^*qS+yC<KH(>JUpS+IT`2Zs7c)9=ZUR5{s3b+KO^54p_
zNyz&i?f>W}qI`dSe?EM>g8BQr`G5Wps*mpBNAc+W@_iowQW51*tUM=B`xy`JXMBkq
zqnukg&p+me@T(u5=U;!WJpWU^=h61Zd{UnO8TZNR55AXwu7rZ?O8o!&ColPh91#CK
zJY@Cn(OZ8hCq{kf!%+WZM;>R$3>c{te#WDd>0&DQYktPkBNgG#xi7P5<nbSK|3{1;
z^YNd^b2PLP|M?#)@A(m5^CN!NPZ@-v44VH>ewUO<Su|AI^Ztfs<iekR7JXlQ63w5v
zh2}3lfu8TYi0<Ec37y}02`vO*<U7wH{O#wE`sNG2R(b+GS$RyLmnqgZeG764yquhD
z+S)=Cm6Z{C)u_=KP^&edvbF&YdJ`P}0Q`wK>LeQrp{F$(U^1Iwd<Z=UEufP&)@HG)
z%*#fZ{h`*{aKYsbz!wa|8w{!Nhr<X)qX@+bzhnx*L<)gK6D?f}f=%rRG<Cw8>VPZW
z26wET(E+DYcLF>{vJK{BD-3*(G1US+-zOo%UFsu^s0k8&j4}dIM~hrGxdU}$JCGUa
zL7KN6>9JlE&+bx6uYC7G<P1z8%}oWU^<s-yT&lTsb|2PI!L7GBk>>OvjmmC4?diJq
z0c@UJ#HvIqR`EDihZ_k(FVYG0tpwXfLM)%>sVE#po{hh~xf53(Jd3Aac^OZ=@HC!!
z?g>2e!adx6a0Ac1^fX?2?ioDy+|zjCm8bFKhn~kPFTQ{eJ@+DBec@HS^2$f>{D<Da
zGq1mer(b^;_uu?HZhrb}xc=pD;ObYuimPA#0`7g~E4crquW<X@c<_zy;K4V)jc300
zRlN4uPvc{s{scbx?#J-SPriZo-g_JGzVlIh`rSA2?x#P7kAM0teDRxK#yemCB3}K>
zC-EX*`@*}Qz$@>)gJ(bXQ9SqNNAS#tU&B+cy@DrSei8Rycoz4bc?wTFxQDw>+{P1k
zZ{gnEn|SKp9X!v+@7=kLCm-C$r`~x7Pd@WBZr{5{C3y>1u3y8o8`p8|`VBmO`6|wy
zyNDAf&pwQEIC<(kPMto_?O7Z@ahh=oCr+Khkz+@(@4$W>ICuyL4;@yy?j(*JK7_-E
z4l)j??ZJZwarEd>96EdiM~|Q6_ddYI%hz!1G(Y?7MI1VElFuDy9L3RN$NAYO`MP5`
zarz7%<LmD6+T4HYMeg6ii8Gh@-di|M)x7`62^{9<?b^FfeYPCK@sp?d*eUEibPNZM
zp2W>3p2VF8&)~7EH*u1`Ywy7$IL6~Yd}QT#<k%1K7>}}oI&$<FkL4KlA3lbChfm<-
z`OCO*?>Rj4%A2_R>}$Ak=UH67{S0p2e-RI!|1h3^?QOjLkx$_{ZtuVFVcdKEbzHyy
zJg(k(8rPqA1{bg1#;Nm<<IIJ}asJ|EoV)NC&*K@D@?NsVNXY*2<0o+J*a;lrcmJPp
znCIk#oNva#gNOgi_Q@xo#*OPYar4@B+_`xRw{G6V^=nu8_ch$Sd>L2IoWZrD$8m?}
z;=zF<cxL}mJiqS<Ufh2KFCRRHmwC=#K5`td@%!Ioe1^aIiw92N%Lk6*OWgkgkM#@Z
z&*Qfrzk+XHx{7a~yNqu<ejQ(b<pq54%b&oTfBJd6`gdQ)tAF(kyz*Dy!t;OoO+5L}
z-@)P^yn*&F-ABuNx6%CG9kg)2?YEvq+qYjp^Y6ZhmOpw8UEh5J1Hb=atdij=n`v!V
z>@AgE+KROm;1w1YsE&)0iVBq1)}pdOs}f(<=w;O}SrbBoT0+h2^P}Ew<Cd1y>sH{?
zG&I1#*ITrDm}RY8+FYGp51qvX6V;xZ@A3JA@CO*7AfZP9h67Z4Ap~M!Mg+d7M2t!&
zfnc(U>aT^-Mi{msl<Yu&%FYvOr3G%K1s3Qvs|1^R!cw1Vf{vl(`!s~0Cf20DS1SXW
zqfLa}Eb@CsQQJR*s`hahhn7)0Fpt{#y~v}&E1lVnjMjc^v`4Vn;6sKfK<N3f&gQ`y
zLTjVdiLC_SW?Kkryp70S*azFa7f^cYaa5nVjEX}iv1Mo+o4b0kv8e^?Vo7XbYz`$*
zMCDpdc<;J$3;&a~mDtDs{N;bLzWND+#`2e};aIKyf=cN>SbzUW)$LM)awFBbY(;Xh
z5`5@8@e@|GQuF;AL#E~YF9z#MRU<yC6=f?8!(!URs6H~S62Fo~lezsVUm-_v;mO;W
zIdK@nhxTIV;2w-0Ie`A1%jlV(LHoogT1N)aJkX0|SDP9f9f<N8`rYt(R|Zi>!+x|h
zCJ>7R(bd_GJ$v^+7A7}Q@mn1ZRRKz3S`%GBO>I32WN6d29HeLDAUmfJnOXU2yDc*p
zTefDXv}Rk=(p3VCHFPI3MN-WPU%y`E+modTWE6t5Pm6nzaxH11WOmWk^lb=4VyI^&
zon26(h8<<+7W1>VsRXxa8JWn)%ECq&kD8f-ikb%6|J``$wYPBL@!KdYuSZn_-&<IL
z4Qbg}E7LHyWvSefYh>2arY%^vIUTDvrePD0-{kP3vAqWlUl=(><+M|qv0?L8m4k8<
zx10Gr8+m+^CQB`|usIFuw`B0WxhSpC!xe6VC*A>*mzA{1htdW+%C%1DSQdk+UbGI(
zAlfzrYp@x5cU-kyYw0w!Ha`j~8?Z@MIFtpxa(Vu;@_7Du-uOL|R!c4IuyLardn*$T
zWa5EjHebnZzA_16D_@_Tor`q-zDJL(UArEcnb~mC;Y4{)+ISshWjVF5zu5%4g^o_I
zQ|a5ZJcs73+u+)i0ss1RMAoGvz99ptP2148B@4}IS?J8l#&C8HmUx`|wiV!@L~arG
zmXu+CWfe|RR9(@Taa(J{WuqO(Lw+2Z?84#O`*HfcCvf;n&tcbBUck(kpT_uaJ%`@k
zCg8sQ6xzT10Lf3^MCvoQ(emXd(fGBe5dG?X#J=$qT7Tz7^nU9ltdb!rs@GO*ZTe=l
zE=6umCJGB=z1LzCm6oEsx`rxFk7^d0I+Gn08WXBC7XEERt;Gqg$4|TFfh(1y^5O;H
z38<mnZK$n-wx$j`hDNru#;UN>-ugr#1_(I<FG$N5ic{q!2)sm4g+I>c<Fd~S9gi|%
z2oip>wreof$eXMY-f$A`2qWGI7hz|UMROPyszftwpdpcjmhaK>J-RpnD6k7AP!@<`
zt0#y}-U#x;&1&e1VPF<j-Q%bpnnlgn0xAZlk(KBq^g`HZq+(;FnLOBR^<pa(UWSua
z*XqVbA1!U99qY!HQFiJEwk@AT`mR&(y!A;`J^3Umu3Sdi(hjU@=|)=bIJS+<ViRGz
zionin?!vD74-}++OsKst6cZ2e2yEPyIfi02)dPG^o>oRe{wI6NH$8li|5TgO!$0L=
zX?uIrYbL3+mA#QC`M+O?6;)rx_?5guj^G!B<1g^ovkx$K>>zp%>_*qVUFc<WEX|>H
zW)iKFW9XO~L(9kj60*WtOA_8_04|>c9<PJYbD^avPT&P-*L|3qoyOqMpjvB1XEZA4
z$(k#Y)=Veshp?chrDqako3Mf4$jUAt4AObCW)ct?D$&I{1q|M_X#~dRO<1#boeCLD
zFY%~DPKI7dY$Y65uU@65Rj%QC4Hi2ryk?TuQ4(GhmDZ{`;9GbDZcS$peCsyx=2r8{
z3!vw9KE}%Hz|jlXnpJ|5Djl*4E3s~CHrDgFR`WBrq-9_W!LVUVIxQ35yCoCb3d&e+
zn=!q-2VFxGgqRjvGV@kav8`T<b+Vj-8uTxq*{BkE2zb|RCV<m&QBXyP6lj7aKzCw~
zp+w_GUbRg{na;<@S;?@Hb|(i=Wr?6z<3WYNPxvJiBunbd*qmLAjT!k!&n`qpc0M-C
z>?y))1#|?kR1#|P!3ykT)|Je@l6?W5WH*<s{4F!E<oR`kyX;E>kESMy3%WT;E-cqn
z=Iu)pjl-yisk|Hp0?oW(3*2iqBeZH05{xEB>zXZSU%wSyJXgK^%<+v|uybPu4iJV1
zSpn_m_wLEd!GYoeoGdQJweo5_Q{I4QDzvytk#*8+#Ide8j+~gqzK>qP!dITg6k#{`
z+s~kv*0uec&oZ7v{PTB_B=p+8_yA2`zK_V~ZX)p6>xg~%E}FmoBvw5drjn^TE;5iW
z>qZpjQRPXJPS%RjOG|jw5O_5v8|o~smC5~eCTJZ&n1XSbqX{$+V)}3hwY0Bg4YDwf
z^xoD&Q(X&PbshBe8kmhHINdH<+JMTe8{{@bB@m8>2s{CuAKthZzJyP0{Ykzq8KTvV
zAe@LI9HaFO6M~^Am0?7+1$t-~J*>rDGD)A$IcSfaw8t(&Par7AW{MJoEQsbftuCQg
zA;VD!>nv9QInEH$%wBBs#IV5@K!z`gs@5Uoh1-zkiebIgPtf_WjoVB!VQ2Fp)8WHb
zvjdx1qpxy>v5FQuYwu}f9e5n82KHjxp2x85_LJCn`wj|j-^Er+{8b&JDA{!a)|<~D
z``{Ur9DfWYI}RehwF`UsI#Ff9ImIAh(L?A-dztilsRm$fA3ZIh8je+`0k_ilq5!C#
zQ14ao!2|Mv+<pK(KB5|q<#G88`6khszfzO9FFf}o#;EiLj~_zU!M$kRxrEg0Bw83<
z3sieE<7gTkK%CGEwWZ*V1$aH&yfz-hV?lJbH^Jwzqq#9oySX1O+Iy++%#xj3I<mMg
z(375h$(X%?H^8dZqTaS3hc&*yPoO86wq>aSfrCIVeOm_AOQzZvl_WQ?gmNsbz#qjH
zS!FLHU8yQjSu%b!lxRkNSq;@%4pmSY$_cW{8Xa;aQ3k;$YHcmm$vUc$wCobp8QidY
zQ_xw1aQIWmDXyhT$WnPBMG;COT><45D%5oZ-YUL#D?u7%1s7}Wfyo`hIx54}8`H6d
zVB3_oO+ixB-U@)WaG&aszf+EBU3xyY71yEK9E8!=f*Kp4rw=fKDoPE!zw9aAb6wE-
z+L7PDvRGwDA;G70rC<-YD`l8nT2DCVVzW|wc@Lo{s!r6LBn}akxmhW=-<)2WMu>}=
z6O|{A3-m<oJ(`Ltk(HH$6z?s!$E&WPo|TuyZiUTmhSOq%v%VJA{CwE>eZIAu5M8wq
zsZ|@$!f0Q!37x#Yy}Z_ggy=LC@UHb~IJhAlM|kcI^ZO5GZNp)@%*$ovxJ8xxbU`Iv
zE^EMxwMN{k)!~d@iydJn77tEg;e~TpAlN3q^E`UK^$ePR>j9bxK2dk^cW<JRRyX;D
zyM*2igx|Z0=;yB^`PDmw-qy`(Xo~dI3h)XGavrYgUyQQK3bnq2vb9ur^;9+*haWX&
zH|i~3p6D=YodFmK7H3;K+#T(xbvse2)u39Vfrg+n2<#ekuo_Ho*&S4P(hC_@Q>i2`
zT$r{f#2dyR^AUDlgj+*MbSL0%^26Wgr<~_&l0ihN_#}-|l<y6PLbSwT_z64-Z!io`
zK-vYv@P(vPBm#dVK^PK#VICXRpo1#U9&LuvpFoW(h%%QS#ZEu+S)&(v!pNrj%MQem
zO}m<B2_wr$J4;);fzNN|Z_ak`aVsH7KxSJ!$nyk{8H{3Wq=_o7A9*LQVcXFg$e2Bk
z!h?5^b@Mr--FX^mr!Qm8`~j?Loxqxr-Kf0!G|KM0fU>(Uq582qC?1$X8ISMC_3QYd
z>J1}<NH#%{08af=Jz*=VOrZ40*~)7a+a9t@m?x8YMA1nIES0j8&y-NS`*km-`lmiX
z{-?Skl-ZTei%Ty(hq04KFnH=HI%#p6b}S+}KaG~TDReH*AT>6K=s*_&9Zm2yvZ9H2
z;SA8)hP}KgB8XG@`KkJb2YWFyJpr9Tr&iK4P$B3|Cc;m0HyT;7mMRspX06y!QE!`+
zwH5dY^u*qZ3S3#DU_DjdMuLwp+bW4KHY<B8N>AFRS613v0iIahtjui+dYgGOZQ^Ih
z=m4F~gAM6<RA;#;=FL=HZ$?pBol<wAw$@N}Y@p&=vpEN4H5Rn@PUGf-*Kp(hD{zHc
zkV){ZCKNYkWMfMP@0YSl6jjt>eR>WyWfj8gjpEefw-9dbMzz6#%%V#5oGhKNRX|A?
zh}x4?14Z%4B3iPXf~dSTY5CZaUyY&$C(Pkax~CT2U(Kj>wV>M3gmOy)b=)^a`gxzV
zq0}5l8Kc}Bh1S~)SG<?^XAHR&dVXgC6=vaz+Ix5ow{A_N0;b(p7WdamPk<+IQ|s4o
zEASKO3GC!~QF#KqM`}+jZj#_rid}MXIweb!0XDN9ZoLN9qC!{*bT`#waMgN(Z!Oi{
z8Z@q1helqnHbQrRpF2r4xJ2OXrP@2pIL>o_lIP+ykM$yLxj^s9@>)Di%l=fg4o|Rx
zx>;|)WvdIPcyAw^8^_GuqnP@_J@in)rM`Y2Np2Hgx~HHQ{?t`?KYbOx&t66F(~raV
z$#d|0>Ks<3Wo%_3%TT?v`T047UM{zk*@XotrBszc$K^Hkw6_MTJqt8cNVW9XW`e>%
zc$DZ&sI=OlA<Q&>FN$><RO+;-*Xf|u>QpbTjlgqQZSc6=3VH!5yfC2`2~){M3AdOR
zo`ehjHZK~-8_~b3AJINq+fFZBtxh<aoNzaJ6a)i=qd!bQ2K@wFi0UE)w?D|BH710@
zjG&)Tl?X)m7$L}Tg<@(9pxGUPhH9;rinWrzrOfU@Wgx1QTMhwUVhy3x6hOH>id@$4
zn{BkQyiqa;x*UQ&&*J0bUSv5uSZ8ygetst!j$c6j=_|-Qb`^Ce9$?kreylxu70yq5
z85M-ws=j%w>Y2mFou^TJ=Oq+B^&u2L{R)cCUqepUDC!!!U}|i^(JNOKbOdMuHU&Ke
zE$&G>AD{ghA6>D$qRSpKiv$zk$=}ku^;7Pxq=u4b<pqp~(36%tWpf`s{wvi*DLYh@
zzmRl{Vzq(G&p(A>THF4EyV0?05pBzJXkVH|8<k$$^cYg3{fPB-AV`(xBjj9!o;&DL
z*<V^(5(xO5RCzJ%Se{2H=!2cWll4}#2E&Tdqb1U^*3Qn!#cJLR0zI*}>v$t%(Soj7
z$D5JXGdrh%@o1?6f!-#i^wKl4l)Y4KVzh~yH*Hjw_R-*awVEEGCyHv5B)cV`^t@Rj
zO&!QBsX=yrCGrYukWYw}RA^948<n0@LO9XBrcoVbmSAIgG3sd@ukxCG=&dh6%d(MK
zSck3IWppwIzPB8iGS8f9Dzm5pI!6E+dl2?eGq)kCwp!(iHf+nI{iRZ(nv}I-N-Ju3
zOau#W?v4Cg+6mWZ<YE1`BBU49p_tIqQ|UFhn-%1$9W5v}#F5t!LX}<GuE$YfqY`wr
zLmTKp4Z$d{wMTnUXN@4epbl%c5bOl$7JhE}wp_I?jezUXe0s63kCdLkE+;1sxw%xD
z+jw3dDnAJUpS)IJDD8Uk`ihDwbaZwp=qaT~=;@3)m`!?kOoU%~8BFQvaPoV60zO{f
z=o*5Ll~am}yp`9ghu3<T%5Qq@7A&vZiv8=;acm=jNAR7aYQDgV;(C55t`?Ny3b(h*
z>+vKjsCzX=+^EpuZi5RaTt19Xbz<T9Q<(nz6Bzo&Gid+n1H?aji_p6P&s&ed^6`uC
zeC`^epSzCGJD1@9)CH`{EtEAXSh(n=a|`m+tc$#SvA2b4ZGzHL+JN#3H72jJsv6a_
ztTnW|8odEk4GoNX)rM!GZIw=n8f^m_2tFy)Hk%c0my@t_!NskUD$wmE_<UYMiSY8d
z;R!k6N;qIn7?7Ba!8aE`%YkMDNBLMs19Srh_;<t+Ty8|?fkCv+$c+|)FG}#y8au;b
z7Bbq}V2t34A>@l9=8GWV3ov{L^E+f<uQ%X>laky?btlU&>a=F)%yv{*Y$$cOsQi2=
zCE%(Fx++TmrL?nocG_A#p3c~2C+G;vY(Ac2^AmbrwP<(w_#C2lo<hlqvnV-DTf1-w
zt9l8!xkD(r^Z*(A&tui_B3A7@gp5nKu=e6@Xy1Gnx(|N@+YTH+ep^54lU*>hcEZ-x
zg#$OQt4@a>QM!pL`;`p;_yrLtX|ahiF`z3dZiQotUJnT-gDO^3-a|F_5Ongiyk25u
z$T6RI^u7;5{<rd+yyo)VYZ%`?i{Zs-^v_M8XLbx7lSAkjA4EHW*EG<HWKSzX%`w&X
z>J58Xf%%X~hFO{M+R)necDG@kkaY2;FdFqr*@>l<wjdk9S5{WWn<<09leH<<VAUF`
zHU+%`Wyxe;7LzJ0T2X_d_Cy`Y{x*V8LVzaiebPXmwsj>0c;a62a&y!TF3^-!`L^)p
zw|m3Tm|Q3*sYgBmwqZ*?@``FvSlWQHY9lgo%8`*rb+)yDQHY8<JMv3;Gl$wyT5Usa
zk(T#M4RZJzStx%iRn%rGxQ(>DMYLefSQpRFPV~<ngpGF7PA9OA-@8V@%h#@>qRA<!
zKz3mzEEEosSa;3VY^3Fv)0WopGt4NU!mKf~GEWRZ=WVC8jnTqJQKU;k;~jt{Hi=4G
z2g<FjsCITh=kJBV--QNe6G|IA*jCtpwS?Y!THG|My&Or}%kP#Zg!LOXD*(#As6RP3
zxp{@i%Fa`NZ;|7ab4B2Z$`eq^{v*39`<bk8;_(E66dj`8bw+~D$%>3ymvm~75vJTc
zm{{SM*RF$$&<k)ISxfMV(&IJk->?;<>(Xd()3C@8n|okG8jf$yz-cP{Gray6dCXVx
zOK`oo64y#=aGl`0QAOZY>G6D(0oUuaSausRJRHN&;|mxk<VL^n1UhMHJHGrRVxPMM
z=eyTnf9EQ~pSgwPdv{bP$0}K1Co4Cbk%Jt<E+;P+Ik~yNhF)<A$_c(os=k`aO4L<X
zqrSEl_4ReAsjE?;A>^b7$H*Top<qYQdA(F;gquK4U?<R%wjGbh4Zp`j7&>6{TVRdR
z>ZEjt%!Oc`vBS5+3qyMa%ssWxPnpnoz6YK?Nz`_m(YB)-<GaTZq_PN6F}i7Uom6qI
zV3H~_fv7Krgg1m{T4%r0NL%cNH)@04rGYQ*rP{E;VX?wS6>8S&QOlaCjzBKen`qhH
zsM9%BTTX@9g(51s9J`Bvb93uKE}@ub4<MH{dZC}{&=bNce&5EfL99wPQ_b~bP1g|C
z_f4VU^i34(zle(CS5bN47Al{33ad|^$L7cGA@}ApD7tnPX$!L`CG>Pny)bt6p`o=E
zp0OdEdg|_qh5ZHbrP|yGA{o0x9Q}k_dF1DhWZpxt{o+^e<EK=EKPB1(ZXZ0ykd6)c
zJn0Zo?SK;U{BOnwZxGnNk88IsV`_dJW3!_eo*F{mcpo~4c|#9&qNT45$*v~E+mouz
z)gKY{?NKT(P5??rhQtJaOMh=CoDQpkp6Oxdh1p_(fr?KS;TGkwdNqN!dM#EHe$q~s
znM-R%WhP6B3jFd5OO(>vz~=;fYB^DYMxd9zjkc0Eq$s+TB}FA<c~Kc>%GanB_j1+i
zvUmfTS?NU^J5WH#Z6Mgvb1IZ_D<f=T&3!OBBXl$_Y|bi0VWk13wN_N?JZSEo#Idut
zv18wPG*JCzWR@Z`yI4VQ3xOetbhi{#qHSb5j$C>ga|bRV(m4SwoyA5f*DXBOG=c9{
zf|%A*>}O`Nq+F|H8FZn}>_&d2mhh{@R$AO*!qyyUhgsV1JRPWWcB0PF4NGJa&c=DF
zyB^wJ{v92KuVn__rg2#K+G;}(1r<i*6jPCA6;W9dd{mm!i<>5voRx@l+=znC$t4gO
za()DOk}^)(7!~+<z6E#!yA|mDS|v)RCAZsycp?E=R7^TE)YfQ(OYW`u1{ey9pl5}q
zrzN%$dLD*Se7wfZ>o=ii{Z<SUbQ83<vm4W~ePbH-Fb-`Z_^5(U{|5A~QAJ-Xsm9gP
zT3jt_!0pm{Je6NT+gpco+Iq~G>(JZk!SK;>j6Qc9-5<Mz*7t59K>&sbJ|9(|_w6eP
zee5dYZ(ql%!V;+o2)u`|%O#=n^72toP>7<UVw4n>psb_}6|}ik6#~5))JrEteZ3lp
ztfh^WA1i~8rTg0LauG&iUBie*B5EB{>9mlQ-CeY|E&|WPeR<3mpz;b+0j6kS+iXZJ
zCE=WP!#(FjL#hO+d0O8c0eJVspdEF>J{Cm#ayNp_yt!zlQyzYgKSCQDr*cdn;)^56
z+OyH;M~n(Gl(Zu{OzS)7hOylQS10YLwF+8I8JxOmST)=?)<I`8qE-e}8*Q-Z9k3F5
z^*S3WY2!;Qb`)A2D6%>k9u(RA$S3qN9W3|+-WqolYlDqg9Zq0fEXDmOHpg30*f)#S
ziGHL{>_ExUbEvuT1hyQz0L|SuP<QPm<Q_PJ+^H$l_l&~QF$Blp7-N{QZ$q%Z9~W=k
zQSg$XC@Y;74DKr|$^Ra~>Y+6ijkeO>CtE(e!mpJ#Jba&o`o9bwee`|_^-p#ly@}Af
zc?nZ<qbd{2&_pkKhC9&S*MjD*6jJT1Tv}oXH%1VU&Zm$Y0aii+y--LhGaH&xF)YkY
zQDys8&z{L-R5ymKt86rx)%vxP&|+l?#?@-Uc!8euv}O<@>AVr8$5VPn^H~OESjh)!
zkK0YOw(`1c0y2Y^Oxmkt{p8%7Yz4fWtSmw=SC!>-0#t7LoI;l4Iz4~GAe})9n{&#L
zQB;F+Esx0)L)-8Sobfi)QpuMaoG8}V)ZFcQM;O&6AC5hC7gry=h6YQJV6LDus=y|K
zB#p<ARa%7tD!t~xMYIg>ge}@f_0^(2M>6#k*VH3RhUG}lswl7=SuL;-dF8y9Oine_
zWJ_+TvbdY5Hn$ZwsLlyTtPiGO4|KjB0&fKFra6j)3Dmp0q4jpdkr+j&ozP1SL+ePP
zw9bK?5)BH<HOM0<@=K~vOgmj#S%booa$5QlRwAV;a%s&o1%?EkT9jYTD<P+b2hq|B
z>?GveN~I#r2+|QEf7cOgjZMu|cuv(^p?YymyvJ-56*@gEWfd^6V$u<Crq%1<ShWs5
zMrieVB-YU4uHA(0^_wwB#Wz9F%`tW`_OU`c!ecp3wRt8j3zxDBaD__sa&8$O&nstC
z@b3!T%qhX$!gAayt;Tu&z5^67OQ{eh4o+b3nPW)3c^=_+uORyFbp+qOjKEv0gx<c2
z1Y=ckX)%Ai6plRByakLxs=Q(q#k3WYLrxZ~R{7$pYtc~G0F6cqy@8;UAIKjq)29S#
zvU+>aA3!urdl`$Xh{|AN%3vR@rohh2t)B`__T7Y@T~_&`y>+*{(6rbHKkcn|ijZoo
zLVVJL_9Gog>~Dp6gthIiZbV0t2sILFRF6q_01?`i2yLw>yrd_B7Eb_8Vrj!>_&aO}
zFNG01+=#$#-k|dV#D}ON9i`A43wW_AP;D+ng{cBMyABPkHLO~ykp{C`nz2r2LY2Xc
z3Zn%T1}nD?l(R-IA^h@eZWPc)XM5tvaL14nOdvZDMY@+j<nc%XgY-ZL!PkXVor73A
zHjQ=TODI@Ai>f^rkv%qxg8o5hdq$vd?}xE(2(HN~I7e8p2L{kIK81@<KC8ytNj#br
z@xSSEe@s5DbZe}H7+>jpki23NVvpr{`S&ZzykEXgQ~bUxOe3)4n;(v85zPJ9e=E>i
zQF@P4>5UM20~j3dR!K<PdmGW*$?Mu8GebfI9@TV|*EHx-N>8TCc|11SSqnx-1~5K0
z44c)=3X#w=Qw{LuFk9`=>5V8UDOESnqvaT-z3vh81bA7rT%z=3!T4NRONA;)Zbqf_
z#A@;Jt-M)t^9mH$MCD~s)#c^nAd~xfIl0Kr&Osi_bNV)MGDQk{xwJ~o5ETk5$)b9*
zQeqlcn1HjYY&xNi0n|D}sI>Z!-(W$d#mDmD#a3EijVp>+*BIdvQAs7zX-zj#jck)e
zt;zs~KzYCEc=$Q~mJ!t0QYh92Q148r&&{i<r{$ztB?Qu`zS6Ubv4z`Q-mn!7Mif_T
zus$PKb%dl-Ew0~IjO;QUYAs<kG$qzOk3{b>QUiO?G<<;Dy?ksDp414T*iReX4UH|1
zl3H5vay|0PG)N~TGIC3hT~MkL^%nCUmMrBtvb-wKZH62RD+95*kH$g%W;_C&Y$cbb
zG#|+Il8}ALuoR)i_0efbkGynHS!4x2Zk=R<qoxifLRClgre969w`x7yt2V&Tkj{-5
z;n%cw13C%CepYTWd}WSGeiyIp0bbj~8@J*Ft?v1Z99-O%k4vl|9^1yqEM^pOKM$AE
zb8v%|#1n;7fK={>8Vs0j454p(AG&VuLF20@(e(BuBtCfsiBDZa)2DBt^;5LBr6nb*
zMtlUkd>&|VQ5i~#%TdOYRasSw+B%t2t7l=Qg*91ePpF9OPJ+uvh{+)10KrE1M52UE
zL~WDF6cUM~8ptTm$!k`|<^>U;`ctzn;!*f$kG#!%e@_^#3$0XpjYuxguD0>}B9(~F
z@-yeD;$~84S?Gi{ZddBiAM_Ehq89mCfrx6yYozk)pzLn-5q42MG|^Jnm+WvI3&3`e
z_IkS=jSF5>y3$b_$%k{ugo0=>DpGZ@61oj8fw>082AMmoC-@9%ZCZ`iNYI;k4lD$r
z6*UB44b@8pFLD_zaiPJ5LL;}dzXiS!@(JHEXA{c(T}Y3#VO_LI4Nl&iXu*1KBMMuF
zP}JUwlD1ChdIzEJ8${heA2hUshVc=Y$46ls8-;yh0!Kdl5y)z;vdG~-GyVrd1~khc
zWC_`qaen`u@wX4-pBMta6+<j=qx$>h!<IgQ-p^M$G6Zq5{WbjLmC~a3KCa$=9211z
z2%$GHA<%0>M_(gaI$622glG|E5IHL_Dm+hs*Mn+LvO7DSW-5ITcJ5e2b5jx)Y3Sy8
zT3JlqK}BFg4J*#9tZb}bw@$5JE4{D(C-m|N5&@p{cFMZQdAwOgk%+Zjv0+<P$nXkr
zB)J5fKrcHpOKo!rJ!zEBp$f|@DB%rXp#~-A6Er2YIxHVNiG$~^z(98(8E+cgVU+0Y
zR8TC(-6NRZe*zU|FG{KOMD1-UB%pYWN0;{F+Jl#|xaT->OKOqB&y$Sb<#Y+AP$Q#*
zpVb15yA{Sjt5SMfXwf!Pm2S%?@CZtQo(y!B0neq?T4WWJ6Npr-`K459Mr4)L(e~C-
zg*KqZ!uzRX3e7{iknG!mc<)ZMjqXRs=q~u^BpR$y1-s0`dfN1Ieiq?YR<AyP3&HSc
zvHkRHSvbE4X;gL_2|1}W9-Tu4IVxuf8OXeapcB|V%BPu@Cg2kYO8cJls`I>4B$=q%
z<Bd&(o+v$;YRYq<H^61Gz}29Gg$&Vc&7#HK3d`!vaIV?{kBW^55QO11>ye`UZDaH@
z##o`vQ*8?P4)XYpQSC|l-ubj_TujTuW9bEWoRE85fsYC>yAan2z$+O=xR_mvv!!)7
z!h3a&PGfYm1wE&x(fr~Ogx@*?|3}XtO!e3F(Mwc%^xnBFI0|^YDETZPC4^l$C1Yik
zwA*RXK*ggs^9R|Su(`Z&c>HRJNI-^+$nX$Gls9!e(WoMkY(lEBMQ!DA8CNIM%_P+8
z2}UoW=Z%KpiUwehx?xWeq8+rr-7eZt8!fLMu0|s|mpc#{qq-PxgrkW!mh|8zq~V;3
zFdTwgRHOjcA3~B4ZuL6R7&If;q=#)l3on6ZKIumD-F76-grV;&M5%uxsyYkc-Rnki
zUnTTYc6cU2us2I@6xE(V3$ym&Y?1~&O!eHdFdFMkw8pYttqHX{8J5$42BQ;Ni=P(7
zjcTh46@+dP?XdKssWDceCS=j_7Q|vGWkpaNr3HyMp{lh5m2K^WULVY(Bed^BsHZy8
zb88$Mfn|CMuAMs&J9`!juYVZlzW6m9{@7=*|MgE{$BQ4q_LpD5%(Ks6{K;o9^YZK1
z@y=(l?~7l>-uFI_-EY2y<FCGk(@#CXr3ZI$`H35N;<*R-*6)1_@5`!Pq}(IuJp!Nn
zTN%zG9U1TA^6kqQogc-(R6lwNy)HtpjnHfAh#}rA&|~El@u-lM+PnmwTExh0MR!*l
zmY3$>b~%*F6W~cjX<<dFF__f6Z5b!Fdey2AmM9Q=yK2oktd?OBR3JrVm1<m@Ku_B5
z3V8#GMU^%_f!<2*;}+GZUR+#)Vp&y>AQj-{WaX&d$ZdR|v{{vuRiUW78iiD21!XnZ
z!kZ=4Ie?zAdDK(UmFpcS)!7vQOLYzuQpGh$k_v;HYKzyf!H$izoLgwC%-%TeKKEhl
zI&xl(hbySjB9ou1W2I~4wbuFDp!E@at`-HoI(tB^|C}YcB>9`tbBeJouZ-o!LFgH&
z#;E#a_5^{ql|U@1mR0@C$S!F>PPq<6b#|0!JgBk8q4m&u+nZ^>Q&e_@U5y#r3TjlF
z<c73-tlyH0&FQqOSp`&g)k@_F<YXM7Yz2HW)J9e^lwmxNRGolMX1n}mJObVSTWn=y
zQ(c!<(v1;%b{Cx)udy^;7<2|T_0&y)WUs7)f$G(;DGTOx+u&H8LA93w7s2OUyBYp<
z8xdh8C*W&kh1bW5Z*2Vr%=58bRDK7j_Kwo>p4^myGn=z;ehXC}f9GQvg}6c$dzt6%
z3eV?Np3lov$QO&MaIvfoCn!{A?IsL%(fRG~N9^((B2Vl_?Ab$Tdg&Ba6%uw*Knf_o
z3#ssmSR+<eH=v&Ou3m<yum;nb9Wc<&nOr{DsAL?H5hW6*wUMkS2?PnYm_V+n70Jd{
zG`Du5vAJFCM+rVjB;X6j;E_Sa-1?$%1gQdp$s}v5C_IS}TuEA8+FWbWMm1-GD{h9#
zM*u}ls?9Iimx43uqOBr0BP-L@d=aX|2;mv#4Jc#&Xs@CX2S3*gdxuVSh!_t!&~U_p
z_6MCvoeiPTyBZbYji~I)Mg3e2s;28uH>QJq#Ho6a^`>fC+&b8`8aNuXa1wZqIs%l|
z$5dxhicg&vg0DeiLVbgS3fPM}trhjsR%O)@czRf!9yFL7(C{}GdpxKL`B5JB6Lu-&
zM1v@6ZA4W^D{8yjp&#gld3XTk!G1W0hiP#~VH_HQZDAhnBZuKRcNYG~A4lN&4Y;me
zgXiuY7;auh>4nqCKXn?FS8u`g>Kh2Y{Vsg3eHiY$PayW#1vDPnkD+6GF@A76cAPwb
zcfRx))j1-Rdj!1y6MBz&fG<CB9m9(g=$RQp$3!n$hda^K$7|e?M5Kw8Oo|mx(7_7K
z1&^PVhL;tU%fbrIi^auRba!{aWHRtZkflXs-nCSk4pi4`P(T$Y(AywGP-IqujJ0E7
zm$o{Y(j`MVN?7S`rL|eViEv9>fnYXmSsJaX^w>(Tud=jRe2z-5tgIY`tiZ+6N_oy>
z1ul*CviMX9p;Yt`cw%qG_LUMY&7DJ--+36NHAZ9?Q(X}tb!Hz5s*EVtI8dQ+vdqV!
zw@0Ybs`<W3TI5<d{Y{9p4yv}kB9;eP1+apOU+Yi78tY|c&FdHHfi2bpoi~OIiipfo
zDlZ;SdTuF7s<hCUoZL1L#I?vPsX`thl_o&UCm;)}P^qD1bkTa-!^kYtBe%+mYD)~c
zH7;z<uP3<6u#te<l#x$pO2-ek=`utmTdiNSRXT2Xz9gZq49xyO<*`C}1ieS<PJu36
zf>6sK@VGDSt^YM-EZ<7V^Eq^enN;8+eu9x03<RCUYUedK!)Bz?Gg@hJbube~#`Hp1
zHs-;;E*GxVIfULexYllkXZ;ogH*7&{{bn=~bRD$2GH7{%vB1A~5Q=-(ZN{PXTXB5j
zHk=|n&uq@d8N%;8E$_)KnK-#Em#{0xV?4(v^NVqUzxQ-)Jr313U<b?JOo-~dKZX9?
zEXU{P(RgPMR!MtE4iCPFe!qmZVR>~uYPGZ^E-EB<5GIC^`$k$vvp)iBC`R}s;Ue_B
zu@rpqMuZ8sSW5?M|27p(?LA1fc2N;frKMWo=WD!tkB71H9!VcX5XSf!GQF&c;B2C$
zjeFrvc;JdTVD=gKdIwx#2`3zW2O`M`LNYg(kPAj5gl&ZI3$Of5poFC{J?@0B$%3K7
zJ!n7G4$Th2^RN?^{j^t0T2wV|Lt%IWG&2<_7|DnAfD=W%tigvYsPR|9WUqr+R|~sd
zOYjkfRC;#S$QG(RY3Gw!DdiU45L9(lti_9s0hHOoD07HywxiNq2ffpvhVSScPQJ#0
zTCW!-s?sW#6?Igpwarn~Hz!~s7|cDi><o8*FTDMI2n-IwJ2(XQ*cgmdfR^PYnD*{Q
z)%GPAP98_aq5Wuh>@4c8J%)-amr#EBI;w9!g@)%|hwh~}V0i9jSnuA0@4`7m4)4Xl
zsr?u^vIDcn_uvy>{VXIs)c+gs<e%(Ii=MpZ@dvjtw0#zxb7N?k97N-2FXDae2zRg&
zYm6eC48ZNPQn^#%$vPe$tAbuU5yp`t2N4MPROf{ZPm%eyVrz|5z~$9-$R&`(-ezWO
zqf*OUQ4j(#Dl|2xl?ttlE+d^c>3UimxrwDsO4_5u`U>=<S5_Q~Ku$W{Wb9a3S%n&Y
z@yOEV(pC!aN-AnmTv4Y&pjS?NTUb&9gV}?owgFo95m?=E6jta7KaH}AIVCdB+z)rK
zjjGH~l|*P|$mi){v_)|8@&lZ>`ha#gf~}>s$Y-Tz47b7(?PcXX1aqVh_IMxk{si*W
z8Z0KH$uUuhRBP>O3TG)RQ=`+5!peGOZL@i7ImOikUj@Hk#`sxT-n>d_R+$2*Hiu}l
zHQ2yoNy{lwLvS{;VvyM+8)#!??4O$PLFFc~mFIFB@9k`!w|t73JVpl3&juN?@*B2R
z&b@-&!;o1ok}+N4H=&o2u@WoLD^!YHu4$^dRYA{W7PZHFm(X*X?6BAK`qQc#v&v!K
zQViRMLb%ox5PZ3?uc6{wmj?fa3`91hAxW^c5rVx0;PARlm|`q2cJS|g(yL489i!Si
zPUxNB_SDuKoXW_>@yuKtW1M7#a)xj}#R~9vK@m=uR^cdPKh@@v(TJgl8|^(Yv@G{v
zRY`d{ib`1<N}-X(v3ZkAo0VR=_6aXZY-ymnvk_K-_AWS5&G0l4Zk;^{5p=#(3;c{Q
z_Y-`*M7*tAZKExn2oZuozFs20kk@-@frE|BRAd6f7{X1BYNdW}jLI$Qhl_N!2c2*R
zsUm_NxV<hlpgBg1C~bynDjJ~|Vo2|BG#EsJ&nJRj#3L@%Yumfniqv8ZfgM4#oNY(+
zcmmFyPE_}mqpYV0`t3RtPn4p1UW?MfS~T<+Q0cFP#zAPA>UjZWu%!_uy%7c~J1uLj
za)TKq1XxW6?aRnG(vpPg(m~`7PGfyjGtzndwP6o!t^rj}7F3T{jsL6nhSa=Ihsy(<
z%}iw*fsZQJ!5T8y+J;DH7XmFUi1+p&IXnbE0T>t?hIe8NM_+saH$V4TIOgY|pPhka
z_f9k{EkM6>H;j7@!hGN~%ts%G`TSj2uRaC)wfpcsejTCX$I*8D5c-ep!T9mrm^`ru
zAN|^ARcFX=w)L%`ZUuevf9V*J;Vq9n`2_lRETDCM3e9s9Xr7rslGZ!i(*<u6D=_{}
zAC;bjK+oZ%I;XW=pvoH>qH2}#TK1KM0y<h+i_OU!!=@Hn5_>C&X1DQXlQyENsw!w$
z!4wwDN_nNI)lebIxJiO9mr6|r@Rn2(EWBxDaIXwMQSO9=TRPi{`MxTaU1@t3WtT@=
zmsiM2oCUs=YN}3WQNXJp?4&ods@|Yd{g+lWAd=|7=+q9lLoKwGHbTjOt+}<xD&}h|
zO#Gdl&|4!&%cfe(t44lt1EFbyuW<lNM=zn67C4u7xYXi<Io1hR(<uDyGrV7h;AtKu
z1ly3sW7tNEy{)7Yh1CYs^R+UjuY?t;3}`N*BHC6|iK1#P)n)}s`CLJT28C5RROy|F
zwhi(=j1hhgq!rd-Q#O@ydiF|Z#6#t~(y>E@M^J5)PKT{By-g|%fnPS&UMW>&wOU^~
zEj#Z6r6)-feiM2bncG$de-mtvz$X<`c6OEmUb+nY&dFxwvtpSGsbG^$Evmg%nlKDT
zGp{`b2JaUe&%32k2VGVLbeoD`*-!%a`f_;Il~Cp7!@D6DzV%rM()z~MrlXM%>{z!I
zeT3i8+D(|G%A4c<(uS?rK``#wn2v*6vv4#m2S>ML;~2qrG>u!vX#((Eb^*@j7UDE3
zy|b+FPUV;3SV1}V^4!j{%#GI9W5{g8s+xudRMxQ|K3v?Ujs>%tmevx8(sH)IKy|0}
zhvA}K4RrUx-A3y=!kb`toR)JC(XL(vz-UJgB5hs&B|?N=ps5{xg3r(Q`yK^ta6r@{
zZE%G9(Pl!FwMDqG39+Uom0iU~J7^71`T2bCc>`*8MVv}8LaQthCNKnmu|OCJY0vW$
zdQ=xt86+Eb!QbM9tDRQ4!vx2G4Gry7u<f<b4d~(D;Ya8|fDoZ=U3S4Z&fnWhNV%&}
zZ?5Lese#d?Q)9_2JRdq<Jgvor3Y!bLejhXw6R4TrfuhCzs6256rKhf-;>-nT4jqQJ
zvl~^cB&rDbGOFFISQAR4lF&2^r`r#o&jW|uLM0R=yklr???f!sh|ZoK{`N+sdb<(r
z?Lu_07pLz(p?YGU_~@I6@H=A*3$Tq4hI6xU&rHKNzX1Q@9{6a{T!f$d_(gb5T|nUY
zDWr}ZLeJ6t7(TiO6UTOA>cl?0@wLyXUfy44T6w7Vq;q41dkT8;{MmbV(Yd^U#`zf}
zW~Y#xnL>1Qlx3h7?#2{6GB%GY&qh^icbQb1UL+d8&Rxrh^7@(0X4Shd&|6UePSk7k
zgb^)VW;SG*(X=gF39lU0i6)P2ql&Al(ZI#?C({VUl2z8})MD+jpp+z<C={#6!YyOV
zDl2PH#v8e^svgyRjh2;DHPurgZEUU9qB;sH>I~2r9kjP~3V3-`QvyKA5+v<v^+qQ&
zgizbSG-h|7U`5o4#e)~&h;~EcOd{Dmg-|oCtR+kcnF&TKDzt81&nC<rehec!k3t)4
zgek`RqjLh@)=7eI5}uYRii9cX15L=QBg6>WA}Sw^BZ3Cn<*W)VL0FFpgA+NG8mc%q
z)u0u{0>e5Jp(|tfs`$5w-`~qhvWxB~iGpe))~4mE_Bol6BH)s9mqF#bNnj_lQmE8q
z`rRffxs4KA2{oQC0bg!$xl(?rN1RGot!zl>rEgn-oeBjxUJpL@NQq|<h?xRJezvTU
zlTEcQ*GGUFlBO=&Tv2%jqe)$FGw&Ol(GE+E5!&2pXf~I?K-k&Vl)}EMn2N6up0xzu
z+B^i-<{+{*8_9K9Xj#7vT~vR)>j*x^I2GU|w=<0ShXCA3`0Xe34$<Zw=JxOwf{-vg
z&WhkT?fp?!jHj~-aFVwFL}n3=W|iPTP8oLRmSZWe0;?p2yPo!=l+Y`$*FbA`q2A_(
zlLgEiic_*DpbN&}CDgo~l<ut^XrEm~sH+b?syk7Ae!?r%-mN0c*Ts7V)P9iQ^Yb-c
zzD8c>CGh060Y;cWNOcV$+0n=7{Xiim`MyA73l*ibkMMhGr^6wFE|^3#LMu&(gd*Gz
z#tFGNlD;T|KqUAgRC|7z_@3~=7PYIH9-cUB<pgg&Umfb5<*4<r)`{0ao2rDX&%~Qr
zmYCF`!BqvVsTyvZ5l$IdF3C@|MtDswm}xWgtkEki9%RzSYR6|#GD@{|;u;E0-NCBa
z!`OQI8Z?jHg>Gag?Q<6lyHCP)@fJ!Bokq^&BFdUNP~-Nhv{|8$7aqc|rJZ&$5=Dx?
zwY{Yktu4)jU@MwB+Hn2uofRZelD+luchE65jySDvY<L*a@nM9f`w^b!-}4iQF3lmh
zb2~!&_aJ;|FJcFFqvPN%TKwIZII<J7RC{yB_Tyt;`GQh$E93rza-#SCpU{(OXs7u7
zZHo(t&rKsr;Kin7ostojfgV<djd08G6hA8!2UVHF2w%XBfuTN3%}lD97bY3EMexa(
zJW&8f+Mm+$N_8X3w6nEq)~Yr=>A97`$ucWL?AcaXOPO|U8*A+927?0K*~hLSoM_=q
zR;=JBDU+n#xtf5iYcLXcdejnd4i776!b4_UlvYb(TO%r|rs@rL-srs9^UKvbC8G37
zYc<FsKua~WU@S+S6H6G|brim~5lrkmOU1nmN7E2|9pi}g&q5z;L7~P;co9rm-XrcN
zv`+B+?mLYFy$i8{8I0~ejgG0q=wCd8!R52u5~8U-6zK?E+TUs$T}-kEmF6IF>nzBr
zGh<to9;HSX${1B<FB%*{<j5F1LQ%#MZptnpWNma_A=SoLZE(?|meJN`A&sDu@rx@;
zZAGm~yPRaAk%SDJ)5PYcVZHRy($Z$+h}D&0EXB0<RMiBZG;Im|wq^31|3*k#si-`u
zlmvVdvM`uDUQkG2(k9C_+OR1N7Fyhv)^^olB}3T+d~(gLW<pP6fxe^xn)C|j*O$Vw
zx(v2eC2%s_tBc`XQv@F^Z*WZ>)m|=A1Yhg=Oth~{N6-2+3~orr2vy!Fp*KORJGEg8
zW@&eqH*Ldi+T4A#yL&ceV#n5P*umephk)NttAA+oHby26^7+GC^Km?_2uHRSVfWSo
ztdfk!78g&rMvHnDOj)nOM$j0k?zFUJ8ebU3ND3|j$w5$A2(Ff?Ii#rcLWH1?01FaW
zk<LDXj+R&8)iX%=^{NQ+eSRuFZ*vFyqW<`vFkhSK9z;{$2pYSG(A+<UCPut#5Q&Zf
zzP<~#whowSf!zdMG)5&DZ6)j^ryebGw23N@j}wHkKmySK!5ECFp)ejo#T^eJkdijP
zAcDLZLtY2G4oQEbh0a`q26F|pwldb<g$jmRYZWY(de}``cxY+ew6rdxh0vqD;SFRr
z(v~u+9U+uZ{WeT3VnfRi3U-`9&CL&E!>RkIfALc&K63|I&6C&~=)s2m1=Ku#AKIs1
zgXa2ucn+RI9p7t|*(pA^N~aZ#$B;-R(MI?s6LGY*HlwM%1y4M6UjayX^!^WjhPOWY
zHWgqGnrMAnhK7+EVkI%fN@KnYtqX&M(-@k!Poib_EL!$0pkx0c2KFst;^2179NK~T
zBYUuT>;OLTr7tL`tqgKr>9+U)dO}!k<PlMNr|#V5XU`!<l^32GM|hknZ=fF@R(j4@
z91bcw8x@|*XNT2krt0%zY5M|Fw7v$DVI}R*L+A<cWH75N9V#6bYB2JuRUh=;s)gk@
zY*y3GWX|oXH5=42p)~~3X4;m`+wxQ|r_~ie|HvfTx(5&?1U;<q41`>R-VPlrBol$*
z45U<;2?&!jjEZ_Q>ZqpjN@~=Ab6+rl27`@issUMLl}Mvyt}r`MsI#MEW*6T4+8?5C
z$8nsz_Zpu5<kzW={}|7`^L5<)$Y*f+iI;HUsaJ9Q#xqzxbrXgNugTbU_`Aj!<7gh=
zjh69)h;;8@IbTMkX9?c6QP>i^Z&-#MscxhO7vX3eM4sM*GDjGiP%~;gaSDrOIH=}i
zOkg=JZFZRkX$4hk++H57ZWWKcj0&)rFqeFJ8+n6o$q+j$ZFtf#kwyrvfbUT<27%rd
z0&(SeJ|}jV3Q^ihMODif$HF4%z${VwnY64*aSQkeQ~{iXTK1J8(34qO0=*L2MHvPp
zZKw4OIt&es(8bW5QQcb|kA|?YZ17y0sthn>*FwLk0+uxuaIUT-^vdC8cnCr7>Js=@
z7b3JKACcABh_A^+Gc9l1dV+6b2KuS&h8QCow_=PTgPA7?!WlldK(H-r$;2$7H_vmj
zo!i}8GqIOY+{5R0Z_FeVv$20u4i0S2!`>}<SXEwKMWx3Z+~bAO;ey`bfj`ws7|_y6
z=3#%71-+ZLme3=-%!E#WP-~y$4Z*FSAQZSoyZRAnC(znx(>nWUcl$or3it$uQ5M!j
z-!PK>BWM~LM;AY%eRKwGjOMXfG>^`qX>b~@)?R1`Q!Q^WOQf0VjfyBvXtLn=o4erR
zF}M<~RCEj~KUq&hI!HXUy&m3>l6cfh<rh}gH-a$LnU^(>!zArYW;k7Dc&Jc}+Dce0
z4YaVdR8|6x>cqubN+n3JJE&B6!6?y9R#{ceffC+i_2W}0o|?z1WFJ-y?MD5rw@`ZP
zO&DH&7kNh?$HrtYa$ARyGqVRdM=zrE?h7cp`~<W+j-k4-2YT8ze=?=UD>gO~+{pw|
zZ7pc-=5K6C;<0--RpL;EBViHarH_6Tjnm_B^>m<dXo%;CmBv^*QWI_Hoa#p3>;U=}
zhS0k-!W(uRgS)0Mv3m})`<5_&U<Vcs?Z(oP{rKdUKEJ{?;hg$U{`#-blZ~jsQ@3xT
zb$%ML=`n=HhY+HQ^LJC(Q<XYHL0CPsFFcQK!cU;r-qnU#T49-LD8N%awzRfN<?)=#
z{B5yGGA?he3`SnFT7}AxLaX}-dXm^PEi+G9+M@DWmHTcLEsl(@^HA~o!b!B!A$VEg
zHMjR8-qelO?h*71&%#B3G<J@{6>f#j9)^=3F}qnIrn;#1oUk~g^M%)tHn_O99=TQ3
z3VKDf!cw8NP4B|t8_(kS6R%+W$YqQkdK?FDzKAD2_GSFuKl~7{f9ZGeg+KWl{K>!k
z7~lPupX0G-K8De~r_nmK3l3V~bN4@tS3mOyIC<kEIC|wZ9KHH7p7_vvc>WV##*J6r
z#=gg%z~YG;7~g-9*K;ow`*sZNKa1||N6^gAFvUC3J-Y`M9{<+D8l*D>c*QkTvNEiO
z+e#{0ITz9^yD43U^lTyQvQz>FY0KNNnf8_-d^8kAsXf^jrI$%N%X7PhYFehF$&eoD
z<t{2NQ{a<<+>ex>8rYqeug2@izSv!XUVVK%%6KiSsgz4(+#^+E9)X#NC6GuYpf~EF
zHRxavrOz^8H4=hUwB`a0Ok1mASxexpsfJ@!HC(G|sP=eWS62{xr37CQf~)foUX_K|
z>TO7^%|Oe;c0QT5Ci8U%XmMp0M;}#R-zGjrm=C9CVuIynmhck*F4O96SCNfnTHkp-
zHp%BD=BNNyRn^v@vbGk%XdDKc6Go?(g*vW!Y6W68LM7Haf)JHigvu<?GlX#OFyf2=
zf#;TWESftN+~R~>f(5dX&&f8)_eco*MA<bHbZw(kXl1lb&<^a{kHMuqtQ`qETEL#=
z{pelX2VegL^n)X?j!weD0_%x)A(-lcJJAj|0c_*<*%~`xSJ4hTkHL{@fiuyh5{jyY
zg=kqKRD&rhM46uElip~mHzUEP<_8;SVObjmsl2SBNVHliJqw&xn;J~3=VLlizB*P2
zv=%yd1Qx2$)XB539XSc(;j^eYcnPNKA4cu9*O7Vh7SeVd!`kK^WcQ7s`oL*q(E3z7
z@iOXeJco+Wov4a+!N}t?`+bPTY1w6(UNVkoYZHP^aa?|I`-3*OpUC<FgpgR#H@^HO
z7>B!IpB_eRk_x-O7m=}E#K*hQKHiJ|$pH*b4`66+5JQW@7~MXO>0Ps!-@Aml{Zx7f
zcVqGJUVQS4pOuePBwa}@Ab&Ar^N4@k$PAAY*RP;uW(?7>egucQ5a?~Cs%NDV4Z`82
z%5z&`_gG=3f_6|z&n?c;uJby$T}t6SYRj|P9MJ0@&a>UH0c&Yx*REcz^5021C{cQ%
z{3OeYSfzEeE~}{aath1UI5tsvX*oqIsx>k`(n8B>L?eMWJh_CUXRcuPp|jX^=o|(n
zcOud{gm}j&D-f1F0?<#8#9R7Uw*5*G))R1bvbHjnUIC$3X|SM(Ac=I3Qi1hh=lMGr
zJ8%&l3&+qra~Mt2hp_+Z^LXX+zlZ1F{RW=><X7;aFZ>Q({p|1I+=Dj|?w>>J^d2Ne
zmT>K*ckt<N{~bR6d;g;1v%mWf_})LikH7v8;BS5e{Mo<%0>AS&|B281!QbQFr@oGx
zZ+{Wbe(u|N>fNtn-?gWaq-vZX1Z9D%>`H@Dd%3i_g_T;Bw70lY5`2hd7x?LvO_h{c
z(u*oRwW8=G`?Iw3tqd|=8N@Bq%Vg$7CRLtfu*o3!q%$ZzL)rpmEMtKJpR9RPR9vdy
zCxgCa?5^b3Q`1pPN>N!^sqT%+sv6iRj%0<u0%;%QYxP<k2I&MGE>U?#3MrEUpT)#7
zpdtK<bugvXz`U*!c7o2ns+MZ69<J4O{}p_+yJB@CYeex0_|nj_b}QP~Zb8@D&FE#>
zkdP^C9jqwZ`Fp$hI|nz34Nj*LOvfZup3Lc;-nb1j+@B)ohBl|6pXzsb!zMznstV;5
zm8`Wrv=3gGoj$5P9~!J;a}%s}l2mg&2zOCoP?0r`Orv#Tjuv)Efi6lgK7yS9PT5sL
zQlh1AR9V;*U)wS;hOV(0^i0m9V`3KFbIYtH521f~FAL2QYrbW4Za;v5J;%|wv=7du
z9f<5ZM0GO;XR?QY9U#~UJE}%AVQX*dfrDXf63b01+|&k3(@M&CN1_=%!q6RWhKtY^
z)oK>IP9<xkl{eV!aCm&E)#_<|WGz|;%%bcFJjwNDlCfn@J7MR7Mb8VScR@!xQs<yb
zXd6IbdmqfpJ5fEm9d&!oBXH|=Xf8cK$*~J4*tr+AhmRwjHFo~Ni>SK(JnGNhMS06O
z8vImA$tY|xzAfZOV{;=Cw6ii~=i2?d3O159O4ObDgL3ab84uq3G<+xb!f<dArs+}m
zW~LEZT0mrO1}#&Q=$V?p00A{TH;NHj-Ld6KOz)h-++Nz-eLFC7a2Mtd@4?5v_-R6o
zANVVkX-}z1^-qAOq^0^xf%eFyb4ZQ$BG}&sZ&#AQi^7|bLFo>ql*QUwoIF={Bh^+M
z3p*BR9sMiwFllK8cw%d%Cr_3El`(Y!ymf0=RGwH{mDXJ)wcJSCyOKe8gJeqHoUzgw
zLBPqtT>;*ftO7a_A8NE_1wE@f42wI8k?EZnn%ark-N&)x@CCFE&g1CCySVfGn>ha1
zU33i2(&k1K^mHZ{>NS$pTZc*+sZiU1+~Nu}&>DLP$C1S&7+gGtq3x$x5g$P7<Y9Ep
zoxs7%FJSJ-HT2FOMWTNZT{DN!Idc^Kd|lW4VT=(}-p&b(vMgMD@JU>I{_{9|`9nB*
z{dL;mw+O>e;idP!h0px{U*ns9`5*YqAN~X0{r*4U+yC}+eB&Q~geN}vHJo|&qu6)#
zDd@uOsCC6rP;Wy~odsnL7BrZoosyQ96~Wdl8JCv_omG@yJ<p8{N+#S0HBo%hgZoJ3
zDWxYJ8NUI&>|BD3Tgkd2(36l(p+}0ZxTIWdrDt3eoMcLFXwa&P%jI&bL<1U`tz@(+
zwU<q+oW}3S$;?CtE4@I_OW5gozE_}UX65IgHMf@=Vcu2;%X&g@Z4JR!Pn)Y@G*J1~
z!nLLf?ltA`uBFvoTR^LugXro^B-W&(Y0XxZb-9Dk>t^%_{HXi{f}LwOp^NITi*~n@
zk9AWO_ODOF0AD{sSdQ^;=?od8ixA)qu;N`+Sy_SFx;p-VdicXJ*4_c=EVKgLnmv-b
zDG5s;24{>mv$<O>$0!i=H+87ah`6-R(b@{|6x;~9wt;bU3{9$N9UMnf|0r6ACeTe-
z_DnA*YuvYe4|;a&MaSHBw9W27iU3K@?Lyby6X-i}0p8vF(RBO_ytKPELeJkis8&L=
z(>~i6_NHDH7TOyVEw8?%LxqvBGbP(#p!L<qTL{HY7zjT@q7hYYFX|W#1bl-G9(MVl
zBM@usUR0VLRC57n<#8Jofm|@h%JPhMS|f)w2%RmA5=R83p%ijFUKGcZ$c}ZOq<0qi
z!*j?RpM_@6UX;u)VDrEPoR{uEE31?(AB840pyqqJV?nAVH*J~^L0Jog5IleVGJgEa
z6-%o?B=C`m6aTrQ`d<9jw-CC19@fjJP_r-x%Z}Y}9y|`mo<s0$-;KoLGH<>)3@uD!
zbZH9X+ov$Ki}sdEZ+7ppQhRfUb`g5-5_RuCgx<sNl7GLU^yHra@!*+bND_F#4g#+w
z0$<7xU&IBEmulE4Dc@OHSahsB?C2lv$H>G8Jbs>ADrTEhA{MJ^(3YE}ytEvf2sQ;e
znRZ6-iPDp;C_RDRN(L2LT!L@o)=Xp&bUKThH+8X^qLp1#si0TKN=RpsG-i_3IEqMP
z7Y?3$9FO090mm=hQ$4S@pZ^%1|LEuO^oQQX(et<Ard@RSlRRD@G*o-F^#;}3TT!h+
zjX+T6K)h`j3;WLF>XRSExBu*aVrKtkbWI<{?7{1p+<S@cWFIV%e)wB>U6Uj5woW6^
zIZrUst?W334ua6%F$Z_k41(>;aHjZt`vMY!J20^0B&LsD#oX!Jxcd4RaQ~g};46Rk
zL;Nw---}=SE~d`jLF@8y^zS{3mg)V-*ZGiHX+mzb38f7-x~~w7cAx6Rk*qEmIR$Xi
ziIr5<V<WAq<mr?4IvKhmkdvNV>AhW<S@CdM85N<l<H?jhnb{(lR>bZK03{yn%aEbc
zvMN*)s0D?^&>MISWyq1$j+6{DldML(e=4eK)!1H9gPS*QLK<y#h?QQ76}Z_>H)S`g
z1f!z(9YzNn)fQNC8erZ+mAA1Dj<p2d8XZFm=h_wUNn4+Q&$FhOR<{6wRoMuw%AnOv
zN0Jb0UbjV!<!e`vOIPZzb@e8+@$nWay(YSZ7Q(Nc&-HH3L?5^Pgx&z3m!9B3g0F{_
z?5gteQq-xeDGj8z8PP-vcK#@tPv`apSfk5;-$u9yK3}vMk(M6TinNSWT+#M!0<VvD
zc16jx4HI;u)99q4>!M}t6x&MZH8MH~yjDKmI!?8>xD&m*_Mvyr0kqHWL~?uqjk7z^
zec&WIkDf(h|54bf1RM7sgR^@~*^W@#5S;N27Ao4?RL2MF;2Pot91FgNAaqfM+Nlf;
zRE4IVQ5d_2p-px|M^M*N{gqJdm3jlH2}V)v38TstL8T{-N^c4ko)pU63AIdQwMRN|
z0;rX?DT2>t7t7{Q1+U&7M2*!CEkUWVyV2kcp_bsR=4V%jqNt!{EpKc@VMjZ(^YgIn
z+zsu(D0Bp+(;ri#*8^dhO%jBxD{<!W^Tg1v@Uvh1oC<Hna4H(D$UfDR`s7!?4bSDr
zQG4zz%61+=1MQsg%6(K_xQ>d`=U_fYb-ZgYdgtabJWt^5oWaQQIA(UwVAsB7EbW}f
z;{NUU*k?aUt`TVuzej!72mc<)y%i03^yCpVwlyNw97iB330vK2=7rs1;Q27X!kXOf
z62%(8;`SvZn^<mSl{>3dS)fNzTT`Rv!){!^Vr|#Z%F3Ku>AX<0Fj(`hq<epulAJa~
z+Vye^%BkwCs;5#-&M&Ef!xvRf#9)>7IhUFa8HlDZI<p;TF5jc2KY_y+?qO!{d0Zj9
z4qtqLThDzA?SqSG>Y6}H*ErowM0I#Hm^`Q^?DX~!oS{}sEFHnQ8_(nK{y)G+KJy(U
zdKNIWcoGNBKZWs~=iy8Y6L^!b#RlmlhT(0V!tT@eapu;Cv2f%T`WH^a8XZBja~XP1
z7aHu%Fb6vLSU0RmmfxWz#0kc(LsxL=t*_zJKlpbXd+y^n#N+JRdl5~GC(z0`aN}i`
zX~MO03Pu?iZI58ziHoWSS~?*zXeC#YkIDi@29@r9OIkJoCq1m$s*O~#%%rCgdRsQD
zp(dNRY@r2~VJkVvDv(~+0)AEj|Caf<#cE%s>1E{=p@3j@QSg{74zzW4V{l|tB^J=J
z0+aPkB(Xr9M#eZ+puRyvMOc6>{2s|xT~Sen&K|lkw;g($8CvPZCr7NjcN`iUY-I*$
zww6P?t_+sdyq>EFJE}bw!RJ~d+Xi^n)>G}(5PVd9gr9eH9(+`M{xxX`tl5On+6{=T
z+lUw;CK09Di%{)_X?a5uvd`DWxS!zjDFU*Q`^_pgp>gd-tg5M|(yOgOO>M2JnN{+D
za2O7ULnZ!{%qd>4AK_>cp=cB0R8{e|UL-ntk?QV8bKel!21n68JVD@1p<|3eut~HN
zaw)=4qK%eTLR4Pw@@@ieKiZaeDd_dk)^_hYjOHB&5#7z$e+038M-W=rO)E=VOX&HU
z`ruA<Qt|b{mS~5wr4JS=KW(%P_U=(whNckTeFXO51-Qm(jcJojV@uF>juL<qFbz+m
zwrdcDw7Ug?IBE&->Ocx*1YVi18Rda?l=)jx>Tg83FGd*B{(1;Cw-XxP%myk%je{1~
z=7ZTAgVE-J-pq?@cf#m)!%F36;cN6x0ihoiVZyO72~Be|k1@W|F~r*07l^>kbL@?T
zaQ5;A0`Hgj>HE@kAkg_0{;wZW-U*H5Au>qT&UohAe+2*SJ19JP5;;6qC5O+W>c-P3
zz4J86pLha>E0>WvbQHZy+aIdE2`UvTy*)G7zC4Y6dk8)%zE6JPU4A5?rq;$#U-$5z
z?5sdfl%T|sV~1$JlZaD^20~tyOh9H`SZ&$DM-%`6Y*9%>K~y>d&%m3)s36zW(u}3;
z%kcVrYM6>72Q~8MkTgRw&asj=M+Vi#I!PTxrKc<|fhTiqCCPvQPo*rP+S?=<R9G9!
z`WV@SoeVw6E~>`1f=U!uX<1P;t~9?`?9dvFsIIF=1ud#5KTkvky>}~%y64m_Tz>FT
z96Wy?$1XjEr2`kyH@*vPeXNXWcY`wK%|i#{Z&J}YvWSCcZ{b_t`v?5apZyDgM>QSl
z#>CFEnA~*^-BVQ5%@Z&Nx)ErdLaLwVnLxbu%qK8^=mxr{k7ID@92M~_8V7cv&f3V&
zrQ=|^awd8(y89Fcs3L=+3Qym|^4$+(`Nk{Qefu@coPQFr8J6|&{b*h|iTShl5$s<^
zji(jmX1Wk7E679#Dro6N;pOD5*i%Ukx_)yGO3Lf0+HzH|cg8m9Y@mwUx>XG>76mOs
zSJrRZtU5im$t(($_@Gz;FSn?Iz$;X}<Yt!b@|p&8^b9D+XJ$n&O+cOioe9gUB;b9t
zppz`>FN^%k;$O02Vwwz**|d==F(2_%LQN6X+bq!VwE{g0p=Z;v>{J=hkX;GQ=5m;6
zXYH#5eDx|;tS-SvyX#(4Pw3SUdX?OkF!C6ggkhSRv+E_;0)$zRw$;zy?RyyVxP+|D
z;a#(lHhCk0j4)v*V2nPDRrPgMtc@#C#UEiX=v4yH#>N!<J`db3yV?c<l&zGu(L|FP
zGSJxBi<aI&+Sp-Q*im$i(z=dKqI-M_U4&lO*tCLQ+sFhO`$t$1Xm6$Sf{%BKwWZRV
zI&m5U`wwAw?_o?H=Kj88h|%V{rx)Q{+Jz*+XQoXy(84-f24E-f98_)Iwmw)JJ84}z
zpo@3G*gODD%OHU^2Iu@f*d}+Q@$e<smye;SYZ5j63vlvA@a#E>`pF&0?;JsG-y|BE
zdr%i?BJ7eVBm64DEvSmLqB7Ko3aZ-bPz0KI9Q9EuzhD3rJ|AiVVbq1<sBrqB55<%k
ztn>Py^#@tVL$G;*aJWNiyqJw(HF(1?P&Hb75!Lf3^OfDvC|r>+E?&J%%t-bVfs6o$
z8thjO7yfz3AlaAI+aLViAHaU=D$)+^L*|}+C_jA>4fmge_W75gf9@HWE<c9wo;~PX
zn8(oaEXH?CV|wolrngUEaee~Zmu4}uV;*mR?j3#?Kl~B!B-Fo^o?MBQyve`7fx`z7
zOUBf+KTp67m)E8SA)Blkh7M*cl^&rtJv)P;kzoZqgJ~sao}RFiIhc}yOd7!@F@TK4
zlUS=(Fk4UH(cV5(dXJ#DnHG2RCdvMhfy|r|7+AsNmDZ^#S=Cg1epb5D^djpn*U*`i
zRkG}I-$dK1w+9eNb+hbkC+HsFL!bIBs=T)`M%&moxfd;cvn;dI=o?!`$G|Lx=k`$*
zv*I0^$FAd-@z&?QjX(Hr|HgB7QsuA<w^K3iK8J~2XA$n0A?zjzJ{gd|gry@_apBIZ
zxOn$NIQ-Z%7~ORN^G9#Oks8DDv0Li(p5y>*;ZC#+FXHI6XE1U23I+~cMc>ieXxn!c
zNkX&#;AKpoy@#HCk74N0H4GlOf)n@O#Qdo{2y{!{ydIc?EvPoQRjR10yi!Oa&`h<W
z8~1sRq(@ig-pY_3X$xJuaqCL&tb}w_@U@wFC9uf4Ftr90RyH7$D%!!aDfT|u)Pe5)
zVdzX&H4LSX_p_63$Y8OtQm0zCQ~a=uOGpYSEg2yp9iS4WWjwAdNin}g4Q7^QJ)2rv
zRGYZ=A@t0wq^yLVt&VE3uol{k3YgYcz$W0M;<K+7Yb%`^1Rm9&hY<9xu7h_~HTTQ-
zSRuE$j7&y4V++AX$PsKF34+bNX2S>JTD^WHR;`0)^*Z?0tVfWKON1D!>T4@dQ%x&D
z#iy4v??xSxi6~m=&0P+&THqxS4<nUIVr+Z@6EkyY>Fhy!{~$UChtV}Og6@%V^o$Ak
zR^TTPY#$z{<sCs&-!QkMw7wGxf&#<N>3O6krjVSNM(gwfI_GzwV{tc{7xy5wa~~S_
z9fo^q9(7cembL-7+6G}I<ZMYsQzxv=(zZ89Aoj!DF^1AeCyXOYu+Hp*Yx{8o_FsVS
z;3Z^rOrv^YAKV8YhwJd;sF>N0qOqNP?i}i<7;CBe$_O!8n5Qz@h-&^_6HXFxw7gVg
zweb{+gAo+dl2$je@FiRLo)%OF6Q~TwQB4I`MN3;p0LlPvBVlO^CSmo*VGFIag-Y1N
z2{>4L+Ja%;2qA(mg5UkaKV0FC8b9(&{QT#V*Z5cX@vkHuJi$kmCt^?1NnQQ=SE0FZ
z0tKfJq2}yqG(2_*`X}zg_WTR5KYbtOvnSwRm_ys_l$tL)wv%dac}ziXFIC?1_IcH|
z_a>nyczyq2&+rGKHY?a$F}J_M-b4Elic^_}{i=7?=Hd-NYb#l3Oja%7W=1R#R|{{;
zGM+{$;19v~XsC*;XDdB}k`79GXH{p#hRtelvZQ$bFZPzu+q#(wg*Gi?TdqpbDbOpd
z&>+9O0VZb<PCw7J&aCF|R`Ih-E2>r&ocBtF6G61O7fDuX3;Ryv<}+_#=h4e(>7Pgc
z<Q@!6?WN6Kpi3FS===fnOzgygvp2Et><zr|u`l4azxUs9?#@fJt;1^0Z2$ZbY(IXT
zmGv%on`rSmrsy_CFt+_972qi~4K30!fkYq6^3V=U?0*dLp`EG|WMs!FD!|A1*|%`^
z)(d#zL!ZIp&wmPs@4SJ|y;l*LIf?KD-OZl!m_BnC`)<C1Lw8=sGw=QmPCa-F@!{Pt
z#rjcd458i;L#^3Q=#?YCxSGI|nGnU;D81VGwEnrJ*hu?dOq;ur@GGonKup@+S(%p;
zGOIUb!{$q1|FMgFtP%AlFWP#?RI@<6#-M~uB$k3V!0*!Pso-i*+n}YQv#^qrbYWC_
zbS@4$7|Eg{*H+TQ*EG~qX|pn_sZl-iqAW#W%HORW?Yt&#R!{;xn`#fV=q<2mXn9$Q
z)n}H&NVR9%Pzjs#=B}-VLv>~lcm$u1R@X=P`4sr77-a-q5#g85kc0&pgxwYeJC{K2
zVK@mo$HTA_c=ETC&~vlAc?mv&pTw%#%2L$VRzllQ1B0%C{#%cr&xxVlb|k_cv^PhQ
zi2Bjg7{~GB$I#Q)i_YFYboTe7qkjOM1B0{-LkfPs4nnSf2+fqr&AkK4{z|k^^+_~y
zKSp3i84)VRKxaQZ9kj3AL$LP_!_qqhJ^yZK?0~(UH73=bEhRm+-Bft3u(tL>Pvxgg
zbU{m{*U&x!9}BW(cnQY&gJ_sJ02d2kX2&dQ7mveDyW>B94~9dRP_^q6j8uJPO<kyp
zHKHs==rJl7RfL-;xq5=DKGKYOD$7D{i&LGb>l#K$tR2O^Mray)kuTtDY)5sf9rZlU
z`cxaL<ISjRYD0O9>QO!~$<Lu`*093R$7vZNw6@Wh8XolffAB~6=})D%LWr|MAF;VV
z=k`YrI~sn-hb3R$l`p=By2Cr6zjP9=%NJoha~d_LPQh~Z3Ov^zhv(QqB$q@%Q*~{h
z#^B;8CU;C?W_c0|OEZ|^<HNh=@%rcAQ9xDy<ST`MA3*Ps!Cg_9dygDOARdJ~=!4a5
zQ$tgXgq_Y*tJI#&VZrSDEGDO?6zCqUePNK^Z&3+G3yMoClroW_DAF@4jps5xYz^&(
zN)7)IdeVa{Q_?chWS(aZ?OOqg%j(qZisCAr8jvdk(hW8bs_J!Wo_J+-Ey}p9<Be)}
z`{D9Q2A@WF!c9o9qLg0SBrC)rR)llAPgB($RM9=Y69cpRF}g$>eEK?0TzeXKUil<G
z{mnnc+>uLY9p8;`_birAN(aU{#0j-f$0U-ytn7!DsQymF&oZZV$7wx-s4$0MPYqy<
z_H}sAS%iA$p$)X7#*rXAsnn?Uu0Q{Av`-(!zALX{^vFE~hY!KqzXL9+xxLq3z}>gL
zhEM(O-{S3W{S~H<-Gn|$XVW->Dq8{#t|Ut8?T{=mo43*47M8E{;!^3Q7geIx>P7p=
zEL`z6lr-2dvHcKUdHeGin%xVnBMOr@g^{@fXr<CM^0PvzJ}m7yjyB$N0an;0WtFN!
zqK=>|DXUPkW{fh_$mxPx>@9)k;nqi26CfC+2UliY$n@3f`g#JgK@C>Uq{>TAOH*xp
zjm=F6(#^=CK?Z5twK>$#6{~^3q`IujD~CS445rPcRD7kdtSg6gZ6(21LqLhjYk;3t
zS3<z&qZ$<W$zW#p>Qbt>e8OxioK$%d4#MwI*a<%y!%E=U2s{Y~p(h~#bP4q8tIA-|
z)WT-c!(!0FZ8O13h1b)Xz{2Dpn&Y&#Q6EMIyK!LuZgh0Cp`*JC?Gn9Ie2=29kJ0}@
zwD$BN+1^Df+=G^$el&LVA<^E=g4lx?Biz*kZ)-a|%`!!;9qwisb0<SjI#h40u|;}i
zWlUZVjIlNaJa>CPVb=<aJVyvxTKZsYA422KBWT=v61J(`aP2+;&w=x_vinfbvj8g<
zpX=le_|H9oz=iv;pSgu<TAQLoJ1U}tQ#^t4WE|zZ!K)%s)Kisd2)e3B8;WV4YeyGR
zI<kN*&3(x097PQ+QYr0pY0n5s3Fv~xE|gJWl<_zV3Bc0!9u)C+6*sq_j55EXtr?ZA
zji_x+LC>wNDa8UB#~*z6`$P*t^K<D<<@OiSx+nH_Wfe1l&`*B(OVy@#<r8nBbhH~a
z1e#<29@zKpMHQ{AZqIHwc}@Jw^Hj_e7@VEJ(Be4y=Y}x7a|W|JXmw}CFuQvR!w0tG
zLtp(Y{x9_{{HG|#|H<~zpA2QM8T*bMLxAVS74pI2w!&c7v*vDq)>xyU7vSSZj~`W8
zutnvGtu?WLOJ-!f$-<jLr=}`NhLjCkwqaA6Of7q;Qux?9Dn6MtAOn`g7G={S$skx+
zphm`$$+C>~1_w$Ayeh4Y0H(^O%9DfzDmezNtt2Bg@%wyK;}KS10jhLa1YX+yqAV{x
zBl8#)8#}ij!wZL~(vD+t_bCh!c=LxYVus~pC&72_-YfX-KmQmX`rLOgb?`AvvvNFn
z`z3S|n9ZZx5$&Er)8IU2_MAfB^d2?&zf9{;$wf`!RzwJt?xmv$4=y1xxX5xkh6-B*
zhEN-<(JsUZ!f5X@`j^k6ef}JR1N#u@+m2xW5{}a%fAs5riWlGeJ^cPZ{v69^?!gov
zge5TuJEPVXS2^_xtBlAksY78oL8G@L+SZThJx6io#*?^o|0T4tj5fI9h;>gQ-ZKrI
zhb~6vL+9uYTKhTdI`TLs`MYa$US5YvY-7ceFP$2hS!%Gc?8{bq-X)}MUy=lSJdzQ8
zB}uSsorIyN_*#t?)eRcet||kYC9k2h-O6>8@u#Bp%+l1sN>HtJYyfURk-s5G`94X`
zt0}LACc6~+j1uTJ7elw9nD8rC1D9<waG8*E%lN%j4@2xPt*&Er6(PoBTwMt3s;#iC
z+Q|K1hxJjc*#rx>7RlI5_$dGqgaSkX<0?a44V)$e{4TqimEf~m5b-$B6!v1r%qaTW
z;^=KlVEg<eCdUTR)7_5N_7>I@?JLpU!RS=c+SNttyAqA<9S9{;2&bATw_8<4l|Vw$
zP_-b)2(`(a+!lDbA877S1DxGdOO9k4oV2hGDlr*+Y)kUw(NfC3gRnF5HD<z5W^EX#
z`f6j%@QuwOFtG&p<T4yf`(c>f4F_v=GZkOx_+^-PpM-AVFl>9zAV^D7H?<3;@lIOW
zW<rlPfJ&?)MP)`cR>zyCHqn8q_F)tc%&iPNdHfFY51)bg>OFWbQ!ySn58d&zC|TTv
zwDv(1_fMmWH&{LaT+&0k-8qPg)*e*0bfBcA8HH_)$nQ*{peu!v_9SYWV)(}Q{zO6M
z*YKgz`{|DrlGF$~dF=gPDDXY~)<@yxxwi~c4G#_R_fEn$JCDHpB4TqgICc^P<D;0E
z9>+MLH@-{&P7Pv)R(G1#wr^ny{YQ4;*{^(BrJfhttiD)TYdL_G!w?t>@TB8-A631N
z_SWWi!RRzWFH`s|^;CLwQ~^fx4fJ4fd6A0Fpj6%?g=c21t}&QVT2-sEaBSF``EV*(
zJ{3v@3d(8~{3HdE%p8z8ozf<xfR{y;r>001Ag{Oz8kvM&Z-&tkq~c7dxwvJtU$Pc+
zsVrac(1zI@E;!vj1j8}a;TECdZtE5YGKFYMKROA7`Tb`xx#u*-cb~!TGq<Vau3^vl
zJD8=qyZ6!0;=BL)Q@ry#Kfv`@-@%m^KaNA!o*|?NvYsiVSPr^bW=59}p|N+8cGHJ?
zYXH_zGxR}z7eTmm_BIw6uFgpU@Ca;;bSmB?fjJ3{M=~mRp?~od7561{Eu6;K{>R~J
zo5bv~>v-j}zlR4@dw=pTzr=wnFTl|}4hzBOObx>n=zu5Dhj250&(Iu(r*^PHx`@3e
zE@5b4AKF+c4=x^L9HMv_qTQau;*rbf7~ex+CeTU<yCPlK&EGk(aFA*_g5q+G8n#kc
zTtszON@x`+TPjH+r1MFZevs@%vdn`3OfC5!9U4@5E{7}+;pR1_+SBON&=ncmC)X~0
zYdW&FWg!}lVtRU-(DW)REQ^B68X5vY)dZr|L04W41IvS!@Y7}xfLqI{{HkExSOeSo
zT7rvPhJ7ss89`{F1vag%f?-VsjBATvp~bbW+6?>ZEh-#qwi0Sv;aHajJHxR){b8iR
z$#88*SK;26fmM<>&dY;|$jW678-gsrE#Ux~f*!Qd)*jqGgNgnQ?3kax<VZi-s8-rL
z2t5_8XzyyL>SJ_upoMgAZf{3ZTbs(3;^9ei2Sc<R5jgxoI4F;Ow0;uqSOVT;6MPI0
zLE~W!t)`T*Mt2cp?gRlyC^-l`CqZaSw7|^5Y$5clw3QBm&&bz0Sh)4A-Kgj5W&PNO
zfe940^g!D)i3S!p&yFK7&F!J}U4(X&P~3SGc};z&Oi($|u9hcQBc@Wcs!3Ya6hEhx
zpVNzira_cV?M2<WTd25vAC~9efZ_hj*tX*s^k;9N`S}mSeC9HWX(MdMXfe;<MAeSN
zD4*YriirhOcF~@+^`eq+FXy?*?`=g+Z?oFg^tR)xfBIdes^s6V2sgD@kElG=lPiA{
zNNP=5;2-|wU-8ZFeHXv;gCF4c{*3YGe}!-T;IHw`AN&=5_s{<VfAXh)hQIj!_weW6
z|08_&Fa8LB{NMfr-}}q&<4=C@eZ2FPFJNfbBHH&a;oL`_S9V#3!Kls?S!wMd^!V2Y
zDo>!doA%Zn3Bv55Ri(-k=;^Huw7448sj*|%GE!8nD+<pHtxiw)Ia$cf3VM<XN_q$-
zWqDe5ky@Eca>YqzU`d*}O?nZj4rGp}j3rx1Po9A^THvjOWDc#HSr(HwIF)6!x<kCl
z3<`FVu~*>NpfkYkbi+&QCCe5xN##n*76`?VYGEbZJ%m)}5JFVxjRUjTeflQ$QSI+O
z{{*&k|In4EapCEY;J1G8ulS>X`6(_w_ZCJET)^#*d=`5y-A4~W(lxse9g{mS&T_MO
z@H7nWsG9gM$wup}A#_s>j_x~)(@(vD$6op*8mIT6bLlW512d>6^t^<wJ=Tc|Q;_9&
z5G`Z-;BJ|MyLAFV+S`Hcr|{CJzl|HOzK6g232^w@OK6$c4~@SQ#!xqQ9KVk1Prrft
zFMk{dPhG?K{C+fd^Sw>|bPST_s~@Hy)n?ZeT&Y1cQf;2N_B@VXeg@&TaTM1(u_>zp
zMO2Dqvcx4Jlh1QoKy@Y=RRnrv<>jcYt%Y8%r?T@X*a_%lT3MW~L@n^e`&hu|mre~r
z&q#$`r)4E1iw;SG*PMJ_#|-51y2_Naef#z+U>c++pU@MlYm^0|>{4F1uVsZ*QwMEX
zH8cg4(C1dcltu7n)WWj07UoSgurREfSN4sYYoOc2if|J_xv2z(&3Q0y$yQ<6ngeTE
zE^O&}urh3{U@hsnFssOcC4<jz<MWw$a8Ti{a<hi^Qhr8#K7^c3L|jfZ27E|(>}U?U
zG1k?B!#n4(fBPKzI$O|~jG~jUYi(;n3!}BYnNqd|EmVJvElp}JX;6A71AaLDemGbd
zZC<|`&LPnAMhUxz;gE&6Su=Yg3An>?)%zJF_+0)dydf5taEdox3Qhvi&TtTV4#Llw
zY^C(4<&|DsT1qX$NZ@H16;z2;R9SV&Zq@cy)7sC!2cV&HGjxxmPGH1>SRL;{O}rab
ze0?P?b`9-leN!v+jXiwd5S1Pa?c8w~&fSOM)+>m-{yD@y^jQ?{x`5g}k0Ey9DHwL1
zLFvpP7*1YC_=PuMd+J44?>z<E<JVC_l~LO@jCvkRZ7T~s!B^VbiIUzfRI{S^%Aft|
zL&e2^LL7md0O(=IhJv7iAs>_Hg-sF<_kSiYk+(^Yhp57z@{alC&++qL{e<w6|9*+T
z|JQ%U)Xwd^d1ujca3?xxac_S3Wd%KDa)0?VWvk_%@bwY&WZ|KmhYrFX@WA9E#LNw}
zx03Cbw!~12j;>bBFH!}&XmwbiMd`^b41u0TZ$wp{MrA6NG)|)KvI;6x@1qPqDXP?>
ztVX8>!^(6i>DiT#8%EZ^Q0;y!tT{zhsxjUhZB^q#^;W9Bng-R-UQsPmC+n&5tO|N=
zv0%I*n_Ku=R<6=km246hGKd%*LF>RYBAsJ6b>n$F_T(El|KKAy^TexII(-B0{lQ=1
z`~Ut+yz~2iiJ|@HaQ2xuaOtIwWAEb+FtGDDI;MA{lj`csji=DuHvx^!uQHzK9Whvg
z?dV@Pg0X!U@%T&c;`)a_kM^ZwNKWiRxPJzvCNGL~c9a=hDAl`BYv;W)xfgyaJPCVp
z2#q7V@yy#_!-F@!jL-eS->d0uy-Rc^LAsi74@S11P;2P649vq#tL#toAeic-IOsuR
z_cX$-BWR?$>z&@OI$HeAqu77;4o+Qt2C<GYwbsVkty1w6BO|L&_54a(Ykoliii#xp
zYB_3ZWY$H48m1<&OD2<QUU+kJGny%0Qi&wu(HJ6PaS1)KxN`1mHF}h@aueuD@>aQi
zlIO6A?(NFsSJ2zttJI^E5sgu1X;@Hi(!nH4f(rO#+#gk+p4U}R+iobTgE7Ajrrdg1
za~srPW(iYHJ@h%X(B;%Xn_B}-UKJV$?wZ0<MhWUl%Fw{y)KFGI>r8;MjA&Uo8f*Dk
zb^Oe_T9_K@VbN+})-+(1?;-Fal%i1<o&>jXf-mYYBgtBRxGja{@j)z24x=p<MLU&V
zir{N*PO@fAsu9?U#yH}%hmm*`{)nhMft{DIlOzUC1w1Rk=M08<BZc8&p>zgU!sg@a
z2|TAi3@7)4p*RABp3h4lO0t7sOs#9{kk=57t_W3PlwhQ_brEtFg3pxdggHq7(w0_J
zsnw=rI7<igw6M|{QJ0Vb%H7bVdQhM0rhV-~eY_V9iC)w;^6zH;O)IKxks&bs(6x@D
zx^tFx_%S%IzJP`+&p~_hWdvUM6zWgkL-6v8a2>pclA+zm8l{Rne;1Bd-h}3bS73PI
zB^YnsLGk!J>iZ|5?-+!ky$|(Wy{PW#Lq%UdVc3nY{`sG)*%s=bfJ{M;+ehG&oKgZl
zd0fD!LSFaq_5AbwpAu8*ad{p0f3?ysDCypR@OOVhYrBm8nQ2TN*oWrDDRi*%>YSUz
z?dP82tEA;nx=mKv59K3&M3O!N-}2twDwB$dR<^-d13j&+!J?tHHDP>m6x}`Dko9XH
z_1X&bq`|t5mPdM5W%fW;{>qA6a>H%TEJ8+J1qv%xpeJL^)D)?Wo2l?*=0JuTn7kz|
z3z>Y6v|UM}fFQ4*Gmt>NVI=`+ISaGQ{j8~H0p|C)JhCuVP_;qDd2`BE)?!JrBI_BN
zphcGiw|&Z<_Rk;0sar4OtKa)aeBlrO27mNVKf)hV;r-6v{|64<d;z;IKZO@Q{~a8^
z{~C5&yo;HmmlX86rgy5gGm|%l21yHVW%&-a!4>UA<IoQ5I{$#s`!ue;_BkwHe1O#C
z9>j+iP~(iC%;-ce)oo6d9>p3v+Q+t|d3YIt_6e%Mam4$Vaqj*{@Zvk)#Eb8K3t#>I
zzhYwlB{cXtP-#m->tp2}?SUuRuX6pRsOU!K4kFnxfg!5Ork)v$EgZ(o?$f-kZ^7(u
zMKx7rrQU<g;yP^Nd6#uLBpsI|9hFQs<rOl=me#hU7&0SE*02^^D=QXCnlpi28{JxK
zYpa4@Q;Gm=Y@*;vsnrf88Alz@zf9#5fC}_v2>`X!CcmSpsRd8oe+qLmb1=(zO;LN|
z2<Rj=76Wv4Gu;E#pOssakqXlQeZ3BbYAuWv8kou&petaRBd`?+OX)CZ$xT%{SZZjO
z>kTmK#a+mvT{^W4?Qe!Oxrn-yF@<`Q0a}xu6}BE)Grz}b!YV)M9&)p`WC8YBt%RNj
zO#v^Od@e++2K1)FIJ$EI$M$YVXEOm83#p#mcsz_K_d}6@nn&v8dt81G>|Qsmh}<j=
znEAO{s|_}350LC9NttXPgk3=|##+1)CU<~pj;c=v?J_+4JLnA~z^%^{fX5f)=Z2I$
z7BG6$>te8YqOb({d65=a;%zE*k15^`eX<<}LeCg!Q|Y_43{9jJI&LLoFtd&@G!lBo
zL?86DK04_QrWF-9nL37{7yGdHH2R);9q!xD!%b^zefA@;J@-*KpZW+47w(~A@i+=5
z_M+y<<8VCr8f>q>joKGph4#6ZP<7#P6iqIoj%v@)JpxnbAhcZrRG9;)>X9*pJ^0FB
z{2$O$6K+@jsbMTD5B@sj`G;+Pig@ymo1gyhhuq3*eu1C8{}Wo%U*daz`<F`L4bII{
zB}n#~VYEz-qj_Q&9iv0&9`48OCvOvOKT{0-(a(OaU?=-Oq2!ZKn_r$&^J`6Z4H^uU
zw5>HTm>Lj{_%XLIi(ojoGCW0qC*YG|DzrS6v@H3>Wolt<S@~;gW`SDEVcnJ-g0Glr
zPu8la{eM7DhN^7Yx{XRNN39K8!UCRZA5_pY*!{G&CR9-o2=J<FYFDhS%cG#D7M^1H
zP#Lu&+z&+%N%Cgx>W4qsrjl@qJsns$h*v)S4gC54{DoRo?(BmP;V{+T^0~Vh*n1W?
zKJq!7AvktW<w*j7$s-rgv#=khZaz&lHBN;WWVsGd`Fdb*%9O%>+RAzCx$q1gy!}ml
z>|1}0+i!ds9aMkXP%BlQQ?0aCqOqtbs52wfK7^*B1qHoO=L|w!bC^AH6>og`k8$Ct
zH}R+c_ABgt>?zfzS8NEQ#6Sy86}Mj&8b5j&J)$C;hgqQ=N9W)I&R%&6dyid4TmKwm
z7Ft^bdF52n`IS`HrL1tMzz8BqK_$sWWdLzybp<Pra$3`JRM!$N1fHAsswlj!uC5QF
zv$GQ&9UbUsZ>QL4Q;N^)rusKX)|^U|l$NXH1d{Pu;3rU(dAyySJ$U)WSFmT-9#~iz
z3Q+A7Qlj*<RwIwmga!!$NHU4b99^k2%`)9hXM|P5<6$M|(p%v)P|Xu?Ryqiagu$p@
zXOiS043nDQEwg`QCXiLF$Yv*$SLT09&oLi!*kPs{(NkS&3BgqX9!!{^i}`(MWI-Ni
zYrzn0Yr<_s%4<V^Qv^F_hB4OHj)Cr0%3BA)WM?6hwiFi&of8hX12%@m#SeGdsqXCj
zVU(~uks2O=o4^b4qy<}PLuqjxv7{=DVJa*Km76!&tWwCksr+0{S``<qwiITnLuspV
zu?BKVTY|^OlkOu7gS<##es6@3jlt%RvlfoR=!-$`k3q|Q0iwyDgwdaZIn=B|R;x0_
zXpL!M?a5x)X%nopx|ViE#{jH^lZ}uvvw$ZKoJ8-fr{H<)I%3bfg4C;TA@IzH5PAB;
z(44pm)A7gQxpWtS`!B<E_c=5?coCYXUxM!5GblcM5``1<s2-Yxwr>>rE<%r1tDaHS
zJHqV%zWRf|P^sHLkXgUx7~%8lF>@<Jm-*;J+q*J+c;zv9?aGw0UlL#Mt7HY=`|p2^
z*+cuNP$tp4xPbQQDYQ*ZVsQH+l7l_y7#ZNrAqiZYaO<hNw6bD_SAh5EFy5z4o|~Vb
zUDd$p*1_S@!tT&h*=f<!(}j_dVKoR<fG34YvZ|=Lwk&{>oKoFnnR(clmW>TtWTBc|
ztY>V^DyG$y#dTJOvZx91n^#uQPp37Kj34RS@*pdERW(@9*f9h*uUVbJp*k^SWxg5$
zPg2br&AcgCDLPnI#Lfl?Jc%ePR$rJFt-VVnrVX$Hbn=>ajxS-?sjK+*5B>#z`M-XJ
zfBYF#qJW;|qlDC9?4=#N_WFA``~1hS``R;Dx^M@h2hU>Sz!@BR><+3-Zdh2ZYP7WT
zEO%mW9kid(o&}YZ;MBdhaR2S!#?0|sFeL^N?VE)u*bEJ=aE---a=jh-wX|~qDyfkb
z=miPAK*tO^XAk3JU;jQ%-Te@L@9%zy8?Ss?&FRh81Za_?RFli-p{m=?3ij}YC$K<3
zpP&*QnAkzEkHg^Lap_!C(Nx(4tK<;OEv|yBhPEvyAKS8X7};t;|H9%z1-vR++REx`
z)N82T?RHjX5e2yZ{(kiJ^`W=7_k*oGo{~kp=mbqvZFQ1my|`4ZKqoo)atXd7T1%NC
zCz-KF$0qUmhdzSE`6bn3D{4>5iOx*mJyd#+pr<ADMDZD^%v5<|S+a_2Aly~}XQZmr
z!)TyAHs}aC?o-K053(e+5a`K3cnL{iCh_QR^_Wt7M%AGricfDbVwG9n03Tr$B)poU
zAx!bcXbQQJ^4ZWFu;awud0aTQ2i+|Zj1F}okqGb$b-W2I3@Zyi32T=WLoEE<8c0+f
zL(3Cs_WPly#dWfV2&myHtuU~Fx>Bu#U@xMQFtoKB-qub;I=kUyp>-2-exBSQ0T<+Z
zCB3>N-E>nydOaQlX_X@cW7NsQZgIiKaNDRZEe_tKZrJ&_iGQnwja)%BXjR&V)L`LI
z3bt4a+_c1QT4R~V>uT+TlhAYVSX_jYr)L=Mz7g0N{<$TzoID5Hj(u?KI|kp;3$W}w
z3CrFyh+MskHY&TumtKeY`aSrceiiW--hkuEJ(Mi%Mak4Ws>UZ#H#~}l!6DSk+P3|}
zs2UjK{t&+M{{?#T=c5h<Y4;No`sf+4uCj2Cfa+(zqPpYu$NXDTQho2A{~jZIchX*s
zp?&)jEiLUVEpFq~I9et~(K9=N&WS;u>qf+Tnh_)TF5I}PzC%9mr_w&CzUO@`E{wtD
zXh1w_Lrb%tD!`9Gz=he_88p)BNQy3%H;=HBaa&RVHB?`chFmg!$XXWbH)UYMW=VY{
zi7N$o6}%a%kxknxOF2r~qZQ!EIt_fC3{1`{_@EP`scS@KQmJY%E9lkMYt=k)Q4VU|
zWgjb0LQj<4BiPC6d^Wm+a8o<2-!Ocsc6d{rXd9Wwp^LZisjvSb{*e;<+yCufv2^Mh
znx}SSic0CJPkkNFz4vY0_~_@caN#a?UcQg9L+5bd@;$_QCltK2HXkI>fee&3N#aj;
zia^?qmdS(INwv4{@=NI3eh$s!`!GSf8SNZ{&J~5$5mL~U<s~bv0kjcTfsQG7T2|7_
zCx>?`dwcWM&)|)({0ZLv#-E{k;g}k>;%pkBy*_~RcVEG!dmlm{;TLXYrQ>ge)tf+B
ztw|+Gl_`6Y9$(Vro9#a2P-*An6{%@!c@HxrXXoUq)KM~Myqbz!Le{hh1Vd<{DCr*<
z#PG-{23TH)hK3aM+NtuSGebH!qzz7{tX5JXmJ}ADn4g=Ok;%`^qq;9a2De32fOAXR
z@$pZ*g8`Ovo6N>w*^p}{c2h^_7|0F-c_AUw)s)SZ#|cGAKr2vt6tcLKgpt-%ranrT
z`Fi;*NmMGsq|`$4+*&1npR_RxK=~fEFC8b6_<&*N_e!jCIgAMU?4)V~)s+pSeQofV
z>foWB@tJFI|LSSnzI*~raW_4^mp(rVoxYZZU9#f}!TCW9k*+#^iI!iiv61cso{hDK
zohQtZOu<dySy?bGRCJ!!E_j;>yXGFaBvC+H7Xn?~Fb8F5LI7UrNp=%xw8<VSOSh;?
zual2SM~e$7rwi>47n(@*Mn;@J(8F55L-`t}UH9@vbaE@B$UVF%JYL=$w8fGU+Ai&a
z@g{g1Tj6W&fRE7iwfDl`IiPX{278AP9vDJ=bPThnPht7;WrU_@;GSTuJii<6sa^0c
zABJ7#24A{@=&gHjUc8Q`CtgA$t&)}Mp?Ycuswd}AJ3;7;QrQg&@CH!RKZuIKG1T@G
zdVlrb6sQzq9<fG({~ke4U?dh;_E*~aq!sR$e0-%fPC)g)fB7>tmhVsg$3LKV*Ag0M
zC(*Tc7yL84nRai7pSCtRKaIAzNkj)&X^m3ljdvr`*MeY2iYh3Ki#KoJr~G*NUiJT9
z{sc>N!w9&wXm9plu#dK<J%R4d7R=EuyIJw9SO;Ebo@+@VB_ZSHq^1`YCpp`;(5j|w
z%SV>1w4Yy&+(N26f=?RMH*d|r`VE_u(vyku0zDbTEb2tkD20+8Xp!U*_8=tXdv$}J
zsze8^WN~2`lWXgh<_^D1lMAaQNZo{=)f=QD3(+~WAi+w^#|kdhJBE(21=Shx+B;vt
zKm8bZ^Gm;jvAw6UNbp^H@nd-JkN*y@e)$h^`q?*e^!{r&^2E#7f9)wOp1cfwpz#Cf
znQ3q9sM^fFI1Ij&%7olJz88bLE}(7p1iI%>A>6%)E}01w@1fI)z(_zEJP9?Rxxx}a
z^Vkl)W)8L_;l=P%@rla&2rcflSKh@p{@cG}`<c5i#rt_*%plgcfVd2f9$w}>)rBgf
zB)kZsN@qob*{M=+$x^L4lS~t{s62PFNWTmvk(BVV`dnuAN)Ek(qH<OqdIrxWZKEuf
z-__lViOFfI`$f#m&a0VM0zPTe3j_kN@j46e8XD@=bk(xb5)|hWaD<#dFEcBf*H@Oe
z%td8`20QldSI}!rwWvYK7M2YI)tXu}MIa}DBRj;_t_1f5cm$ikZ6ySBD-is3$lvlh
z!cQRfkUi2HEDKRd$g-nGqmElW%P7^MNe7)l1FccRbEHANz7DHw)_NAA26&uCG{${s
zj(OnJR}p$unCxrB3r}3a?u8+kbfr{Udh`u6Q+Y`PHa}Q;VB~;#;5vquwAJ#5iKP{Q
zS))<bWYWvpL`6kw8&AQ}B4g}iRkbdaDa9;<lBx16EzQsde6TZI9v5L{XISC#vVeOm
zYI#8~YlbjCzmY#~!sEaw?beu;$HZ++Q;CSa8cjT|m{E(6lQo#zhA`n7bbH_l(k4fE
zgT$hY82qUe0?o~|+#M>bbC{M>W`{&s+a-nv&^|MYOV2(*dqWE`HG(89Vsvr_iFrbA
zViC^Cc{t~Gz_t4@baQ**+;IvH*3PEUU9c03`mq`4#>Szi71Rw4Ktt%&^0UiF2tC%O
zU;Vqk{<YHkO>g;dD-Q|mB;;X5xc67O9Ujfr{mCzXp}_h5fB$E6Y@b8Z;xvzM1|BNC
z_`yAJEKMV@n^tOZ0^x}PB&G@WiFUXKQ}FjU!$;`(nv*;iNj!ez27dGt(tvgszxoN5
zXZw)|8PU_`$6$Ai@C##+K<(-5Qj5OG)FJ_%S_FnxLvpMY)7oayPRY$B1B_+7od7R~
z%1*X2pLSbzu^N&hsg6|Mw}(nk8o?#kn`BHjIs6DF+R@lKte_{^R%&@8t5wxl#_TKu
zc2;`+ptRw!B9k$C@kZDQL1`lnHg&8lKhRC3*F>A!Gm0eD+Kp#l#}~fyJ^c6o_!XXh
z^Rqa7<w-nv>+|@`@Bbyf`M3WIci;XJ4%~i8DZSlSpTg-UUqkEEPE^}NsI}5TxWlwa
zPBna{!A|hgIe1%#R7RA>@%^kYx1(|N033-?s^~db{p|#B6P-^KCSMa8TnUsKgDhur
zi1seR5+8yw(hqBD1Vg*e<BNatPdM}7Bl!AX{2P_yJ-BIs8%K9y?(k)d?l_5_seKB1
z^|mldseZL|G<v!ly^Su%MfXEdQQx3NMP)S#S^2D}yL^?>y`ZQ<Eqhr}tyAf>Y)&e7
zihv%fvE}W%uxIZ9?AW;nQ`55q-Y}9WDrZ045>=E`G}75oT`kLOmeAVL&eAd$$}*nm
z8LZ$ku{DeED=1JMm&Z<@#k~j5Ak5D<=}ok&gcHSyC^2akk<JCf|DohW!6`MzP-;&>
zk9>J(Q`HJ<EN7B}-eMKF6WRhYI@P)cNtjYe<yt}5%MhF*s!{&t{A^@prxW^FC@9XQ
z8s#yU<zZF5rkp;#R@LlB4=kc3<wrDVLo#B=+0*-Q<;ochjZ%fl7%g)x!Z9ZrTf)%u
z3l)8NaLTLm0JQ=?X)ts;DgE6r@}vdZI^axES+#e<5KF>ID{H38lAL%}sw`7WJAA$U
ztkq=hDxv515_E33+;+8EzRPW)&5~5^2KedcBYGX0v<>Lf)u6Ag1mnhPjPbbIG!^Kg
zLY*QYC%iVa@;%K|h)q<8QBjFu+U5jrl2lyHGzg{=2sWk=ZfQeIW{Pz7p^3-QJk*C4
zs-c+!Gr0fyJ&Y_(plfm%U811o7gWO2*1271UZ6#poJDA9H|@XYLmcWMvZqub#c
zU4(UX8s@P{n5h_z!y_;ZjY2y-j=J$_=qIM|jlcgpHFQM%Bf%aDIW=_d5%>glva5FF
zais`X+VW&qRNx=~^WUL!*F1dV14!<eM`Upp{^bSQ@MV~2b!~@s!MHpP_tGq!1i5Xd
zm&er(Ykw<TJ#DZOdWNtJv<=|G)yw$#`*Ku2#`4?%+L8_obcWE=8bV{#i$nW%BNPb1
zViw@pR92o9YimJObqyqs+e+ffbk#m5A@Ix2E#{3OFr+fbUdi06CeBM<H?hGRwy4Ks
zB{RLnrP3%l{Bbp_!srO7Rm(I~Fw#-tWZAOWc+)!E@P~LEV=2DJ2^aS*-Y`KEL+9uW
zS_UVP>Ki9~hS4%KgP!SKxc}NG@wGqxYkc<~{}V^9J&QB<KZGxS_rK#Gd6K{Sga3hx
zFMb?Lgw39-&*Bi_H@yEG26rBZ&fkPuTTrR1o?$94IusL+&)d>Vi#h;z%P^WocEQ(1
z%i287@7sxbYXapOFDvm@)R}092|Zc6Hm{cFF5F4WyZ~!_5QY$cW3(4>+T`E<%YP^I
zK8$yN@2^y*=D|JZ5aR0=j^Dt-kt+&(_GmZnEvoMEdDQWDODEvy>;e+3yz2=VSxo)W
z)G}GPzp$8UkHE_>Ch%&_uu^qKn|hFJ=|)%YFcy|~<LL1-ICSJB<`;LMyLSLFiW6Dc
zOe!?#S=DGXYFJB|3}h}WKw&{X3JJYjUhj-_UhlLtY)nr>PDu&8kti<TxP|=(kI=CN
zVAL65<LB6=f}~oM_Cw*uiZU}2WZWv?>6IeWXzQrx>rh!;soEY(O65BxjlC>hB#R(P
zZa`TOek0Pewjet%gHX@nvE}m^O9|(4Mj5o^L0w%b>~<a6I^q}{?m#QI@stm%j5afC
zPBSi@+KbbN77(M-OSDKsa18s;9LDk6HxVBmReAFQR7n<x7E?0=@CRHZut5>mDBaI?
z%2X#6fY$}1%K^L3N4pV*pC?dOH}f~Q(c-pI>G6gm0QKEe41Kh=U7c`IbvbB<?F6NC
zh)6yzr;ndc(Ag+eBQ`6Vc+whKi?!6%qgzkgrKv)PrVOLzY784H(M#Ho^R>%_@pis%
zl!{>3??Z3MhlHOtIp~Ei5mTTGQrQF(2_)Lu)zr8+)n*?HKwnQEx(C~E@XQkKy?P!O
z?jFL_@)Y_fCosG;kHLjS4Dsgap?z&%T0(4o3eovl#1<D3oSB1nd<MRWc`B_1I40&`
z8Ji_wW>gr53A@25=y}sPMrQHNAN=jF+tLU!+5aK`5|}BfDP_m_4FG;XJ3c0_|K7j;
z3$5!2f^)QSgtKRH3Vy0R=k_IN_Z@_mHE#8Zqo_W52$ehcqJGD2)XYvmGd)a;-HnEx
zE?$QYG&CmA5b?tl_TcEbBmCqaVehU<bToO<OW<{+0%}IiC~dkd<tU--Eh{-GyjrbB
zjrrQVc{5h8UX3+t)?(vE$%xF@$O0;veK)76{VlY$S&|oy5X_RH6xl@tU>=IgYE@?A
zT8#x|)mkby+Ed<`W-1bs-NSO{hSS4JkqXajW%=?1VfRKbFh0j}mO@0DOX6*;yr{hB
zA{uFpn|jC5JGmXrgR|Iq@+z)9_ZD9H%(rm)#ZTej&DZduul*T*_n+R!m;d75@u}ba
zYux<s=W+D*t2q7O4Q#*g0DU{ps4;kh%O|MnI-vKYU=OvZEX%ei!PPU3sl8|5h<8IW
zHup{KhRNFqgF6YGGe#%{;Y;?QcVfF*zOh_uSLxr4w7Q*RJ6XxM!I|vkXSX5PIfiFH
z@g=<X2Y-pzzxqdb_YeObOXr`0t8)Qf#(_)EVeZgnLVTK5I}UfO9eNibA4u>!+R@rS
z4re%pvU&rGs~S*TS%(~2S;@YVS6Yh(i&q`DjNMChPpF>pNxGIpC(h%<*-O~7{}}oQ
z$JI)MF1Mcw%L*-lRae(QODn)DM{zOLQ(+PEXk{gZl}tU80p@ErZp6ln43yQ@BFTH|
z+@;HCYwv=S*Uaz`deVz4&=YqkLP>0?bS%{ABt4ikeMrjtO>_fkgthcAm(!-#(S}zO
zh7G*_e6LLlvqJ-mQ-j9#DB?{4*j;*-!+J!*4kTmJoTcS=RUzOLuvYMQlp`E5qq*6Q
zRLTLLPlHvW&RS?Kubw}QJ<B5qMBNCtQ4O`l5FPABgkIm#GmLoO7$PmLv>KI^#dfr|
zrC8JJU~|!@JL#)El%4@MtUfmZMvEE<z~u|T;g!yWATN3d=1>&YL?f(iZP0YIz|h$W
zXLFO9vEgyif;w232~I0N)5y=1w9_^!yMSJgl+J)wjSiiZ+TD5$n(HdiW2nXmp*PS_
zh9ONYb~+t6PTRZRYQnfqk0I77Jv=6vQWuVf5K6F?OQzI1vyJVoXyr{GZ)-)*;0W4V
zdoe;awzy{+Z@l*$e*bU3fUD1+!Q_qwj4$uR5TPf-KgMVGVshyadKQ<^wlIqpN!dO(
zizYtTG{+j3wQhj$^AdW_iCI_~X8vuY^3x4X5_(f`FuwCQ|A_x2Uc{bCo19D!`*+ez
z9uuIcp?3^XbMlxx|GybBa9HAx{_&p?o0&v_3f(z704FW1gTS*>k=l0?Zih~w`ut^7
zT)K$L#~wrFsmD-#;3TSe+%>f7^^=pR?C(cyZ!fA_notuDp-eI&hn?7eWCyk{jndkN
zG1Qf$(u-j)&qFL4Ru)q$Lk3jiu~?6l#_gP(Y^+}|%Q>!Ai~mRxf%WS*s5DYCEJY;<
zWhgr<<GHd5X>;?bLaI<xZ$`b&j*<$EYWI_D#*t(z3v~y}f}fS3U!^;eOdw7-)srN-
zb$C@$(U9EmESmzn#_mxB2)xeWSu_)P&3%&u-z1`)qd0!`X@cWP9Ju@}uD$j-JoDZk
z;G^IETfF+!KgFe&K7*roK7<Eve+^H(`DL7a>McwkyMggTSJ1!XEc%vCVw3<(_RkZ1
zjR>|5Bh)bgf7>`LVd)+2f;&nG*~4@yji}bU3B3rkjtIrVI1Zn`g&LC=vV5b}8%J}`
z1e*E?>qI}yRDzZ;VLdQ|#S@R?D}VYAs#E3luYDhHe)F$!?1?wgwRi^Mo+V80e+-R0
z23M>T7JrKHlpf?}s?G$Dp$)#qF4USmsJD1fD(nAN)+4{79#wjWdc8Z*LqPYy5~RCn
z9mLG?0h~C04JXcB!Tj<*I=F5sXdhjijW$`Y7EiCPu0@5+buOzQ^h#A5;iFCwY0Hyk
zD7P}!Nrn~Pn}ZKCF-f8V8CD|e(m1FZ9}QiRJ^-uBuExz36Kvb4(z1)PQ9*TPbsKqq
z2GQQtjJ|<xBvMgcb0^}BVKlZy$tF81LmMK5SMNwORlZDntVSScLRSZ0lOR7Vl~k8S
zwAL~Vr<UJYM2J@-7}OAqIwayItkT;}n3|i!#q-C|+uKa*7eKg^wy%elw5tW)epv}?
z9EpJ;v<(fwLc1xeU=5A*BOqyqsG{steP*8n)*x$SpNHV{s-&8BkEl1lD!4|5!9%$l
z2-8kQp^ZjiVA%a3xSSriX_s8mT`f}AN=a?j(zfV%(g;>fJ^Zw^N!r^c%HU?!4!ybt
z+FBikl+C5$)6`&@igPbd=&VkI8J^rBhaH`Sa8r<KFC0WX97c0AiMGaOUf>j(yPME9
z(27p}mhRCx3@_}&sk;~Ph3|a<U;V4EV)xa<7}~oRQ-@Drbk8nK?%ji#Jx4GyzZVlr
zdojMa8>90C;><F72sz0d-8ei!Wjuk1%#R(Pgp(@KJTXQ~JA#I>VbqTgLq9fxqfbAJ
zJ8!*>+i$;x+n;(1H{X2=S3mg?+<xa{xb^Y3@YK7X!Gq6y9(TX^72N*%*Ky+;U&HmU
zei64m{~qpq_A`j@+yUF<D11BSV3_EKbAE!>bpo#C?WmhyM){$WC_eibDz08Z)wQdr
zyL1C}N6$h>wWpn#Ma9?zN(QL-hDK1+*@@bwBx-p5%1w2&lv?z3%X#t>coB>ZbYS};
z6{>?3oRzh<LzLc1kC{weDy0RHv_g{lU7#n~dmjOB{rZg`08f<P%DB7?WK-eg6L3|v
z2GmjA3HVCN8&FVE!`k11RBJb4GTl&yC;7wh%Iuwo&~v!FtWZOUrrM|mC8@=LQhI$8
zi%8MVw)Rg@oiC|D$xXczD(m*uXWqc6+b`qt^Pj-Iw+Oe7|2D3?{zdG+{xVhFlQ?|y
zWn6jr9b9<kEnIu`GdTD3$FOwvK1TLl!o-2gm_2+IgG<M$?r3ir1B*v!PkYchx{S6F
zD*JE?^aNd{)`@DP7bOi&)L6m<OgrspBRq*-#M;I%zHkWPW{Qi(F*qUv@FquK^tY>d
zvx~>C;LG3p2YmGF-^ZOdzlOKI`B(V-cmECd-})`pSrYG^g)h|$SG0|)tpnaf2bE<U
ze%e)gv=t4`u*$k3!?$XU4%N|7ZEz64Vc4ip?S!)>LSPcqjosszCiM0mJ&)zR$Iv%2
zgFvJSI-?zR4F)wPvZ|Ukv$9%^$tx`_|Dc^v)SX%&TrEGj73()|p}kF~DtDrroRGoc
zYJC*RtYW77Q~M^>A)@0wTvSnlEjineQ(j0&Ygl1;k?KgIy{`qG{cY&zX+^3z27kzn
zL}LW~!(B);2Vu0;!W}Rm+3KNa@xbM;hs#?>73ccDn>x?vwy!hIJ5FX2cgvQoLW#Y1
z5(Ehl9Rx^#=#{7>*n96)5-EzRR`1Pfwk3B<mL*%3Y*|(u+i~m^Co_{wW_EMt?CjZd
z_QQVJZ_hsW&(+L)xCbB#LE?Y!^ZuS&-WQfS83k`MiVN05r^$oWQbE^R!akcu0nUa-
zOZULXZS|FStU@d2Ki!3)@gZ7KE%Z(bp2rHE-ws3A2TOActj(>c4FzEKI#H=rNt=+J
zUI+YqzPwt-3!rAilB0;0R7P<T`}0cJIm;<>@@jF?LaeH3rAg-e%4+1Y)8<hS3wXb@
ztdbVC3>5|ZQl$MjdC~-BR$(^s#bOCXuZSMMf~c#e5NgZvpf8}6<!2=*aKWP0#f&Zr
zaS!RRm$cW$!PqXBAwp`jD~Ppfm9z)ZV$j1$(sWrY2t~c<8*jttLKi#<A8d(kG)^z#
z@Qv5;>A!x1_x^An-TRi%v*#cN_8vj&+zi@gCeb=Mhxp(W8v7;?>l{I-wIAWuK15o3
z;g7e%ODh{_>3}5?LrpXaEd@?Vp{s~D(B=kE7!UF`gi@bK9uP)ZgAb(<KMErr<Tf}_
z6!Vh^{m2bBATtt1YEvAkZOuq+XKRijgTk2;4WXbZhQdT#64A1b77|_~iuqW6a~sMg
z=8!S7h@>ssk+ypeQdU-wxqTNh`M7Lg1f@b0V+o`*wjeDM$3_zTW}^Y=S|u_>@oq&i
zY!o|>!wA3Agx-!uI7#+m#Xg7REpntud-K@D@uinudWqupcX;fv)jGC9@Kzgb4@FLh
z-OohQP&Tm<Sxl}cZAynI(k+@;=n(nyGO~;L`}w=e719why}6!*r;%1?RScA(d_e^%
zZ&moJp)=M|^jxUam|*k8NZ`YWwDrRoY9{H$V0M$>!|e!ljN;U_kFoR2+gLvRF1DX}
zA1fC=!t{|hFtBnOom&oKnu0j6{TO<;9hMdocD{N8gDYn+`N|b+KlL73CifySwt`U4
zJT%TGLO~1iNNK{BD(ZaF>TIq|%RX(C=I;s=7K%_Ek_$<H20w%4BB7y=);kQnEkWUm
za@;gh3~4ug-H7!qNT)?U`u4AQ<Fnu6;wL{~+o|{Q>F@s+h4A0`xpUCiBa*o0$p{Zs
zqQF!gf0sCe>p&sxbGg2rLaXE$(_$09%d`R+@+(AwTQiE)78I(iD6bK9ryF4lw!#x>
zhqEDp8k-+F5_6Th1_i|xQvHzB^lWJb_>tgk5G5jCS|<sfsJ$VkGDQ0st?gq!c^vCD
zZ6=8sC3hz>--tP1Y4L%ACz{BlomY^D<n$Cgzy3vJam<!d)YMi33=UBd&;YZO6zr~p
z-D^c{eGRlm6>p93_-)V{<m?Oiu-H{_2gEVeT>eag)8(PIt`vIyEGtVwWkoWK`h2<)
zal(YKBnZz~1f!`K24ewCW|E%CWkmnXAYAbnDn%VuzYAKQ9ohyrGy*ZBVMduK?2Qem
zGLzCOC{h$Gg|-^O#vru(qsw)qJ259jqLPceacT{%u8zdVrqD^t{Dm@+-IiikPV|>4
zkk5`?SW-!4W+ccX;pNe@i|h-ev}~~=!<I`y6B`msXlKhQgmOlO3i|h|qD<K3C2*A&
z!%9&MRFt5rx*R=~Wo$|ekPO<&N)aiebr)M)v>I4Vdf4h}V7Js!?agTKiQw9e)42W1
z*Rk)?J~S`w#?0k+aPHgB@X^2h9*6FJh`!?|u=4(=*ms?l=j<8uuPmWuVJkX!>_N-o
zE;P>XfS=^&XzPWqtp~xDPWYlt@X;m*D0;p~1Wr+)n*vrF4#LQ$2?wAeF>Aw7R0c@S
zK`)9UKIDbm$fD@wMg7QNFiDAru(2_Wb)>nCiAJQbWyGR<K8gai()Lzl@Y)o0bRsJn
zLvdr27PlQGy+g<wnnK<@1#e+HvQ~B@W$PkRXBUt)JPyUcFl23A6w@}OwRT}+AdI!N
z`<qQhBr8Y<rTI`BtKlZ?2RwCXVsPl`q-B;;#1|lkL8(C4S`wdF$Vg2~mliNY$CrwD
zKmJoGPoAj2CAOrjCJPmXf1Z@KrHHCqo0G*+vTSTh&cynSsnRO1R1+hceV|N12T==e
zAS#Jk6`g>fJhc!#TBb@0-s-V42C;f^zBT}h(=XXvoyEh!6M!$?i53Q>I&WOcjC}Cb
zcX0ZxJ6Jk?6$?kMV*1eQSfcQ?FC0SO(osz8Ig81?XVJ5;AH7=+U~v0!3f((6@!prX
zdjC&&`{AE4b>IrdcArPzwxdXlufP)TM{T4V#d?Rd_b!$8HH-E(wMd1W3KKGAW>S?O
zvYIdqo-P>Nop3je(}KpKwj^M5^pf^E;BT76!0ZuB?l=u=bP%zj?U*2aUZw56_W7R>
z9bUoxAO9UkuY8C)KLcMrNvv2dZJ&{8Y9Xi4>Fhy{O*fMAC|I<&n@G;jtWU$bw0z{0
zt0CiSiZ%68wwE%a4+aW$4Q;o&&MPHP$mV;*zDO}gnp;@L>yp8i&cKu+iJhp7ww8pq
zhIaN@(w(6CCn<O$7vkeT{h3r-S5y<!Q1nFh6;WSDke*ls5i75P@N($X)+cR1N@l8*
zxu=>Wr*@baKupjwxa#bB7@bC5KRq-iLJFUA5WZYK8!Qw=g{l}DeJSiNHR+F{D%%VL
zh0N(v(y|sorAUQNoe7gMpQM+CD*mjf&Y)Fh0CiQuYA=T`poF<DACLK(Jm{YvhTLwa
z$c3TyxuNzup=$6#7xtqj=!QAqfHfkpSjh;(FHxmODF;X$|0sV`2ql^-UR)wA#idxM
z=PjF>jf7WJrDe2HOBX4J9a@x@%w@|il1uYAxw$3C&nsplMdaqu+T<cr2p`2xP=&Bh
zr6r_5Qbrj`ucR~w6{I}{$xlNu)D{=PK!NnqmPJc*5iTo7m)?j@3UF8<LoG?qNXj#j
zV$HP_KH6FjBgy<?KkohNO+5U=yNvqF=sa);Ti(Bo*Z=8zy#Kc!v6Z%S^3t0)bniQC
zyLuCIr_W&i)NyRRbP0Q}-^9cP4xG2IWAgL`M5Y!H=w*lP=|`-!9Z_1x7)37{j!65R
z+!Qx^umN>!H4P2W@mAXqMy216@}Lhg+Uv3qiBC{pLjd_~>7>D=SOeCxCGmbLg;VVJ
zE8uMr#V&)dEoe?4FVTvU)($AT+K>|oB8&I)2F8#zG>$YjL3b$>yOe=G6b^8p_4cAP
zo<LG4hRr0s^<F>LTdYVk*I+YAIZqVs_RwZHC_HQ(%@G7>Hwp`~Q6^(>B3+9!lp;r(
z*rp;jyNazRk3A+t?{N+SksMSM?_L!=QRPg`;fbQq8<SF`wr>3<(w<n6+ko{OlBK<4
zNt?xa+9DK{R<SeJBSLC6lJqnN6K#%O+MXf=Pc#*6uEA(xGfPFQ%(TLK3ZBmDg@FR#
zj&{HiYQ@~nBUm|l5j&1u!J#WRF}CZBL~nyT&tZJu1=8I`ES;bb9=eD?*Cgq!8K#C#
z()><}+Y#E}&+zK)-{9o+uhBMt1R)Aga9{~dlY20<ass>0zYC4K2`WcKnwQJ1GNIhy
zMy0s{-qs1y{vh0~w8Vja=-l0KHcr4E8-vl^1G8rcU1NtavwRV~QwI@fn}#{q1HG>k
zo{kyJAASSJ-@1>X<x^;z*oEeyEmBIWTrq}(=#?s<G}=%fXp&AIWeLhunkcwhyqKDY
z=aVwAHYHaQ!BVZ2P(qtsP4UxvC^RHY+T>DI9nuP`NQC0ps8&iTFE)X{w4UVsvaqu2
zq!Sm<J-3<-M+lzCe<+FGFP@O<82^H|;{2<Upd*!0rQqc=80Hs<&A}p%Ua8~;(lXMd
zMGFB-_67zgZ#~rRT2#`u%FMK?wMwY#H4F%9C~FijTJ;FVd<fD3S?vaxY89|K)o{Ai
z(CQ0F`<WCx4g5YGwCZdqE0YN{CGfecV678WmrKE`L~DnQu1*K1M~0U68r0kK@K}?m
zh~L|UBAX4$Xd^1UE+|QR6}&Zaz&Ya{_!Az~HCR!itC9-bl&g%8sg+O=KYdMM=o}W5
ziAiq$<2lu$>5y5aK^9v!zsPLjb~Z6Plfsw5PMyY!mPXM_Eg)Ht^fDM-GKxz0b0IH6
z5i%JaviWmfQ7LIcAac31tGkSZQAWCxl@&orAyg8XwF+K*5{RiJ1Ge&P)RX%98vJOo
z*1(~wB<d<rYcr#k16Xg?!fvZYQ==Cr&o1M^cW>d=R~ImTa0FYgUBRhuzr)S{`7gNm
z`>!#7^&0lvxQnYl{5KrB`6Z4}=+1un3C@1;00-{e#n8D+*z(r<7(4wc!qfBciN%hN
zF2q_|kZ5k9=tU*rYasQx8zQh$WX#b74Dl9dB26T{7}ONR>M#QhY0wZ1!Q>0U=n8W1
zg-}83D+DelDq4{CmyJQFFzQ2bJb;RpxU~PTfS*y$&&y(<Drt!!x2=`J*N)6Y0_haD
zjHXUDu~9k*Wp4sH+T?t%7aKi3Y@ir#usg7!#(*S+9Gi;sP+ncZpkPD5U5_|BXLnm8
z3>qZ^PXR4$A;ifWkp^6dURru4?am8Qy$d0Dt7+hWA+6L($pfBxmd~$N(O<2nnoO}<
zN8(#Up<G?X71WoUl0iE~vd^a#q8l)pZHV!2Hr3ftBl0bh^28=uIVoOb^A)0})fqVe
z{AeUyixg5SlY{P}7InTxxEs4ADzZm9(KE9PW6Q^I;pVTfNP3$+`W9x6T*HoY?_=Tc
zWh6!yQ5Wun-rEFQyhpn4>zF|I!U1eK{w8)^yoLQ&KjZt}M%%(M#K!ls?U5EbTG{Fw
zhfqQx$*!!0%0_yxZGgXhQc4>i9oUZO;0huGJJ2}13nR;C__v%TWE{iJW7o0g_=jj7
z*on#Qr_nU9NYU$pxq*Th?nAJ53ug9TM#t1X$@<27rcq_EA&robnq7!Yj+<iAS%F-K
zVv1dMg%0VZBw>>iMJkhY)^~%jxW!e35{^%bUY^p5d<|`^B}B{ZM4`z3OhU~SG+13D
zIkM!8T)en032S(r#I<|+ndf*Nh0uvTl_DdKSd{vORQB<yC!Un%b49XI868y)$8T8$
z1vy{r!7EwK#!^@aDOX>4Da-^8JGI3cq{h(FDqS_GcC)#}eqJ>c)@l+f1<+Uti-&O1
z;DDjF8ftwx++HIhQ9@Qg2dh<%fWH=DYLe4N3v4Vw&{K`}goVz;0Iy4qU{HnDb_=@u
zoM`Rj?+TQmsZ~XHkcP+Fhufg_TTtwEqu3v$*twzbvjqdtg}ex~xG}RdfOu~dGKyS&
zC2cG%tJp+bR$WOy&OfBN2?Z*ZR46A;Yea@phh(`5n=4dEDpw+j(J84^iA~}RVwqBE
zsb$qj;bSSJzEqiLmAo%QT6qO8AW5*K0tI|uA>UI#kt{4G(w51kXd(ujLN1oW3u%9N
zp-FU_$~@@hIWVirF+1818wF0QsesX<g}>1cFNw5<!sl|;p{pZ=_phJEcYpi{XWrh0
zrr8+9>lM8955L35|KqPX_cev><}Dn%bssl=_cy%p@DF(VoA2@7Z-0xo|L`MDe({i$
z_W`!N_9lAv9YlP34lNT??7)MFHa8<d;S;;sqm2ppV$HC{T473bLEkR29FL%?YY>$b
zrmB`^$ocvT3YR7jgUQS045Nm(RsImEFNorhUrHWYLGqRbTqt2jC<!@fi`|gZE|-%W
zE8|g=(H0joGKym0*?etjEXdaeDR5B~@pl#ZOfYwn+$nxJb!MbE9e9aW|78ZNB((yY
zWktxVD5BtK*pW=|Fi^C|!)S?yP{|G}7j##)dPcHHSlg0P>A<<Lv5zS4r#}<bx1N?n
z?Zvh0C9xAtWJ4C#R%~4nLMZOPz{f-rVwj$hL&`6r1tQT`C{ge5GQu}X6lbipqFN*r
zpmUHis!;rNq`x|=gOT3{2LpuF9fH{vhOMC$zGhx~ie0380v!{}nBIK~!xX)($F5=b
zrCZo>0T>+T<KZ^0-1`x4Kll;d^ZTUC$mT$+^q$&43mg<af5(_ad+l@k(KxvmBYQ95
z*n3}5EZ>rZuWjKd;uHH29odDh`NI^;5xASj5a^ha1kcel0(;{qtg#X30(~$=hDmR8
z=$bo(nSGbgIlUjlTTh~)ZJuog4ee8G6L5*5R^cv^VF$F{c7*%3Vr1z!5(8V&GB8Jx
zGtdQ<NgKFD!SmFd5^T;cApt6}UYxltsv^~i(?)J=$}2+>pU>hrtDsm4k;|?m5mKC`
zvXokzw8EUh_vFjesGv}a5~rKevL(?IN7%%5d*)fO_wwog5IbRM#X^-JJyF_ISl23o
zhscmD`h-~B7xQ^yBeAfz;@F@#iBa#c!7KK_GB}%QS#{JLmCsDkGeYGykot5`JGIc*
zwb0qLFxqtRg~jwm4ZH#NMfTlz+=4(rgILteG2DQzwg$8{xfqyTm>6onV7Cj2m>!9^
z0X^LgjEn@)-Rr>U1cPI<8cod#(qX0$y-pMu709;Np)}HfauQyJM>ID~q>A3LIHtF>
zqp>?8t<IK^<V2b$tJi}Hjf(cQ60TSb^(32YrCO>1o1-@)sZvdP&|(b*Yds^#%Q6+#
zllESs&~21yD29B#oT5l!+{li;v4R6nPBD~MBb{G<CZEeHt(GoKK9RjpifU@)P$0`x
zT3Reoa*>~3twO1~3e}o2sH+R$tk+^_u!93e3w<5GOrrAm)-go-7|oruZEg$ZrhD-9
zH`npyckg5Cp)oYiHeu=FUfloNAMp9#{(?8Z{TlCm`)hpi!=G^P`@iDWH~)a^-~Ar9
z|LHII=+A$}weNn1i=REju1i-jxML4GX6Df`Hi6du0VFy)(A3tBP@;vz(E?jbH}q}&
zQ1p(&w0r=T!{<@8vLEuLB@|6hK)JLHRZ}yNwf8|5ZiSlGteUrFp*YI;y_O35>vM7N
zcu9axC_PpZqL_YgL(QP1<YVd}ZzB{se?7_rR>%YF_`W)6nxeeU1x39BwIL(?ZS|<G
z*P+m;qn$Niot8mI%kRCS02w7&P^iinH5KsNY7lhPVXU`PTKtiV6<gXAan`Z4g29NL
zJ3F^P8WcZyT$FSCJ88~NI#u!F%UH_+BQ_?Bs`^6Y9#z{HO^}12Ly>8BE$vfsavI4%
z?7T0Rjzv`}G?L(nxjL2J#3)Z2YqCOXFiSSqY^jIM<%OM;Cl)*EJz-cV>?*5Y61tw5
zo%}yX(LKErbNkLBKC~5c2QQ&_@i@+X^lQ9w^>duR{XKs3Z~s>ky}D4BlnUP6NU>`f
zKz+C!(cUSznulNt^}tEn87JAz9eW!)FMf<|r$0dV(rNaYqqMra(7ojZVnaI->{{S^
z#wmEC6uc4C(b}5A1Je7$gWGAX&th=t7#c^mqkVE0Jc%*bNOY~k+n}|F**7Bywhxo^
zT4C_FK<jCNDcFs^`2$i(%l_#VlvEj|v$aKpfsH9yczIKXR4MMo%{kbRS&VSU5K@JR
zRp})c^}^<C$>NHI45itFLQ&sEQHOkG9STT!Srki2^f(rUT}~E@O0>2j4V5@qBk~=J
z(-vY=un;@3SS7?yx+Uoq<`+m=mc>a6adJW=oDiZXHZ@mN5MokNrQ>UM23TJx02hO?
zo&DD6wLl-VQuIi99zE1-8jlb?4YX`VmyW?-2b;GBZoirEU`Kn02hnH^A|WjXdcD{-
z--D5X7-lCDSl-fsm94E9=(3`t*@*V$8f=+QU}7SI$;lXYtnhp6x1ps)gTcXCJmv@y
z%{BD&K0ArRiz1Sn+${!@pBK)JiTPHHPl-u+8%ju2#fmCa>h<tMBK%o_3M~bxN&#<k
zD~$dI{sRFNn(MKFMEF8^6;kx{?cpY5Tike_LY8W%$J6;0*if#;#&QkTQw%qd3O5Ro
z<ZZHCgEYC8*4Rk$(;!7=K!(DEOd@<bFJ`*1%OuG{bq#S{hb+=%fmTKGEQeB6jNYyo
zf_?|ICKIakj20mW`X_qfAdRYB4r$f)^vPZL{<ojv#+@tZo^3<tmQI}h;5dH$x8LCQ
z58vSW*Wcjo_dnpl4?p6o-~1!){pv^D{53`IPk+Y8|M*v2|KX3gM$tR^`dgUYdjJFT
zThTo}fwlpPUS}s7X>%hTJ#csQp@xE|9hgS#%3;)>egpcmZ=&kNB~%_gfvO`%U_5&s
zrOUgJ-aCrSwgJfcCLr%0N9FJssz-)V*+vqi;At3y^ltuNw4M3@EsBeF%2&^yt^D5k
zI3H7aNk)DXU#Fo}U?8xPc5CZlFq;vM*T550K%o;y1eM67;3Zd+tSd=a6?w=lNP$LK
z1W%nBVW%0bAs;$PkXl|_vAI;N#EDKVCaFb47pa7vdG-a89<2|>P0-t;_DJkjZSGUj
zoZo*Hy=SF4qBYV4d0LvNw=9xZh-4AsG>(>*wS!j3g=(FNl*fTV_aJJq3bAwff^d;s
z{p?eAUzCK`08e9=ls5hFrT4LT_!1($vxxQ2V_@q+xZB3y@1Di(S8w3d^{;U0)8Avy
zl~3^SFaI0we(`(h?5sW74R@>?_D~xgK{uKQXZamALQml-uZ_UfK1(~hAKTAfm+bDG
zu)6an(KL0Gw6_l}lLtv<OK9j`kVMZC9fX~t)-uU}IClWcr{BTg_G9pNOrk#CM{yiP
zmfQqYeMm|*RiLavE`z#AhT&-;83tP^dd-lTf^a8BCD9w-#z3jDA}y;J>o%rg{pJkm
z<jRKBT%=RrQVZlL*VbW!cs{)V$+Wj?k}~kz`V<PA9HrGJX%4ScTaRLusP|h-U=i73
zM5?SRY^LBn_fis`e3q7$*Y#-<o>-uI>KSnjS66CR_g?;;M0%p0v6?PRBp%Jp6Lan2
zNTQgIDv?AlpMoedo`_@x!B7Ld4Sv{sPSgjSF!@REE<Kc_JEdDo($jKms-ba;?Q%l&
zw6Kxx>OBV3*{b1isc3C$5RVzqnyA6VNC2aQehl||u(Z&GgL}HMb1{ytRt<XE^_ZV-
zz}AHXW~Li)?8q$k?HR$2<u1Ikw+{#Qb>lIk!$KM`qSR|e9y@z6BfP@npm^CCIU-ox
z(TCxY2Ix)n=c2N`NG#gYNU_t9xKv1|XP1kDI`s1X_6~Lu3(_e>n@NQ0lq%#oJjkqf
zBGcwUK5dA;eGr?}7HlZhVpEwAJ~b&%2%d(dr$tJo9w~AIQWa(-D{A@LdZgC4u|aD^
z645=)XhVwHg!~!@(p2m$q=4euT2#}{_Vza8^6A5f`9084Y;{Hp+FRooogC!<<CV_U
zHnvCc;mz0Z-ETj~(etmMWvm_j+xqeF$J_Y+pTESdho9m4z5Dq1!2{g=;!8aE;v0PW
z`Pca1!LRVaw?E+K_kYBP-~JBo-1`#e-g*zqM~-7`aS6RslV~3rM00O1VqHB5cJ#v2
zJA#_75m@GSA$IY7c;EO4vZJq|=-{g;edQ#SN6tZe`U+BJSMWmrG}7mHLUHI6l!s5C
zWY<o}mlsh!+>b(9Od080#ikKt=e0u>u#x1f9GHxnP6{BY*5WoJ7PG<|)WYObLs46a
z8gmV5YcyzPXR%oFA+IV#nX-~LrxaO=5{dBgOEOSWkPNF{hLBCqX2ftuGlEV#WYU3g
zk;X^V!B`bO5zU1yeeU_S6tY!;6J;L7;)SrgkD6G-kTR)I>;yH4ChU?Ry%%0sD{Xoe
z89&7Ce_>t4O07a;K%%_|+M3l2+B#FcboxR|t7~=A_C}i!inq}EBuD~L+NDO)(j3Wd
z2Tr~9F;2aGN2-6(KCuG}`_H3kWD)MxF)SQ@4fBUz$KvVtFn#DX9C+hn{PX_-2DTkQ
zplui~il!}?K+pIVs4b*xg99ZxCyI5nf+E9lY)E>4+srFiJoz3Lj$g;{?l%w`+>Q2`
zBWN7n2`{P6-!a4c%NSTZipjm_vE|5Xe0~pu?l5g@E0m<V-0E87D$S^>^`M*(kWo&j
zlUIVIf^sP_u07rjgSQpdNIz=49f<cXVCj`}QkwW8TG5Tk+0xOnyy6Nevr2X`;Ucdb
zPp?bGGwW0FEQL;NPEN|m!%Lgeu|6eRs*jtJTZX&}0t10UY-P#g=Vlej_*_2LtlNwy
zD0ojiC4_F((u(|hKmY$iC+716?fw1Fe@>Er3Dwmi1CkQ?1qJ_ECxs#>npm;TOwWWy
ztwt~yM659`k)AZ4#{g{()I&v@Q+X(O6g!o(noY^RtU;~ci29%jW(r_K+=e#V+Kx^S
z5(zPfr$juWL~myet*wqCr^5K42@BKpn4hwsx1$Op{YLEE-iX<$5H4R>!NN=%mbdlc
z!Ws4t3gDqVgLtgo?<P*WQNoU%M>0?aX`yHVYlC(S(FRV=H1h-NAg|1k5)G6oiV$t5
z?Naj}mXi>Px7ozo%u-qt(v`on9U8wEQ%4V@ZFxJgZ4{|mD^lz1DDZ}nVs_z0nI0QU
zb=Xuwn_EV~<AvPJi;&C<l1jl#sWf3732(FBiBww{IYj<V_7GBC5#)K}NHe*SuD4?o
z=`YXbLw1cB1`^MetLO3VwM+E%QHr#KgCv0Ap+58v_wox4z!ZsNa$y`_eSHUazx)_;
zJ84U&`e<>t;gA3Qm-z0-yZHD%3GkD<xc%up++n+a?*Tsl;w#*I@JrnI<u~}~>+f;v
ztKZ;*JNNP0yVtS*)EUg~Si!*bG&)Cz(K67FSWho}?cH#7(;{_`qJCjF8ZW&E_vH^z
za^O62_MAlewtXmn<us~~zlM}8uV6DHdGWCeC^>Zjo0fMXXV)$iY@J78e-A}3OpEKG
z6>#(aaH866LlrH8oFXT#m5%i12)HmaGl196@51>rizE+PUb_<Y76aUMTEwC{X!Kbq
zr_dEw$&pc3h`h>TWag(LCp!s>(hP*e2@KlbC`ELnyA5WY4#kBf5Ebf+rSrBj6j1Q8
zvU8->SV48d){4aov4|m7ZiUzhkrS29M0?`Nrzm!U@K!goihW*>QXGlYMnd#N>T$6x
zrAn=Xg+VfsXot>hgUV1RH7$j&&f$gC<)_#+vNfTjZxo}m+mYxU$Bx72vF+eF>^Sin
zu6_C~h87P>MS%94zJ}R7r=;p<;htG(Zf|VYY3zFS11z3;8&~f98n=J_7usP4CVv7k
z+CzQ4Pii__kVIrfp{gE*8YhY<ewJuI?eZeprYU@9ZeroYbquXsK;O2L=v_F1CW_tQ
zwj-F|e-X<k-;}n(Y&~)Xk%0wiWjK|6TDX@Y(pH#OT-ezX2J&=T%iJoC1&W?I(1dvZ
zEJb^T<io(zK7*~VT!6Q6K&tHKpvWhs=3o=;?dH^MTGD(;(2_C>@yxmu{Cv%NJWByf
z5%~x67@Sfvu;%5Bct)IgUYoR<u0OjN$=OBNn4XKZ8!3`&H%LXCo>{Y&geUgli7APv
zdF>dy#1S(#A$&i1{BgXn_C-nH)LJ^MqE*3LE&W-<{y`@|+RMqwMLEgOUT<T+iz3Ya
z>kkWi>p`8@Mj@|*hN7qU=@<wMBwGgTa1ESs3rX7sf1?e}-2nza_60r`Y|tSbQlTZG
zLr1HQ_EpIyM|WEV`Z~%n+$X2->9J#5J@)So;lz<HTsl98*DmhB*kA~=V+mY3y^Jeo
zwqt&@fuiSg!IPkU?QDXR6l7}&LE|>T5wc)%t{cOnO>j7>QBj@_C;!lJlY@Vh3&rY6
zUMM*-%Sw=5T!0Lb`@Xu8Rw01sU>A<O{W|u&^%{~58f02Yk+deJ?qQ@-^qwhVWD}%U
zq9*OBu!*9VEK&%GykixsqPJ05ht0KqWJfxo7+pYV?*wE$<0wt^A&=H0(?FrNM^PGR
zmP$D`kn%qI;zR5^z8@AZNzdVj*&D*v9ZPVBgVM^Wu_1~Bhxg&T@4m#9cdw#<ehQt_
zL%90!HGKQ0&+)6@-o?#3w{YjP2e|k7L)_!-ebU|+4}U3<--lm3z%3Hr?fYNh_MI>A
z&U-g;{LHJ^y0TNM9@aB9N{ic%M$%rSyAPhuKA1cDVH%l%ZTBJAPP_`;$qTf%uOfBZ
zP897u3ONUB`j&kt+<z9vH*TQ%(wj(M-i^F1i^v-sLSbVQWF%At|EFrVlabK@1w~HT
z;6NGaywGJwxu{vv;KuUy3EX`12rgeDbqqD2!cD8{x1clXq$MPo>)A<p-7;uN(#t4b
zrMzyrX~@n>f}y$?5mya-41E1<F*HSjsA5nP*0ZRTw-h{a;6Fb<ADcIm3Z8fpk5lM=
z@{^y^?yPP+{?F9!V)N?iN}t%?APPZ0`|Pt)smB*ze362=9?2=GlHf^c-({6h8)#*9
zv@oq*@HE7s(3((d_fqsMaQdTg1vq$`+R-~aiIJ%V^o`D7&!N+Zv<+bUfioCk5E)%M
zh=Uj2gNOFf8fu5Xc>o?#lb&>?u5Xa$?t}m?9K3)@3e*1AKgK7&`4=o5xrk8P5RCR9
zG-iIsHFmlf6J&H3rD`k6bo|cig0!C$-f*|HL0PQe&L4jl2VTF6?Wf+w{J~4udHO91
z?^TSioIu;;3Wl~HWxt$5ZMX+zCO^{34Je?HR@M5@HMxT{-Yg{=6&s7wN)(WroM;`{
zg7fd*#pKRY@U~8(b!;d5(j0X5800z|((`C(le3VLo`bb7C*kF!RIH=b-IP@*wWron
z;9lA&)dv=}N?v?<J&ElFi0Wt0u6Y^H@%iUzV_)L?)~wqkiQN-oN`eIb%yX+JEyOmI
zM`R}iPuz-)=TAQU6ms%J`A?0ssFhzR63ohxSCEg~{5<Kt=pUlhTdX9r5JfLS+Vdk2
z_QNei&r{EFS%+HMS4+qYTeJ?Yrh52W9SC)}c^zB`HQN!Q_=Vzih&9!srPYGIUOQ$c
z{TLgzqA6O6&K4O4y5tnR3QP>iF*~ln;=BPnmu%R(62y`HUAT66JKns!8z&A-;f?dl
zIJ~n5vx5#iR;FZUR#l=d;DE1{irg8cRp>&`P!vPMyx$XrQjrgZJOf>wUM%k(g_%g2
zQBi=bDjCtd2x&z*C=%IF^a@lMsxY-{0jJ)74Z)FaWYAg_a8P>2CZSGr;6=HPf@Z=7
zM!Zde^2)0zdMcz)_|jxrqzloL*Wks98az{7hnL+^lnl+Fl&xZL7U}FLo6P}a)iuD-
zK8Di9KIHqu*mi0ku6=S1{adC<ICeG<9JE2xJGR5<bwfuwFp~7Pu581#cV5Hdp#w;a
zjbLJB0pI-NxA@JUzro$FKB4H{#@+j0;@<tQap&Gc+_`@rcfR->w?8BOeRdC@+<$;i
zKKl}%+<Aa^-nofmr_NyuNl%Dg_vi@P`g_sT-G#>P9@?f(*xR~LL-Es%Ou)2#1+|Bc
zLQhLvbMh=qr(T8O&{<^Dn&mGag!0s7RFd@acI-yc)GYE_+ffpr;Q4$cIuf{(wt$o;
zL{COro6p-!vEm+YBu%vA=!q44`1Vm8JUWZ2Mhi-Odi1vVFx3@?(Z%4SFGGs9niO4u
z)bes{&dEZ0PCD&K7HqXD_-rISPc7!A24NEWE=lkCw6(dkw}oXD{JD%&n1z?tzQBR-
z^Z#766?^lfGq%5wwl+N4-XPk46+KbzQB=cEN=lM8HD^-rq_poMHI%Uy2D*Xv-eH(*
z3|hQzbvM8hN=Qis=C|)5*=@z#(k?VHkO+GlZRwFVB!}Aukr<ePuVoNT1G5ZxVd*4>
z$rVPq+6;|I7;CUeTRUo8QAB#CFn8p2%pJXo*FXIZ_kQ>*ntCUt*O}-XDm7;4YMm&r
zHc6YE#e0evgv+#csI3%F3SVMs4`P#hFw5U_<C{O@G=*y6;CZx;FQIE@7or36aJ7s;
z?Mgsi7e;ME7pmB2<rGe}HH<1V@7DxLd2}@SN@=dm-#UmZH}2!e<qzR+pJLxUjShZ>
zw|N}#o*C&ZZ$8J$MiQP#w=Yt0tzEwfn=^7(tt`cF&H7|KPcht>oQ7n6$D$_2DqU0b
zD1MJ}9g4*YkyTluJBr;Sx)aw-%-ubjzZ0S-9V`6#-y=0W4f3i=iS$J3Cn=9zIr{_!
zuT<Du(q39x+Uh)>KY+%l*sB^wAmBqF<ma{Yz!`GGpCA}@htSaB;Wcx>o2Vz@S^0Q9
zf(aX<6u=1Yw{?3kIuXP0h#y@YW{eEg)3!P>+;70-h>2pSCB<p5eZff4tCZT7X)Ttw
z*syzh7&qSDi90ur;N$lX;jN2Xv2!knIojdJ%4k_jRWei?Rg82sFnEn<8H!*Rz3)(e
z0P&cIqF0Ri+A>T}#W1_2m9Nhvs^=h`LYG>Whvb5E<jL|-T~h_UO@saC_G9n)gD}NC
zsPu7Qn)GPex(#wh>zBj|E~#!kMQ($vnoWtMifW3UiUU!NbXs2_dYjq7lB_|b#5$1H
z){hKEbj{=vvi+?jz#x)sAt?K&k;#r;Gd+iMq_yK$kHg*30+GJT;0a=Q>k>lU-4r`^
zvTzKBmKF>yO=Ea*3N6z!h>wk7{*@&>{KG?h`G;@t-rZZc_4%h1wfngH#doy2-{RK8
zhxp`|_i^vRJ$&)yeNrC<k@R=t);(N&;~nfibPQ9AOXw4Gdcy-~>Fq{yX9pVjS<#kO
z1e#mnXla9~y#t2+0ciP|%83b7&&)zKvj9CUS_#FxbbbZpJNKht*B%rslJffcq>~qN
zA8Ch{R^h0l=+)8UT2M**S;n7p-7ciYT2VPPimn}VIC<p&E?(S;{+R@FBXy9I$j7@v
z7;SQ+lD2h&NsSk3^mtxbjU<^I8)-vQb2A}Vl)+&)!e!IbK7`TLl|X5Q$d6cpToPV3
zMNi~O%PlBCYRV=&`}7m|1qZ_;(i2%#M0+&%BW+&&55W_~yoKNi%6kDDH*S!M(TF-1
zc?DuKaycp#Dk!x^ctdeX^lT(_XOIJ?trt7?A4A9B1m>3az~+y@6>5amY=^$q0Z|LL
z+DN<PXh7T091N}qj0}?20IA*{fH&F-gT)O^jSXcAvD4oUvm=NaPaM++FJoZWIV{qK
zKKSFmVSX2bk(GkP*IJykVu2VM+xwxdbwgpaONl`9<p#+0b~s|)h)*mdF~1j+6uQ^$
zevR!XFJpS&2|9;mScE&N4?|wt0DF8;nin*B+MubALCMGD20sNffQGg?$n*hZ%gnF_
zJ8|RwZ*la}hluvhqite04qd*D{)NM|eZvqbr7}uXQZj+{$r)10C}~R&X-#Z1*+9Gd
z<eGJo;JqNWoV>h_fhA2!%e02UMl?a-q5zZFe<!NZi6$uU(YBOTvHLl%oj7eF&Q1S}
z0qt?=zNnEaQoak(%P%aD5(kKEDY@cUBJHhc84QRbbyPzzh)^JiSU7@cC?t_ykoGnZ
z<Nw?oMq^JCy2e|1&7x@T53x0%smF&{hYL+zUUUtGF)$v-;Aj++GffyDZ$N*a6>~E_
z>{tq6eAtYs5ep^;Yp^(L!GT>KY?-QNQ%VB3LYsT~Xd5n{8^xy|y@FfstzgSIU*Ds^
cXh#YDAG?u@-rPidr2qf`07*qoM6N<$g2t{?CjbBd

diff --git a/docs/source/tutorial_hello_world.rst b/docs/tutorial_hello_world.rst
similarity index 100%
rename from docs/source/tutorial_hello_world.rst
rename to docs/tutorial_hello_world.rst
-- 
GitLab


From 88d474323b1c5e9760ce22e504344de4600a42d4 Mon Sep 17 00:00:00 2001
From: Sam Wu <sam.wu2@amd.com>
Date: Mon, 27 Mar 2023 17:14:36 -0600
Subject: [PATCH 05/71] Separate bibtex requirement from rocm-docs-core (#656)

* separate bibtex requirement from rocm-docs-core

* point requirements to source rocm-docs-core repo
---
 docs/.sphinx/requirements.in  |  1 +
 docs/.sphinx/requirements.txt | 18 ++++++++++++++++++
 docs/conf.py                  |  5 +++--
 3 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/docs/.sphinx/requirements.in b/docs/.sphinx/requirements.in
index 2dfc7b076..36a9a4577 100644
--- a/docs/.sphinx/requirements.in
+++ b/docs/.sphinx/requirements.in
@@ -1 +1,2 @@
 git+https://github.com/RadeonOpenCompute/rocm-docs-core.git
+sphinxcontrib-bibtex==2.5.0
diff --git a/docs/.sphinx/requirements.txt b/docs/.sphinx/requirements.txt
index e2b793590..8618920ea 100644
--- a/docs/.sphinx/requirements.txt
+++ b/docs/.sphinx/requirements.txt
@@ -46,9 +46,11 @@ docutils==0.16
     # via
     #   breathe
     #   myst-parser
+    #   pybtex-docutils
     #   pydata-sphinx-theme
     #   rocm-docs-core
     #   sphinx
+    #   sphinxcontrib-bibtex
 executing==1.2.0
     # via stack-data
 fastjsonschema==2.16.3
@@ -94,6 +96,8 @@ jupyter-core==5.3.0
     #   ipykernel
     #   jupyter-client
     #   nbformat
+latexcodec==2.0.1
+    # via pybtex
 linkify-it-py==1.0.3
     # via myst-parser
 markdown-it-py==2.2.0
@@ -150,6 +154,12 @@ ptyprocess==0.7.0
     # via pexpect
 pure-eval==0.2.2
     # via stack-data
+pybtex==0.24.0
+    # via
+    #   pybtex-docutils
+    #   sphinxcontrib-bibtex
+pybtex-docutils==1.0.2
+    # via sphinxcontrib-bibtex
 pycparser==2.21
     # via cffi
 pydata-sphinx-theme==0.13.1
@@ -175,6 +185,7 @@ pyyaml==6.0
     #   jupyter-cache
     #   myst-nb
     #   myst-parser
+    #   pybtex
     #   sphinx-external-toc
 pyzmq==25.0.2
     # via
@@ -189,6 +200,8 @@ rocm-docs-core @ git+https://github.com/RadeonOpenCompute/rocm-docs-core.git
 six==1.16.0
     # via
     #   asttokens
+    #   latexcodec
+    #   pybtex
     #   python-dateutil
 smmap==5.0.0
     # via gitdb
@@ -208,6 +221,7 @@ sphinx==4.3.1
     #   sphinx-design
     #   sphinx-external-toc
     #   sphinx-notfound-page
+    #   sphinxcontrib-bibtex
 sphinx-book-theme==1.0.0rc2
     # via rocm-docs-core
 sphinx-copybutton==0.5.1
@@ -220,6 +234,10 @@ sphinx-notfound-page==0.8.3
     # via rocm-docs-core
 sphinxcontrib-applehelp==1.0.4
     # via sphinx
+sphinxcontrib-bibtex==2.5.0
+    # via
+    #   -r requirements.in
+    #   rocm-docs-core
 sphinxcontrib-devhelp==1.0.2
     # via sphinx
 sphinxcontrib-htmlhelp==2.0.1
diff --git a/docs/conf.py b/docs/conf.py
index 9b43b7155..3ec81ee9d 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -18,7 +18,8 @@ mathjax3_config = {
     }
 }
 
-bibtex_bibfiles = ['refs.bib']
-
 for sphinx_var in ROCmDocs.SPHINX_VARS:
     globals()[sphinx_var] = getattr(docs_core, sphinx_var)
+
+extensions += ['sphinxcontrib.bibtex']
+bibtex_bibfiles = ['refs.bib']
-- 
GitLab


From 4e097ad283d5a2d977501c61d0f8c3081dfa35f6 Mon Sep 17 00:00:00 2001
From: Haocong WANG <haocwang@amd.com>
Date: Thu, 30 Mar 2023 03:07:33 +0800
Subject: [PATCH 06/71] Add CMake Option "USE_OPT_NAVI3X" (#647)

* Add CMake Option "USE_OPT_NAVI3X"

* remove navi3x opt compile option from cmake script
---
 CMakeLists.txt         | 7 +++++++
 script/cmake-ck-dev.sh | 4 ++--
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f861e3020..c9fb6b455 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -22,6 +22,7 @@ include(TargetFlags)
 list(APPEND CMAKE_PREFIX_PATH ${CMAKE_INSTALL_PREFIX} ${CMAKE_INSTALL_PREFIX}/llvm ${CMAKE_INSTALL_PREFIX}/hip /opt/rocm /opt/rocm/llvm /opt/rocm/hip)
 
 option(USE_BITINT_EXTENSION_INT4, "Whether to enable clang's BitInt extension to provide int4 data type." OFF)
+option(USE_OPT_NAVI3X, "Whether to enable LDS cumode and Wavefront32 mode for NAVI3X silicons." OFF)
 
 if(USE_BITINT_EXTENSION_INT4)
     add_compile_definitions(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4)
@@ -29,6 +30,12 @@ if(USE_BITINT_EXTENSION_INT4)
     message("CK compiled with USE_BITINT_EXTENSION_INT4 set to ${USE_BITINT_EXTENSION_INT4}")
 endif()
 
+if(USE_OPT_NAVI3X)
+    add_compile_options(-mcumode)
+    add_compile_options(-mno-wavefrontsize64)
+    message("CK compiled with USE_OPT_NAVI3X set to ${USE_OPT_NAVI3X}")
+endif()
+
 ## Threads
 set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
diff --git a/script/cmake-ck-dev.sh b/script/cmake-ck-dev.sh
index 375ff4931..8f462237f 100755
--- a/script/cmake-ck-dev.sh
+++ b/script/cmake-ck-dev.sh
@@ -8,8 +8,8 @@ MY_PROJECT_SOURCE=$1
 cmake                                                                                             \
 -D CMAKE_PREFIX_PATH=/opt/rocm                                                                    \
 -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                                                         \
--D CMAKE_CXX_FLAGS="-std=c++17 -O3 -ftemplate-backtrace-limit=0 -fPIE -mcumode                    \
--mno-wavefrontsize64 -Wno-gnu-line-marker -save-temps=$PWD"                                       \
+-D CMAKE_CXX_FLAGS="-std=c++17 -O3 -ftemplate-backtrace-limit=0  -fPIE  -Wno-gnu-line-marker      \
+-save-temps=$PWD"                                                                                 \
 -D CMAKE_BUILD_TYPE=Release                                                                       \
 -D BUILD_DEV=ON                                                                                   \
 -D GPU_TARGETS="gfx908;gfx90a"                                                                    \
-- 
GitLab


From 389e84a83b1cc3a0ebfe20f7932ab1928f93bad1 Mon Sep 17 00:00:00 2001
From: rocking5566 <ChunYu.Lai@amd.com>
Date: Thu, 30 Mar 2023 03:50:23 +0800
Subject: [PATCH 07/71] Conv + quantization + tanh  (#645)

* Rename file. Prepare to support another activation

* Add comment for quantization

* Extract out_elementop

* Add tanh example

* Add conv + bias + tanh quantization instance

* Add missing parameter

* Refine cmake

* Add external api and client example

* Extract variable in example

* Fix the comment

---------

Co-authored-by: zjing14 <zhangjing14@gmail.com>
---
 client_example/09_quantization/CMakeLists.txt |   6 +
 ...2d_fwd_bias_relu_perlayer_quantization.cpp |  99 +++++----
 ..._fwd_bias_tanh_perchannel_quantization.cpp | 209 ++++++++++++++++++
 ...2d_fwd_bias_tanh_perlayer_quantization.cpp | 201 +++++++++++++++++
 .../conv2d_fwd_perlayer_quantization.cpp      |  99 +++++----
 .../40_conv2d_fwd_quantization/CMakeLists.txt |   5 +
 ...bias_relu_perchannel_quantization_int8.cpp |   8 +-
 ...l_bias_relu_perlayer_quantization_int8.cpp |   9 +-
 ...bias_tanh_perchannel_quantization_int8.cpp |  87 ++++++++
 ...l_bias_tanh_perlayer_quantization_int8.cpp |  85 +++++++
 ...2d_fwd_dl_perchannel_quantization_int8.cpp |   6 +-
 ...nv2d_fwd_dl_perlayer_quantization_int8.cpp |   7 +-
 ...bias_relu_perchannel_quantization_int8.cpp |   8 +-
 ...l_bias_relu_perlayer_quantization_int8.cpp |   9 +-
 ...d_fwd_xdl_perchannel_quantization_int8.cpp |   6 +-
 ...v2d_fwd_xdl_perlayer_quantization_int8.cpp |   7 +-
 ..._bias_perchannel_quantization_example.inc} |   3 +-
 ...wd_bias_perlayer_quantization_example.inc} |   3 +-
 ...2d_fwd_perchannel_quantization_example.inc |   3 +-
 ...nv2d_fwd_perlayer_quantization_example.inc |   3 +-
 .../gpu/element/quantization_operation.hpp    | 131 ++++++++++-
 .../element/unary_element_wise_operation.hpp  |  13 ++
 include/ck/utility/math_v2.hpp                |  18 ++
 .../device_operation_instance_factory.hpp     |   9 +
 ...n_bias_forward_perchannel_quantization.hpp |  94 ++++++++
 ...ion_bias_forward_perlayer_quantization.hpp |  92 ++++++++
 .../conv2d_fwd/conv2d_quantization_common.hpp |   9 +
 ..._perchannel_quantization_int8_instance.cpp |  36 +++
 ...as_perlayer_quantization_int8_instance.cpp |  37 ++++
 ..._perchannel_quantization_int8_instance.cpp |  35 +++
 ...as_perlayer_quantization_int8_instance.cpp |  37 ++++
 31 files changed, 1252 insertions(+), 122 deletions(-)
 create mode 100644 client_example/09_quantization/conv2d_fwd_bias_tanh_perchannel_quantization.cpp
 create mode 100644 client_example/09_quantization/conv2d_fwd_bias_tanh_perlayer_quantization.cpp
 create mode 100644 example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp
 create mode 100644 example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp
 rename example/40_conv2d_fwd_quantization/{run_conv2d_fwd_bias_relu_perchannel_quantization_example.inc => run_conv2d_fwd_bias_perchannel_quantization_example.inc} (98%)
 rename example/40_conv2d_fwd_quantization/{run_conv2d_fwd_bias_relu_perlayer_quantization_example.inc => run_conv2d_fwd_bias_perlayer_quantization_example.inc} (98%)

diff --git a/client_example/09_quantization/CMakeLists.txt b/client_example/09_quantization/CMakeLists.txt
index a4dd80cd3..2b7d6fc80 100644
--- a/client_example/09_quantization/CMakeLists.txt
+++ b/client_example/09_quantization/CMakeLists.txt
@@ -1,6 +1,12 @@
+add_executable(client_conv2d_fwd_bias_tanh_perchannel_quantization conv2d_fwd_bias_tanh_perchannel_quantization.cpp)
+target_link_libraries(client_conv2d_fwd_bias_tanh_perchannel_quantization PRIVATE composable_kernel::device_operations)
+
 add_executable(client_conv2d_fwd_bias_relu_perchannel_quantization conv2d_fwd_bias_relu_perchannel_quantization.cpp)
 target_link_libraries(client_conv2d_fwd_bias_relu_perchannel_quantization PRIVATE composable_kernel::device_operations)
 
+add_executable(client_conv2d_fwd_bias_tanh_perlayer_quantization conv2d_fwd_bias_tanh_perlayer_quantization.cpp)
+target_link_libraries(client_conv2d_fwd_bias_tanh_perlayer_quantization PRIVATE composable_kernel::device_operations)
+
 add_executable(client_conv2d_fwd_bias_relu_perlayer_quantization conv2d_fwd_bias_relu_perlayer_quantization.cpp)
 target_link_libraries(client_conv2d_fwd_bias_relu_perlayer_quantization PRIVATE composable_kernel::device_operations)
 
diff --git a/client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp b/client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp
index 7cbbd2832..b8e6a493e 100644
--- a/client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp
@@ -26,15 +26,16 @@ using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clam
 
 static constexpr ck::index_t NumDimSpatial = 2;
 static constexpr ck::index_t G             = 1;
-static constexpr ck::index_t N             = 4;   // batch size
-static constexpr ck::index_t K             = 64;  // output channel
-static constexpr ck::index_t C             = 192; // input channel
-static constexpr ck::index_t Y             = 3;   // filter H
-static constexpr ck::index_t X             = 3;   // filter W
-static constexpr ck::index_t Hi            = 71;  // input H
-static constexpr ck::index_t Wi            = 71;  // input W
-static constexpr ck::index_t Ho            = 36;  // output H
-static constexpr ck::index_t Wo            = 36;  // output W
+static constexpr ck::index_t N             = 4;    // batch size
+static constexpr ck::index_t K             = 64;   // output channel
+static constexpr ck::index_t C             = 192;  // input channel
+static constexpr ck::index_t Y             = 3;    // filter H
+static constexpr ck::index_t X             = 3;    // filter W
+static constexpr ck::index_t Hi            = 71;   // input H
+static constexpr ck::index_t Wi            = 71;   // input W
+static constexpr ck::index_t Ho            = 36;   // output H
+static constexpr ck::index_t Wo            = 36;   // output W
+static constexpr float requant_scale       = 0.5f; // requantize qAcc to qz
 
 struct SimpleDeviceMem
 {
@@ -102,26 +103,27 @@ int main(int argc, char* argv[])
 
     for(int i = 0; i < op_ptrs.size(); ++i)
     {
-        auto& op_ptr      = op_ptrs[i];
-        auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
-                                                        wei.GetDeviceBuffer(),
-                                                        {bias.GetDeviceBuffer()},
-                                                        out.GetDeviceBuffer(),
-                                                        in_lengths,
-                                                        in_strides,
-                                                        weight_lengths,
-                                                        weight_strides,
-                                                        {bias_lengths},
-                                                        {bias_strides},
-                                                        out_lengths,
-                                                        out_strides,
-                                                        conv_strides,
-                                                        conv_dilations,
-                                                        in_left_pad,
-                                                        in_right_pad,
-                                                        PassThrough{},
-                                                        PassThrough{},
-                                                        OutElementOp{0.5f, ActivationOp{}});
+        auto& op_ptr = op_ptrs[i];
+        auto argument_ptr =
+            op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
+                                        wei.GetDeviceBuffer(),
+                                        {bias.GetDeviceBuffer()},
+                                        out.GetDeviceBuffer(),
+                                        in_lengths,
+                                        in_strides,
+                                        weight_lengths,
+                                        weight_strides,
+                                        {bias_lengths},
+                                        {bias_strides},
+                                        out_lengths,
+                                        out_strides,
+                                        conv_strides,
+                                        conv_dilations,
+                                        in_left_pad,
+                                        in_right_pad,
+                                        PassThrough{},
+                                        PassThrough{},
+                                        OutElementOp{requant_scale, ActivationOp{}});
 
         auto invoker_ptr    = op_ptr->MakeInvokerPointer();
         std::string op_name = op_ptr->GetTypeString();
@@ -165,25 +167,26 @@ int main(int argc, char* argv[])
         auto& op_ptr = op_ptrs[best_op_id];
         std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
                   << std::endl;
-        auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
-                                                        wei.GetDeviceBuffer(),
-                                                        {bias.GetDeviceBuffer()},
-                                                        out.GetDeviceBuffer(),
-                                                        in_lengths,
-                                                        in_strides,
-                                                        weight_lengths,
-                                                        weight_strides,
-                                                        {bias_lengths},
-                                                        {bias_strides},
-                                                        out_lengths,
-                                                        out_strides,
-                                                        conv_strides,
-                                                        conv_dilations,
-                                                        in_left_pad,
-                                                        in_right_pad,
-                                                        PassThrough{},
-                                                        PassThrough{},
-                                                        OutElementOp{0.5f, ActivationOp{}});
+        auto argument_ptr =
+            op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
+                                        wei.GetDeviceBuffer(),
+                                        {bias.GetDeviceBuffer()},
+                                        out.GetDeviceBuffer(),
+                                        in_lengths,
+                                        in_strides,
+                                        weight_lengths,
+                                        weight_strides,
+                                        {bias_lengths},
+                                        {bias_strides},
+                                        out_lengths,
+                                        out_strides,
+                                        conv_strides,
+                                        conv_dilations,
+                                        in_left_pad,
+                                        in_right_pad,
+                                        PassThrough{},
+                                        PassThrough{},
+                                        OutElementOp{requant_scale, ActivationOp{}});
 
         auto invoker_ptr = op_ptr->MakeInvokerPointer();
 
diff --git a/client_example/09_quantization/conv2d_fwd_bias_tanh_perchannel_quantization.cpp b/client_example/09_quantization/conv2d_fwd_bias_tanh_perchannel_quantization.cpp
new file mode 100644
index 000000000..7a216f027
--- /dev/null
+++ b/client_example/09_quantization/conv2d_fwd_bias_tanh_perchannel_quantization.cpp
@@ -0,0 +1,209 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iomanip>
+#include <iostream>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+using InDataType           = int8_t;
+using WeiDataType          = int8_t;
+using BiasDataType         = int32_t;
+using RequantScaleDataType = float;
+using OutDataType          = int8_t;
+
+using InLayout           = ck::tensor_layout::convolution::GNHWC;
+using WeiLayout          = ck::tensor_layout::convolution::GKYXC;
+using BiasLayout         = ck::tensor_layout::convolution::G_K;
+using RequantScaleLayout = ck::tensor_layout::convolution::G_K;
+using OutLayout          = ck::tensor_layout::convolution::GNHWK;
+using PassThrough        = ck::tensor_operation::element_wise::PassThrough;
+using ActivationOp       = ck::tensor_operation::element_wise::TanH;
+using OutElementOp =
+    ck::tensor_operation::element_wise::Add_Mul2_Activation_Mul_Clamp<ActivationOp>;
+
+static constexpr ck::index_t NumDimSpatial = 2;
+static constexpr ck::index_t G             = 1;
+static constexpr ck::index_t N             = 4;    // batch size
+static constexpr ck::index_t K             = 64;   // output channel
+static constexpr ck::index_t C             = 192;  // input channel
+static constexpr ck::index_t Y             = 3;    // filter H
+static constexpr ck::index_t X             = 3;    // filter W
+static constexpr ck::index_t Hi            = 71;   // input H
+static constexpr ck::index_t Wi            = 71;   // input W
+static constexpr ck::index_t Ho            = 36;   // output H
+static constexpr ck::index_t Wo            = 36;   // output W
+static constexpr float sz_inv              = 0.5f; // inverse of scale_z
+
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+
+    void* GetDeviceBuffer() { return p_mem_; }
+
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* p_mem_;
+};
+
+int main(int argc, char* argv[])
+{
+    std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
+    std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C};
+    std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
+    std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
+    std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo};
+    std::array<ck::index_t, 5> bias_strides{K, 0, 1, 0, 0};
+    std::array<ck::index_t, 5> requant_scale_lengths{G, N, K, Ho, Wo};
+    std::array<ck::index_t, 5> requant_scale_strides{K, 0, 1, 0, 0};
+    std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
+    std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K};
+    std::array<ck::index_t, 2> in_left_pad{1, 1};
+    std::array<ck::index_t, 2> in_right_pad{1, 1};
+    std::array<ck::index_t, 2> conv_strides{2, 2};
+    std::array<ck::index_t, 2> conv_dilations{1, 1};
+
+    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
+    SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
+    SimpleDeviceMem bias(sizeof(BiasDataType) * K * Y * X * C);
+    SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * K * Y * X * C);
+    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
+
+    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
+        NumDimSpatial,
+        InLayout,
+        WeiLayout,
+        ck::Tuple<BiasLayout, RequantScaleLayout>,
+        OutLayout,
+        InDataType,
+        WeiDataType,
+        ck::Tuple<BiasDataType, RequantScaleDataType>,
+        OutDataType,
+        PassThrough,
+        PassThrough,
+        OutElementOp>;
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    std::string best_op_name;
+    int best_op_id        = -1;
+    float best_avg_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+    float best_tflops     = 0;
+
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+
+    for(int i = 0; i < op_ptrs.size(); ++i)
+    {
+        auto& op_ptr = op_ptrs[i];
+        auto argument_ptr =
+            op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
+                                        wei.GetDeviceBuffer(),
+                                        {bias.GetDeviceBuffer(), requant_scale.GetDeviceBuffer()},
+                                        out.GetDeviceBuffer(),
+                                        in_lengths,
+                                        in_strides,
+                                        weight_lengths,
+                                        weight_strides,
+                                        {bias_lengths, requant_scale_lengths},
+                                        {bias_strides, requant_scale_strides},
+                                        out_lengths,
+                                        out_strides,
+                                        conv_strides,
+                                        conv_dilations,
+                                        in_left_pad,
+                                        in_right_pad,
+                                        PassThrough{},
+                                        PassThrough{},
+                                        OutElementOp{sz_inv, ActivationOp{}});
+
+        auto invoker_ptr    = op_ptr->MakeInvokerPointer();
+        std::string op_name = op_ptr->GetTypeString();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+            std::size_t flop = G * 2 * N * K * C * Ho * Wo * Y * X;
+            std::size_t num_bytes =
+                G * sizeof(InDataType) * N * Hi * Wi * C + G * sizeof(WeiDataType) * K * Y * X * C +
+                G * sizeof(BiasDataType) * K + G * sizeof(RequantScaleDataType) * K +
+                G * sizeof(OutDataType) * N * Ho * Wo * K;
+
+            float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+            float gb_per_sec = num_bytes / 1.E6 / avg_time;
+
+            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
+                      << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+            if(tflops > best_tflops)
+            {
+                best_op_id      = i;
+                best_op_name    = op_name;
+                best_avg_time   = avg_time;
+                best_gb_per_sec = gb_per_sec;
+                best_tflops     = tflops;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+
+    // run the best instance
+    if(best_op_id != -1)
+    {
+        std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
+                  << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+        auto& op_ptr = op_ptrs[best_op_id];
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+        auto argument_ptr =
+            op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
+                                        wei.GetDeviceBuffer(),
+                                        {bias.GetDeviceBuffer(), requant_scale.GetDeviceBuffer()},
+                                        out.GetDeviceBuffer(),
+                                        in_lengths,
+                                        in_strides,
+                                        weight_lengths,
+                                        weight_strides,
+                                        {bias_lengths, requant_scale_lengths},
+                                        {bias_strides, requant_scale_strides},
+                                        out_lengths,
+                                        out_strides,
+                                        conv_strides,
+                                        conv_dilations,
+                                        in_left_pad,
+                                        in_right_pad,
+                                        PassThrough{},
+                                        PassThrough{},
+                                        OutElementOp{sz_inv, ActivationOp{}});
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+
+        std::cout << "Done" << std::endl;
+    }
+
+    return 0;
+}
\ No newline at end of file
diff --git a/client_example/09_quantization/conv2d_fwd_bias_tanh_perlayer_quantization.cpp b/client_example/09_quantization/conv2d_fwd_bias_tanh_perlayer_quantization.cpp
new file mode 100644
index 000000000..7637f5c78
--- /dev/null
+++ b/client_example/09_quantization/conv2d_fwd_bias_tanh_perlayer_quantization.cpp
@@ -0,0 +1,201 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iomanip>
+#include <iostream>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+using InDataType   = int8_t;
+using WeiDataType  = int8_t;
+using BiasDataType = int32_t;
+using OutDataType  = int8_t;
+
+using InLayout     = ck::tensor_layout::convolution::GNHWC;
+using WeiLayout    = ck::tensor_layout::convolution::GKYXC;
+using BiasLayout   = ck::tensor_layout::convolution::G_K;
+using OutLayout    = ck::tensor_layout::convolution::GNHWK;
+using PassThrough  = ck::tensor_operation::element_wise::PassThrough;
+using ActivationOp = ck::tensor_operation::element_wise::TanH;
+using OutElementOp = ck::tensor_operation::element_wise::Add_Mul_Activation_Mul_Clamp<ActivationOp>;
+
+static constexpr ck::index_t NumDimSpatial = 2;
+static constexpr ck::index_t G             = 1;
+static constexpr ck::index_t N             = 4;    // batch size
+static constexpr ck::index_t K             = 64;   // output channel
+static constexpr ck::index_t C             = 192;  // input channel
+static constexpr ck::index_t Y             = 3;    // filter H
+static constexpr ck::index_t X             = 3;    // filter W
+static constexpr ck::index_t Hi            = 71;   // input H
+static constexpr ck::index_t Wi            = 71;   // input W
+static constexpr ck::index_t Ho            = 36;   // output H
+static constexpr ck::index_t Wo            = 36;   // output W
+static constexpr float sacc                = 0.5f; //  scale of acc
+static constexpr float sz_inv              = 0.5f; // inverse of scale_z
+
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+
+    void* GetDeviceBuffer() { return p_mem_; }
+
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* p_mem_;
+};
+
+int main(int argc, char* argv[])
+{
+    std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
+    std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C};
+    std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
+    std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
+    std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo};
+    std::array<ck::index_t, 5> bias_strides{K, 0, 1, 0, 0};
+    std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
+    std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K};
+    std::array<ck::index_t, 2> in_left_pad{1, 1};
+    std::array<ck::index_t, 2> in_right_pad{1, 1};
+    std::array<ck::index_t, 2> conv_strides{2, 2};
+    std::array<ck::index_t, 2> conv_dilations{1, 1};
+
+    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
+    SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
+    SimpleDeviceMem bias(sizeof(BiasDataType) * K * Y * X * C);
+    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
+
+    using DeviceOp =
+        ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
+                                                                    InLayout,
+                                                                    WeiLayout,
+                                                                    ck::Tuple<BiasLayout>,
+                                                                    OutLayout,
+                                                                    InDataType,
+                                                                    WeiDataType,
+                                                                    ck::Tuple<BiasDataType>,
+                                                                    OutDataType,
+                                                                    PassThrough,
+                                                                    PassThrough,
+                                                                    OutElementOp>;
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    std::string best_op_name;
+    int best_op_id        = -1;
+    float best_avg_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+    float best_tflops     = 0;
+
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+
+    for(int i = 0; i < op_ptrs.size(); ++i)
+    {
+        auto& op_ptr      = op_ptrs[i];
+        auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
+                                                        wei.GetDeviceBuffer(),
+                                                        {bias.GetDeviceBuffer()},
+                                                        out.GetDeviceBuffer(),
+                                                        in_lengths,
+                                                        in_strides,
+                                                        weight_lengths,
+                                                        weight_strides,
+                                                        {bias_lengths},
+                                                        {bias_strides},
+                                                        out_lengths,
+                                                        out_strides,
+                                                        conv_strides,
+                                                        conv_dilations,
+                                                        in_left_pad,
+                                                        in_right_pad,
+                                                        PassThrough{},
+                                                        PassThrough{},
+                                                        OutElementOp{sacc, sz_inv, ActivationOp{}});
+
+        auto invoker_ptr    = op_ptr->MakeInvokerPointer();
+        std::string op_name = op_ptr->GetTypeString();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+            std::size_t flop = G * 2 * N * K * C * Ho * Wo * Y * X;
+            std::size_t num_bytes =
+                G * sizeof(InDataType) * N * Hi * Wi * C + G * sizeof(WeiDataType) * K * Y * X * C +
+                G * sizeof(BiasDataType) * K + G * sizeof(OutDataType) * N * Ho * Wo * K;
+
+            float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+            float gb_per_sec = num_bytes / 1.E6 / avg_time;
+
+            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
+                      << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+            if(tflops > best_tflops)
+            {
+                best_op_id      = i;
+                best_op_name    = op_name;
+                best_avg_time   = avg_time;
+                best_gb_per_sec = gb_per_sec;
+                best_tflops     = tflops;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+
+    // run the best instance
+    if(best_op_id != -1)
+    {
+        std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
+                  << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+        auto& op_ptr = op_ptrs[best_op_id];
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+        auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
+                                                        wei.GetDeviceBuffer(),
+                                                        {bias.GetDeviceBuffer()},
+                                                        out.GetDeviceBuffer(),
+                                                        in_lengths,
+                                                        in_strides,
+                                                        weight_lengths,
+                                                        weight_strides,
+                                                        {bias_lengths},
+                                                        {bias_strides},
+                                                        out_lengths,
+                                                        out_strides,
+                                                        conv_strides,
+                                                        conv_dilations,
+                                                        in_left_pad,
+                                                        in_right_pad,
+                                                        PassThrough{},
+                                                        PassThrough{},
+                                                        OutElementOp{sacc, sz_inv, ActivationOp{}});
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+
+        std::cout << "Done" << std::endl;
+    }
+
+    return 0;
+}
\ No newline at end of file
diff --git a/client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp b/client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp
index daeff4ff4..f7c46a95f 100644
--- a/client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp
@@ -24,15 +24,16 @@ using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<Ac
 
 static constexpr ck::index_t NumDimSpatial = 2;
 static constexpr ck::index_t G             = 1;
-static constexpr ck::index_t N             = 4;   // batch size
-static constexpr ck::index_t K             = 64;  // output channel
-static constexpr ck::index_t C             = 192; // input channel
-static constexpr ck::index_t Y             = 3;   // filter H
-static constexpr ck::index_t X             = 3;   // filter W
-static constexpr ck::index_t Hi            = 71;  // input H
-static constexpr ck::index_t Wi            = 71;  // input W
-static constexpr ck::index_t Ho            = 36;  // output H
-static constexpr ck::index_t Wo            = 36;  // output W
+static constexpr ck::index_t N             = 4;    // batch size
+static constexpr ck::index_t K             = 64;   // output channel
+static constexpr ck::index_t C             = 192;  // input channel
+static constexpr ck::index_t Y             = 3;    // filter H
+static constexpr ck::index_t X             = 3;    // filter W
+static constexpr ck::index_t Hi            = 71;   // input H
+static constexpr ck::index_t Wi            = 71;   // input W
+static constexpr ck::index_t Ho            = 36;   // output H
+static constexpr ck::index_t Wo            = 36;   // output W
+static constexpr float requant_scale       = 0.5f; // requantize qAcc to qY
 
 struct SimpleDeviceMem
 {
@@ -96,26 +97,27 @@ int main(int argc, char* argv[])
 
     for(int i = 0; i < op_ptrs.size(); ++i)
     {
-        auto& op_ptr      = op_ptrs[i];
-        auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
-                                                        wei.GetDeviceBuffer(),
-                                                        {},
-                                                        out.GetDeviceBuffer(),
-                                                        in_lengths,
-                                                        in_strides,
-                                                        weight_lengths,
-                                                        weight_strides,
-                                                        {},
-                                                        {},
-                                                        out_lengths,
-                                                        out_strides,
-                                                        conv_strides,
-                                                        conv_dilations,
-                                                        in_left_pad,
-                                                        in_right_pad,
-                                                        PassThrough{},
-                                                        PassThrough{},
-                                                        OutElementOp{0.5f, ActivationOp{}});
+        auto& op_ptr = op_ptrs[i];
+        auto argument_ptr =
+            op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
+                                        wei.GetDeviceBuffer(),
+                                        {},
+                                        out.GetDeviceBuffer(),
+                                        in_lengths,
+                                        in_strides,
+                                        weight_lengths,
+                                        weight_strides,
+                                        {},
+                                        {},
+                                        out_lengths,
+                                        out_strides,
+                                        conv_strides,
+                                        conv_dilations,
+                                        in_left_pad,
+                                        in_right_pad,
+                                        PassThrough{},
+                                        PassThrough{},
+                                        OutElementOp{requant_scale, ActivationOp{}});
 
         auto invoker_ptr    = op_ptr->MakeInvokerPointer();
         std::string op_name = op_ptr->GetTypeString();
@@ -158,25 +160,26 @@ int main(int argc, char* argv[])
         auto& op_ptr = op_ptrs[best_op_id];
         std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
                   << std::endl;
-        auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
-                                                        wei.GetDeviceBuffer(),
-                                                        {},
-                                                        out.GetDeviceBuffer(),
-                                                        in_lengths,
-                                                        in_strides,
-                                                        weight_lengths,
-                                                        weight_strides,
-                                                        {},
-                                                        {},
-                                                        out_lengths,
-                                                        out_strides,
-                                                        conv_strides,
-                                                        conv_dilations,
-                                                        in_left_pad,
-                                                        in_right_pad,
-                                                        PassThrough{},
-                                                        PassThrough{},
-                                                        OutElementOp{0.5f, ActivationOp{}});
+        auto argument_ptr =
+            op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
+                                        wei.GetDeviceBuffer(),
+                                        {},
+                                        out.GetDeviceBuffer(),
+                                        in_lengths,
+                                        in_strides,
+                                        weight_lengths,
+                                        weight_strides,
+                                        {},
+                                        {},
+                                        out_lengths,
+                                        out_strides,
+                                        conv_strides,
+                                        conv_dilations,
+                                        in_left_pad,
+                                        in_right_pad,
+                                        PassThrough{},
+                                        PassThrough{},
+                                        OutElementOp{requant_scale, ActivationOp{}});
 
         auto invoker_ptr = op_ptr->MakeInvokerPointer();
 
diff --git a/example/40_conv2d_fwd_quantization/CMakeLists.txt b/example/40_conv2d_fwd_quantization/CMakeLists.txt
index c3540d6ee..0a314cd74 100644
--- a/example/40_conv2d_fwd_quantization/CMakeLists.txt
+++ b/example/40_conv2d_fwd_quantization/CMakeLists.txt
@@ -14,3 +14,8 @@ add_example_executable(example_conv2d_fwd_xdl_bias_relu_perlayer_quantization_in
 add_example_executable(example_conv2d_fwd_dl_bias_relu_perchannel_quantization_int8 conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp)
 add_example_executable(example_conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8 conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp)
 
+# Conv + bias + tanh perlayer quantization
+add_example_executable(example_conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8 conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp)
+
+# Conv + bias + tanh perchannel quantization
+add_example_executable(example_conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8 conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp)
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp
index df10e8039..5c445d9c5 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp
@@ -76,6 +76,10 @@ using DeviceGroupedConvNDFwdInstance =
         5,                   // CThreadTransferSrcDstVectorDim
         4>;                  // CThreadTransferDstScalarPerVector
 
-#include "run_conv2d_fwd_bias_relu_perchannel_quantization_example.inc"
+#include "run_conv2d_fwd_bias_perchannel_quantization_example.inc"
 
-int main() { run_conv2d_fwd_bias_relu_perchannel_quantization_example(); };
+int main()
+{
+    const auto out_element_op = OutElementOp{ActivationOp{}};
+    run_conv2d_fwd_bias_perchannel_quantization_example(out_element_op);
+};
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp
index 18f9197b9..0ff85f008 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp
@@ -74,6 +74,11 @@ using DeviceGroupedConvNDFwdInstance =
         5,                   // CThreadTransferSrcDstVectorDim
         4>;                  // CThreadTransferDstScalarPerVector
 
-#include "run_conv2d_fwd_bias_relu_perlayer_quantization_example.inc"
+#include "run_conv2d_fwd_bias_perlayer_quantization_example.inc"
 
-int main() { run_conv2d_fwd_bias_relu_perlayer_quantization_example(); }
+int main()
+{
+    float requant_scale       = 0.5f;
+    const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}};
+    run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op);
+}
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp
new file mode 100644
index 000000000..f8f996d17
--- /dev/null
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp
@@ -0,0 +1,87 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
+
+using InDataType           = int8_t;
+using WeiDataType          = int8_t;
+using BiasDataType         = int32_t;
+using RequantScaleDataType = float;
+using AccDataType          = int32_t;
+using OutDataType          = int8_t;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough  = ck::tensor_operation::element_wise::PassThrough;
+using InElementOp  = PassThrough;
+using WeiElementOp = PassThrough;
+using ActivationOp = ck::tensor_operation::element_wise::TanH;
+using OutElementOp =
+    ck::tensor_operation::element_wise::Add_Mul2_Activation_Mul_Clamp<ActivationOp>;
+
+static constexpr auto ConvSpec =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+template <ck::index_t NDimSpatial,
+          typename InLayout,
+          typename WeiLayout,
+          typename BiasLayout,
+          typename RequantScaleLayout,
+          typename OutLayout>
+using DeviceGroupedConvNDFwdInstance =
+    ck::tensor_operation::device::DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK<
+        NDimSpatial,
+        InDataType,
+        WeiDataType,
+        ck::Tuple<BiasDataType, RequantScaleDataType>,
+        OutDataType,
+        AccDataType,
+        InLayout,
+        WeiLayout,
+        ck::Tuple<BiasLayout, RequantScaleLayout>,
+        OutLayout,
+        InElementOp,
+        WeiElementOp,
+        OutElementOp,
+        ConvSpec,            // ConvForwardSpecialization
+        GemmSpec,            // GemmSpecialization
+        256,                 // BlockSize
+        128,                 // MPerBlock
+        128,                 // NPerBlock
+        16,                  // K0PerBlock
+        4,                   // K1
+        4,                   // M1PerThread
+        4,                   // N1PerThread
+        1,                   // KPerThread
+        S<8, 2>,             // M1N1ThreadClusterM1Xs
+        S<8, 2>,             // M1N1ThreadClusterN1Xs
+        S<8, 1, 1, 4>,       // ABlockTransferThreadSliceLengths_K0_M0_M1_K1
+        S<2, 1, 128, 1>,     // ABlockTransferThreadClusterLengths_K0_M0_M1_K1
+        S<1, 2, 0, 3>,       // ABlockTransferThreadClusterArrangeOrder
+        S<1, 2, 0, 3>,       // ABlockTransferSrcAccessOrder
+        S<4, 1, 1, 4>,       // ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1
+        S<1, 2, 0, 3>,       // ABlockTransferSrcVectorTensorContiguousDimOrder
+        S<1, 1, 1, 4>,       // ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1
+        S<8, 1, 1, 4>,       // BBlockTransferThreadSliceLengths_K0_N0_N1_K1
+        S<2, 1, 128, 1>,     // BBlockTransferThreadClusterLengths_K0_N0_N1_K1
+        S<1, 2, 0, 3>,       // BBlockTransferThreadClusterArrangeOrder
+        S<1, 2, 0, 3>,       // BBlockTransferSrcAccessOrder
+        S<4, 1, 1, 4>,       // BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1
+        S<1, 2, 0, 3>,       // BBlockTransferSrcVectorTensorContiguousDimOrder
+        S<1, 1, 1, 4>,       // BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1
+        S<0, 1, 2, 3, 4, 5>, // CThreadTransferSrcDstAccessOrder
+        5,                   // CThreadTransferSrcDstVectorDim
+        4>;                  // CThreadTransferDstScalarPerVector
+
+#include "run_conv2d_fwd_bias_perchannel_quantization_example.inc"
+
+int main()
+{
+    float scale_z_inv         = 0.5f;
+    const auto out_element_op = OutElementOp{scale_z_inv, ActivationOp{}};
+    run_conv2d_fwd_bias_perchannel_quantization_example(out_element_op);
+};
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp
new file mode 100644
index 000000000..3b25fec0c
--- /dev/null
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
+
+using InDataType   = int8_t;
+using WeiDataType  = int8_t;
+using BiasDataType = int32_t;
+using AccDataType  = int32_t;
+using OutDataType  = int8_t;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough  = ck::tensor_operation::element_wise::PassThrough;
+using InElementOp  = PassThrough;
+using WeiElementOp = PassThrough;
+using ActivationOp = ck::tensor_operation::element_wise::TanH;
+using OutElementOp = ck::tensor_operation::element_wise::Add_Mul_Activation_Mul_Clamp<ActivationOp>;
+
+static constexpr auto ConvSpec =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+template <ck::index_t NDimSpatial,
+          typename InLayout,
+          typename WeiLayout,
+          typename BiasLayout,
+          typename OutLayout>
+using DeviceGroupedConvNDFwdInstance =
+    ck::tensor_operation::device::DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK<
+        NDimSpatial,
+        InDataType,
+        WeiDataType,
+        ck::Tuple<BiasDataType>,
+        OutDataType,
+        AccDataType,
+        InLayout,
+        WeiLayout,
+        ck::Tuple<BiasLayout>,
+        OutLayout,
+        InElementOp,
+        WeiElementOp,
+        OutElementOp,
+        ConvSpec,            // ConvForwardSpecialization
+        GemmSpec,            // GemmSpecialization
+        256,                 // BlockSize
+        128,                 // MPerBlock
+        128,                 // NPerBlock
+        16,                  // K0PerBlock
+        4,                   // K1
+        4,                   // M1PerThread
+        4,                   // N1PerThread
+        1,                   // KPerThread
+        S<8, 2>,             // M1N1ThreadClusterM1Xs
+        S<8, 2>,             // M1N1ThreadClusterN1Xs
+        S<8, 1, 1, 4>,       // ABlockTransferThreadSliceLengths_K0_M0_M1_K1
+        S<2, 1, 128, 1>,     // ABlockTransferThreadClusterLengths_K0_M0_M1_K1
+        S<1, 2, 0, 3>,       // ABlockTransferThreadClusterArrangeOrder
+        S<1, 2, 0, 3>,       // ABlockTransferSrcAccessOrder
+        S<4, 1, 1, 4>,       // ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1
+        S<1, 2, 0, 3>,       // ABlockTransferSrcVectorTensorContiguousDimOrder
+        S<1, 1, 1, 4>,       // ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1
+        S<8, 1, 1, 4>,       // BBlockTransferThreadSliceLengths_K0_N0_N1_K1
+        S<2, 1, 128, 1>,     // BBlockTransferThreadClusterLengths_K0_N0_N1_K1
+        S<1, 2, 0, 3>,       // BBlockTransferThreadClusterArrangeOrder
+        S<1, 2, 0, 3>,       // BBlockTransferSrcAccessOrder
+        S<4, 1, 1, 4>,       // BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1
+        S<1, 2, 0, 3>,       // BBlockTransferSrcVectorTensorContiguousDimOrder
+        S<1, 1, 1, 4>,       // BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1
+        S<0, 1, 2, 3, 4, 5>, // CThreadTransferSrcDstAccessOrder
+        5,                   // CThreadTransferSrcDstVectorDim
+        4>;                  // CThreadTransferDstScalarPerVector
+
+#include "run_conv2d_fwd_bias_perlayer_quantization_example.inc"
+
+int main()
+{
+    float scale_acc           = 0.5f;
+    float scale_z_inv         = 0.5f;
+    const auto out_element_op = OutElementOp{scale_z_inv, scale_acc, ActivationOp{}};
+    run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op);
+}
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp
index afff7f8b6..a98a1e240 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp
@@ -76,4 +76,8 @@ using DeviceGroupedConvNDFwdInstance =
 
 #include "run_conv2d_fwd_perchannel_quantization_example.inc"
 
-int main() { run_conv2d_fwd_perchannel_quantization_example(); }
+int main()
+{
+    const auto out_element_op = OutElementOp{ActivationOp{}};
+    run_conv2d_fwd_perchannel_quantization_example(out_element_op);
+}
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp
index a38fe2a6c..262594d58 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp
@@ -71,4 +71,9 @@ using DeviceGroupedConvNDFwdInstance =
 
 #include "run_conv2d_fwd_perlayer_quantization_example.inc"
 
-int main() { run_conv2d_fwd_perlayer_quantization_example(); }
+int main()
+{
+    float requant_scale       = 0.5f;
+    const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}};
+    run_conv2d_fwd_perlayer_quantization_example(out_element_op);
+}
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp
index ba6990d93..6b2205505 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp
@@ -80,6 +80,10 @@ using DeviceGroupedConvNDFwdInstance =
         S<1, 64, 1, 4>,
         8>;
 
-#include "run_conv2d_fwd_bias_relu_perchannel_quantization_example.inc"
+#include "run_conv2d_fwd_bias_perchannel_quantization_example.inc"
 
-int main() { run_conv2d_fwd_bias_relu_perchannel_quantization_example(); };
+int main()
+{
+    const auto out_element_op = OutElementOp{ActivationOp{}};
+    run_conv2d_fwd_bias_perchannel_quantization_example(out_element_op);
+};
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp
index 690d70e11..1ac867974 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp
@@ -78,6 +78,11 @@ using DeviceGroupedConvNDFwdInstance =
         S<1, 64, 1, 4>,
         8>;
 
-#include "run_conv2d_fwd_bias_relu_perlayer_quantization_example.inc"
+#include "run_conv2d_fwd_bias_perlayer_quantization_example.inc"
 
-int main() { run_conv2d_fwd_bias_relu_perlayer_quantization_example(); }
+int main()
+{
+    float requant_scale       = 0.5f;
+    const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}};
+    run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op);
+}
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp
index dd755ff06..f28abe5eb 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp
@@ -80,4 +80,8 @@ using DeviceGroupedConvNDFwdInstance =
 
 #include "run_conv2d_fwd_perchannel_quantization_example.inc"
 
-int main() { run_conv2d_fwd_perchannel_quantization_example(); }
+int main()
+{
+    const auto out_element_op = OutElementOp{ActivationOp{}};
+    run_conv2d_fwd_perchannel_quantization_example(out_element_op);
+}
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp
index 48617e477..f468e8adc 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp
@@ -75,4 +75,9 @@ using DeviceGroupedConvNDFwdInstance =
 
 #include "run_conv2d_fwd_perlayer_quantization_example.inc"
 
-int main() { run_conv2d_fwd_perlayer_quantization_example(); }
+int main()
+{
+    float requant_scale       = 0.5f;
+    const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}};
+    run_conv2d_fwd_perlayer_quantization_example(out_element_op);
+}
diff --git a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_relu_perchannel_quantization_example.inc b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perchannel_quantization_example.inc
similarity index 98%
rename from example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_relu_perchannel_quantization_example.inc
rename to example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perchannel_quantization_example.inc
index 822a1ed8b..1587c614d 100644
--- a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_relu_perchannel_quantization_example.inc
+++ b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perchannel_quantization_example.inc
@@ -167,7 +167,7 @@ bool run_grouped_conv_fwd(bool do_verification,
     return (pass ? 0 : 1);
 }
 
-int run_conv2d_fwd_bias_relu_perchannel_quantization_example()
+int run_conv2d_fwd_bias_perchannel_quantization_example(const OutElementOp& out_element_op)
 {
     bool do_verification           = true;
     bool time_kernel               = true;
@@ -189,7 +189,6 @@ int run_conv2d_fwd_bias_relu_perchannel_quantization_example()
 
     const auto in_element_op  = InElementOp{};
     const auto wei_element_op = WeiElementOp{};
-    const auto out_element_op = OutElementOp{ActivationOp{}};
 
     using InLayout           = ck::tensor_layout::convolution::GNHWC;
     using WeiLayout          = ck::tensor_layout::convolution::GKYXC;
diff --git a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_relu_perlayer_quantization_example.inc b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perlayer_quantization_example.inc
similarity index 98%
rename from example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_relu_perlayer_quantization_example.inc
rename to example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perlayer_quantization_example.inc
index 00cbaa09e..455e0804d 100644
--- a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_relu_perlayer_quantization_example.inc
+++ b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perlayer_quantization_example.inc
@@ -155,7 +155,7 @@ bool run_grouped_conv_fwd(bool do_verification,
     return (pass ? 0 : 1);
 }
 
-int run_conv2d_fwd_bias_relu_perlayer_quantization_example()
+int run_conv2d_fwd_bias_perlayer_quantization_example(const OutElementOp& out_element_op)
 {
     bool do_verification           = true;
     bool time_kernel               = true;
@@ -177,7 +177,6 @@ int run_conv2d_fwd_bias_relu_perlayer_quantization_example()
 
     const auto in_element_op  = InElementOp{};
     const auto wei_element_op = WeiElementOp{};
-    const auto out_element_op = OutElementOp{0.5f, ActivationOp{}};
 
     using InLayout   = ck::tensor_layout::convolution::GNHWC;
     using WeiLayout  = ck::tensor_layout::convolution::GKYXC;
diff --git a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc
index 2e0623028..8e75c2774 100644
--- a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc
+++ b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc
@@ -157,7 +157,7 @@ bool run_grouped_conv_fwd(bool do_verification,
     return (pass ? 0 : 1);
 }
 
-int run_conv2d_fwd_perchannel_quantization_example()
+int run_conv2d_fwd_perchannel_quantization_example(const OutElementOp& out_element_op)
 {
     bool do_verification           = true;
     bool time_kernel               = true;
@@ -179,7 +179,6 @@ int run_conv2d_fwd_perchannel_quantization_example()
 
     const auto in_element_op  = InElementOp{};
     const auto wei_element_op = WeiElementOp{};
-    const auto out_element_op = OutElementOp{ActivationOp{}};
 
     using InLayout           = ck::tensor_layout::convolution::GNHWC;
     using WeiLayout          = ck::tensor_layout::convolution::GKYXC;
diff --git a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc
index aeccb30cf..926c033c5 100644
--- a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc
+++ b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc
@@ -139,7 +139,7 @@ bool run_grouped_conv_fwd(bool do_verification,
     return (pass ? 0 : 1);
 }
 
-int run_conv2d_fwd_perlayer_quantization_example()
+int run_conv2d_fwd_perlayer_quantization_example(const OutElementOp& out_element_op)
 {
     bool do_verification           = true;
     bool time_kernel               = false;
@@ -161,7 +161,6 @@ int run_conv2d_fwd_perlayer_quantization_example()
 
     const auto in_element_op  = InElementOp{};
     const auto wei_element_op = WeiElementOp{};
-    const auto out_element_op = OutElementOp{0.5f, ActivationOp{}};
 
     using InLayout  = ck::tensor_layout::convolution::GNHWC;
     using WeiLayout = ck::tensor_layout::convolution::GKYXC;
diff --git a/include/ck/tensor_operation/gpu/element/quantization_operation.hpp b/include/ck/tensor_operation/gpu/element/quantization_operation.hpp
index 7ea09a222..fefa6c793 100644
--- a/include/ck/tensor_operation/gpu/element/quantization_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/quantization_operation.hpp
@@ -7,10 +7,30 @@ namespace ck {
 namespace tensor_operation {
 namespace element_wise {
 
+// Y = Sy * Qy
+// W = Sw * Qw
+// X = Sx * Qx
+// B = Sb * Qb = Sw * Sx * Qb
+// Where X, W, Y are float32, Qx, Qw, Qy are int8
+// Sx, Sw, Sy are the scales of x, w, y (float32), calculated from the quantization range
+// Qb is int32; the scale of B is chosen as Sw * Sx for convenience
+
+// Y = W @ X, where @ is convolution or matrix multiplication
+// Sy * Qy = Sw * Qw @ Sx * Qx
+// Qy = [(Sw*Sx)/Sy] * Qw @ Qx
+
 // For Activation function which is piecewise linear function, such as relu, leaky relu ...etc
+// Activation(Sy * Qy) = Sy * Activation(Qy)
 template <typename Activation>
 struct Activation_Mul_Clamp
 {
+    // Convolution + Activation (piecewise linear function)
+    // If an activation is piecewise linear function, then Activation(Sy * Qy) = Sy * Activation(Qy)
+    // Z = Activation(Y) = Activation(W @ X)
+    // Sz * Qz = Activation(Sy * Qy)
+    // Qz = Sy / Sz * Activation(Qy) = (Sw * Sx / Sz) * Activation(Qw @ Qx)
+
+    // requantScale_ = Sw * Sx / Sz
     Activation_Mul_Clamp(float requantScale, Activation activationOp)
         : requantScale_(requantScale), activationOp_(activationOp)
     {
@@ -45,8 +65,39 @@ struct Activation_Mul_Clamp
     Activation activationOp_;
 };
 
+// For activation functions that are not piecewise linear, such as TanH, Sigmoid, etc.
+// If an activation is not a piecewise linear function,
+// then Activation(Sy * Qy) != Sy * Activation(Qy)
+template <typename Activation>
+struct Mul_Activation_Mul_Clamp
+{
+    // Convolution + Activation (non piecewise linear function)
+    // Z = Activation(Y) = Activation(W @ X)
+    // Sz * Qz = Activation(Sy * Qy)
+    // Qz = S1 * Activation[Sacc * (Qw @ Qx)]
+    // Where S1 = 1 / Sz, Sacc = Sw * Sx
+    Mul_Activation_Mul_Clamp(float scale_z_inv, float scaleAcc, Activation activationOp)
+        : scale_z_inv_(scale_z_inv), scaleAcc_(scaleAcc), activationOp_(activationOp)
+    {
+    }
+
+    __host__ __device__ constexpr void operator()(int8_t& y, const int32_t& x) const
+    {
+        float y_fp32 = ck::type_convert<float>(x);
+        y_fp32       = scaleAcc_ * y_fp32;
+        activationOp_(y_fp32, y_fp32);
+        y_fp32 = math::clamp(scale_z_inv_ * y_fp32, -128.f, 127.f);
+        y      = ck::type_convert<int8_t>(y_fp32);
+    }
+
+    float scale_z_inv_;
+    float scaleAcc_;
+    Activation activationOp_;
+};
+
 // Conv Perchannel quantization + Activation function which is piecewise linear function, such as
 // relu, leaky relu ...etc
+// Activation(Sy * Qy) = Sy * Activation(Qy)
 template <typename Activation>
 struct Activation_Mul2_Clamp
 {
@@ -76,9 +127,20 @@ struct Activation_Mul2_Clamp
 };
 
 // For Activation function which is piecewise linear function, such as relu, leaky relu ...etc
+// Activation(Sy * Qy) = Sy * Activation(Qy)
 template <typename Activation>
 struct Add_Activation_Mul_Clamp
 {
+    // Convolution + bias
+    // Let Bias = B = Sw * Sx * Qb
+    // Where Qb is int32
+    // Y = W @ X + B
+    // Sy * Qy = Sw * Qw @ Sx * Qx + Sw * Sx * Qb
+    // Qy = [(Sw*Sx)/Sy] * (Qw @ Qx + Qb)
+
+    // For activation, Z = Activation(Y)
+    // Sz * Qz = Activation(Sy * Qy)
+    // Qz = Sy / Sz * Activation(Qy) = [(Sw*Sx)/Sz] * Activation(Qw @ Qx + Qb)
     Add_Activation_Mul_Clamp(float requantScale, Activation activationOp)
         : requantScale_(requantScale), activationOp_(activationOp)
     {
@@ -139,11 +201,18 @@ struct Add_Activation_Mul2_Clamp
 };
 
 // For Activation function which is non piecewise linear function, such as TanH, Sigmoid ...etc
+// If an activation is not a piecewise linear function,
+// then Activation(Sy * Qy) != Sy * Activation(Qy)
 template <typename Activation>
 struct Add_Mul_Activation_Mul_Clamp
 {
-    Add_Mul_Activation_Mul_Clamp(float requantScale1, float requantScale2, Activation activationOp)
-        : requantScale1_(requantScale1), requantScale2_(requantScale2), activationOp_(activationOp)
+    // Convolution + Activation (non piecewise linear function)
+    // Z = Activation(Y) = Activation(W @ X + B)
+    // Sz * Qz = Activation(Sy * Qy)
+    // Qz = S1 * Activation[Sacc * (Qw @ Qx + Qb)]
+    // Where S1 = 1 / Sz, Sacc = Sw * Sx
+    Add_Mul_Activation_Mul_Clamp(float scale_z_inv, float scaleAcc, Activation activationOp)
+        : scale_z_inv_(scale_z_inv), scaleAcc_(scaleAcc), activationOp_(activationOp)
     {
     }
 
@@ -151,14 +220,64 @@ struct Add_Mul_Activation_Mul_Clamp
     operator()(int8_t& y, const int32_t& x, const int32_t& bias) const
     {
         float y_fp32 = ck::type_convert<float>(x + bias);
-        y_fp32       = requantScale1_ * y_fp32;
+        y_fp32       = scaleAcc_ * y_fp32;
+        activationOp_(y_fp32, y_fp32);
+        y_fp32 = math::clamp(scale_z_inv_ * y_fp32, -128.f, 127.f);
+        y      = ck::type_convert<int8_t>(y_fp32);
+    }
+
+    __host__ __device__ constexpr void
+    operator()(int32_t& y, const int32_t& x, const int32_t& bias) const
+    {
+        // CAUTION - We might type_convert to int8 in threadwise copy
+        // e.g. GridwiseGemmDlMultipleD_km_kn_mn
+        float y_fp32 = ck::type_convert<float>(x + bias);
+        y_fp32       = scaleAcc_ * y_fp32;
         activationOp_(y_fp32, y_fp32);
-        y_fp32 = math::clamp(requantScale2_ * y_fp32, -128.f, 127.f);
+        y_fp32 = math::clamp(scale_z_inv_ * y_fp32, -128.f, 127.f);
+        y      = ck::type_convert<int32_t>(y_fp32);
+    }
+
+    float scale_z_inv_;
+    float scaleAcc_;
+    Activation activationOp_;
+};
+
+// Conv Perchannel quantization + Activation function which is not a piecewise linear function,
+// such as TanH, Sigmoid, etc.
+// If an activation is not a piecewise linear function,
+// then Activation(Sy * Qy) != Sy * Activation(Qy)
+template <typename Activation>
+struct Add_Mul2_Activation_Mul_Clamp
+{
+    Add_Mul2_Activation_Mul_Clamp(float scale_z_inv, Activation activationOp)
+        : scale_z_inv_(scale_z_inv), activationOp_(activationOp)
+    {
+    }
+
+    __host__ __device__ constexpr void
+    operator()(int8_t& y, const int32_t& x, const int32_t& bias, const float& scaleAcc) const
+    {
+        float y_fp32 = ck::type_convert<float>(x + bias);
+        y_fp32       = scaleAcc * y_fp32;
+        activationOp_(y_fp32, y_fp32);
+        y_fp32 = math::clamp(scale_z_inv_ * y_fp32, -128.f, 127.f);
         y      = ck::type_convert<int8_t>(y_fp32);
     }
 
-    float requantScale1_;
-    float requantScale2_;
+    __host__ __device__ constexpr void
+    operator()(int32_t& y, const int32_t& x, const int32_t& bias, const float& scaleAcc) const
+    {
+        // CAUTION - We might type_convert to int8 in threadwise copy
+        // e.g. GridwiseGemmDlMultipleD_km_kn_mn
+        float y_fp32 = ck::type_convert<float>(x + bias);
+        y_fp32       = scaleAcc * y_fp32;
+        activationOp_(y_fp32, y_fp32);
+        y_fp32 = math::clamp(scale_z_inv_ * y_fp32, -128.f, 127.f);
+        y      = ck::type_convert<int32_t>(y_fp32);
+    }
+
+    float scale_z_inv_;
     Activation activationOp_;
 };
 
diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
index 6b4df3b60..f1f3042ad 100644
--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -320,6 +320,19 @@ struct Sigmoid
     int32_t divider_ = 1;
 };
 
+struct TanH
+{
+    template <typename T>
+    __host__ __device__ void operator()(T& y, const T& x) const
+    {
+        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
+                          is_same<T, ck::half_t>::value,
+                      "Data type is not supported by this operation!");
+
+        y = ck::math::tanh(x);
+    };
+};
+
 } // namespace element_wise
 } // namespace tensor_operation
 } // namespace ck
diff --git a/include/ck/utility/math_v2.hpp b/include/ck/utility/math_v2.hpp
index 4febace0b..a3732b2fe 100644
--- a/include/ck/utility/math_v2.hpp
+++ b/include/ck/utility/math_v2.hpp
@@ -92,6 +92,15 @@ static inline __host__ float sqrt(float x) { return std::sqrt(x); };
 
 static inline __host__ double sqrt(double x) { return std::sqrt(x); };
 
+static inline __host__ half_t tanh(half_t x)
+{
+    return static_cast<half_t>(std::tanh(static_cast<float>(x)));
+};
+
+static inline __host__ float tanh(float x) { return std::tanh(x); };
+
+static inline __host__ double tanh(double x) { return std::tanh(x); };
+
 // math functions for the HIP kernel,  some are implemented by calling hip builtin functions
 
 static inline __device__ float abs(float x) { return ::abs(x); };
@@ -172,5 +181,14 @@ static inline __device__ float sqrt(float x) { return __builtin_amdgcn_sqrtf(x);
 
 static inline __device__ double sqrt(double x) { return __builtin_amdgcn_sqrt(x); };
 
+static inline __device__ half_t tanh(half_t x)
+{
+    return static_cast<half_t>(::tanhf(static_cast<float>(x)));
+};
+
+static inline __device__ float tanh(float x) { return ::tanhf(x); };
+
+static inline __device__ double tanh(double x) { return ::tanh(x); };
+
 } // namespace math
 } // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
index 104b21a3e..0bde4919a 100644
--- a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
+++ b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
@@ -85,6 +85,7 @@ using GK_GK_Tuple = ck::Tuple<GK, GK>;
 // pointwise functor
 using PassThrough    = ck::tensor_operation::element_wise::PassThrough;
 using Relu           = ck::tensor_operation::element_wise::Relu;
+using TanH           = ck::tensor_operation::element_wise::TanH;
 using Scale          = ck::tensor_operation::element_wise::Scale;
 using Bilinear       = ck::tensor_operation::element_wise::Bilinear;
 using AddAddFastGelu = ck::tensor_operation::element_wise::AddAddFastGelu;
@@ -102,6 +103,10 @@ template <typename Activation>
 using Add_Activation_Mul_Clamp =
     ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp<Activation>;
 
+template <typename Activation>
+using Add_Mul_Activation_Mul_Clamp =
+    ck::tensor_operation::element_wise::Add_Mul_Activation_Mul_Clamp<Activation>;
+
 template <typename Activation>
 using Activation_Mul2_Clamp = ck::tensor_operation::element_wise::Activation_Mul2_Clamp<Activation>;
 
@@ -109,6 +114,10 @@ template <typename Activation>
 using Add_Activation_Mul2_Clamp =
     ck::tensor_operation::element_wise::Add_Activation_Mul2_Clamp<Activation>;
 
+template <typename Activation>
+using Add_Mul2_Activation_Mul_Clamp =
+    ck::tensor_operation::element_wise::Add_Mul2_Activation_Mul_Clamp<Activation>;
+
 template <typename DeviceOp, typename Tag = void>
 struct DeviceOperationInstanceFactory;
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp
index 57c971e52..793dc8d04 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp
@@ -49,6 +49,22 @@ void add_device_conv2d_dl_bias_relu_perchannel_quantization_int8_instances(
                                                               Add_Activation_Mul2_Clamp<Relu>>>>&
         instances);
 
+void add_device_conv2d_dl_bias_tanh_perchannel_quantization_int8_instances(
+    std::vector<
+        std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
+                                                      GNHWC,
+                                                      GKYXC,
+                                                      GK_GK_Tuple,
+                                                      GNHWK,
+                                                      int8_t,
+                                                      int8_t,
+                                                      I32_F32_Tuple,
+                                                      int8_t,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      Add_Mul2_Activation_Mul_Clamp<TanH>>>>&
+        instances);
+
 void add_device_conv2d_xdl_bias_perchannel_quantization_int8_instances(
     std::vector<
         std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
@@ -80,6 +96,23 @@ void add_device_conv2d_xdl_bias_relu_perchannel_quantization_int8_instances(
                                                               Add_Activation_Mul2_Clamp<Relu>>>>&
         instances);
 
+void add_device_conv2d_xdl_bias_tanh_perchannel_quantization_int8_instances(
+    std::vector<
+        std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
+                                                      GNHWC,
+                                                      GKYXC,
+                                                      GK_GK_Tuple,
+                                                      GNHWK,
+                                                      int8_t,
+                                                      int8_t,
+                                                      I32_F32_Tuple,
+                                                      int8_t,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      Add_Mul2_Activation_Mul_Clamp<TanH>>>>&
+        instances);
+
+// piecewise linear activation function
 template <ck::index_t NumDimSpatial,
           typename InLayout,
           typename WeiLayout,
@@ -145,6 +178,67 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
     }
 };
 
+// non piecewise linear activation function
+template <ck::index_t NumDimSpatial,
+          typename InLayout,
+          typename WeiLayout,
+          typename DsLayout,
+          typename OutLayout,
+          typename InDataType,
+          typename WeiDataType,
+          typename DsDataType,
+          typename OutDataType,
+          typename Activation>
+struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
+    NumDimSpatial,
+    InLayout,
+    WeiLayout,
+    DsLayout,
+    OutLayout,
+    InDataType,
+    WeiDataType,
+    DsDataType,
+    OutDataType,
+    ck::tensor_operation::element_wise::PassThrough,
+    ck::tensor_operation::element_wise::PassThrough,
+    Add_Mul2_Activation_Mul_Clamp<Activation>>>
+{
+    using DeviceOp = DeviceGroupedConvFwdMultipleD<NumDimSpatial,
+                                                   InLayout,
+                                                   WeiLayout,
+                                                   DsLayout,
+                                                   OutLayout,
+                                                   InDataType,
+                                                   WeiDataType,
+                                                   DsDataType,
+                                                   OutDataType,
+                                                   ck::tensor_operation::element_wise::PassThrough,
+                                                   ck::tensor_operation::element_wise::PassThrough,
+                                                   Add_Mul2_Activation_Mul_Clamp<Activation>>;
+
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+
+        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, GNHWC> &&
+                     is_same_v<WeiLayout, GKYXC> && is_same_v<DsLayout, GK_GK_Tuple> &&
+                     is_same_v<OutLayout, GNHWK>)
+        {
+            if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
+                         is_same_v<DsDataType, I32_F32_Tuple> && is_same_v<OutDataType, int8_t>)
+            {
+                if constexpr(is_same_v<Activation, TanH>)
+                {
+                    add_device_conv2d_dl_bias_tanh_perchannel_quantization_int8_instances(op_ptrs);
+                    add_device_conv2d_xdl_bias_tanh_perchannel_quantization_int8_instances(op_ptrs);
+                }
+            }
+        }
+
+        return op_ptrs;
+    }
+};
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp
index 9f8ac9b7b..c570f7675 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp
@@ -49,6 +49,21 @@ void add_device_conv2d_dl_bias_relu_perlayer_quantization_int8_instances(
                                                               Add_Activation_Mul_Clamp<Relu>>>>&
         instances);
 
+void add_device_conv2d_dl_bias_tanh_perlayer_quantization_int8_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
+                                                              GNHWC,
+                                                              GKYXC,
+                                                              GK_Tuple,
+                                                              GNHWK,
+                                                              int8_t,
+                                                              int8_t,
+                                                              I32_Tuple,
+                                                              int8_t,
+                                                              PassThrough,
+                                                              PassThrough,
+                                                              Add_Mul_Activation_Mul_Clamp<TanH>>>>&
+        instances);
+
 void add_device_conv2d_xdl_bias_perlayer_quantization_int8_instances(
     std::vector<
         std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
@@ -80,6 +95,22 @@ void add_device_conv2d_xdl_bias_relu_perlayer_quantization_int8_instances(
                                                               Add_Activation_Mul_Clamp<Relu>>>>&
         instances);
 
+void add_device_conv2d_xdl_bias_tanh_perlayer_quantization_int8_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
+                                                              GNHWC,
+                                                              GKYXC,
+                                                              GK_Tuple,
+                                                              GNHWK,
+                                                              int8_t,
+                                                              int8_t,
+                                                              I32_Tuple,
+                                                              int8_t,
+                                                              PassThrough,
+                                                              PassThrough,
+                                                              Add_Mul_Activation_Mul_Clamp<TanH>>>>&
+        instances);
+
+// piecewise linear activation function
 template <ck::index_t NumDimSpatial,
           typename InLayout,
           typename WeiLayout,
@@ -145,6 +176,67 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
     }
 };
 
+// non piecewise linear activation function
+template <ck::index_t NumDimSpatial,
+          typename InLayout,
+          typename WeiLayout,
+          typename DsLayout,
+          typename OutLayout,
+          typename InDataType,
+          typename WeiDataType,
+          typename DsDataType,
+          typename OutDataType,
+          typename Activation>
+struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
+    NumDimSpatial,
+    InLayout,
+    WeiLayout,
+    DsLayout,
+    OutLayout,
+    InDataType,
+    WeiDataType,
+    DsDataType,
+    OutDataType,
+    ck::tensor_operation::element_wise::PassThrough,
+    ck::tensor_operation::element_wise::PassThrough,
+    Add_Mul_Activation_Mul_Clamp<Activation>>>
+{
+    using DeviceOp = DeviceGroupedConvFwdMultipleD<NumDimSpatial,
+                                                   InLayout,
+                                                   WeiLayout,
+                                                   DsLayout,
+                                                   OutLayout,
+                                                   InDataType,
+                                                   WeiDataType,
+                                                   DsDataType,
+                                                   OutDataType,
+                                                   ck::tensor_operation::element_wise::PassThrough,
+                                                   ck::tensor_operation::element_wise::PassThrough,
+                                                   Add_Mul_Activation_Mul_Clamp<Activation>>;
+
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+
+        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, GNHWC> &&
+                     is_same_v<WeiLayout, GKYXC> && is_same_v<DsLayout, GK_Tuple> &&
+                     is_same_v<OutLayout, GNHWK>)
+        {
+            if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
+                         is_same_v<DsDataType, I32_Tuple> && is_same_v<OutDataType, int8_t>)
+            {
+                if constexpr(is_same_v<Activation, TanH>)
+                {
+                    add_device_conv2d_dl_bias_tanh_perlayer_quantization_int8_instances(op_ptrs);
+                    add_device_conv2d_xdl_bias_tanh_perlayer_quantization_int8_instances(op_ptrs);
+                }
+            }
+        }
+
+        return op_ptrs;
+    }
+};
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/conv2d_quantization_common.hpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/conv2d_quantization_common.hpp
index 7729e4263..b231f8c95 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/conv2d_quantization_common.hpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/conv2d_quantization_common.hpp
@@ -25,6 +25,7 @@ using GNHWK       = ck::tensor_layout::convolution::GNHWK;
 using GK          = ck::tensor_layout::convolution::G_K;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 using Relu        = ck::tensor_operation::element_wise::Relu;
+using TanH        = ck::tensor_operation::element_wise::TanH;
 
 using GK_Tuple      = ck::Tuple<GK>;
 using GK_GK_Tuple   = ck::Tuple<GK, GK>;
@@ -32,17 +33,25 @@ using I32_Tuple     = ck::Tuple<int32_t>;
 using F32_Tuple     = ck::Tuple<float>;
 using I32_F32_Tuple = ck::Tuple<int32_t, float>;
 
+// perlayer
 using Mul_Clamp      = ck::tensor_operation::element_wise::Activation_Mul_Clamp<PassThrough>;
 using Relu_Mul_Clamp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<Relu>;
 
+// bias + perlayer
 using Add_Mul_Clamp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp<PassThrough>;
 using Add_Relu_Mul_Clamp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp<Relu>;
+using Add_Mul_TanH_Mul_Clamp =
+    ck::tensor_operation::element_wise::Add_Mul_Activation_Mul_Clamp<TanH>;
 
+// perchannel
 using Mul2_Clamp      = ck::tensor_operation::element_wise::Activation_Mul2_Clamp<PassThrough>;
 using Relu_Mul2_Clamp = ck::tensor_operation::element_wise::Activation_Mul2_Clamp<Relu>;
 
+// bias + perchannel
 using Add_Mul2_Clamp = ck::tensor_operation::element_wise::Add_Activation_Mul2_Clamp<PassThrough>;
 using Add_Relu_Mul2_Clamp = ck::tensor_operation::element_wise::Add_Activation_Mul2_Clamp<Relu>;
+using Add_Mul2_TanH_Mul_Clamp =
+    ck::tensor_operation::element_wise::Add_Mul2_Activation_Mul_Clamp<TanH>;
 
 static constexpr ck::index_t NDimSpatial = 2;
 static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perchannel_quantization_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perchannel_quantization_int8_instance.cpp
index ba2451101..ae5c1d7c3 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perchannel_quantization_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perchannel_quantization_int8_instance.cpp
@@ -76,6 +76,42 @@ void add_device_conv2d_dl_bias_relu_perchannel_quantization_int8_instances(
                                                                            ConvFwd1x1S1P0,
                                                                            4>{});
 }
+
+void add_device_conv2d_dl_bias_tanh_perchannel_quantization_int8_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
+                                                              GNHWC,
+                                                              GKYXC,
+                                                              GK_GK_Tuple,
+                                                              GNHWK,
+                                                              int8_t,
+                                                              int8_t,
+                                                              I32_F32_Tuple,
+                                                              int8_t,
+                                                              PassThrough,
+                                                              PassThrough,
+                                                              Add_Mul2_TanH_Mul_Clamp>>>& instances)
+{
+    // dl
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_dl_int8_instances<GK_GK_Tuple,
+                                                                           I32_F32_Tuple,
+                                                                           Add_Mul2_TanH_Mul_Clamp,
+                                                                           ConvFwdDefault,
+                                                                           4>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_dl_int8_instances<GK_GK_Tuple,
+                                                                           I32_F32_Tuple,
+                                                                           Add_Mul2_TanH_Mul_Clamp,
+                                                                           ConvFwd1x1P0,
+                                                                           4>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_dl_int8_instances<GK_GK_Tuple,
+                                                                           I32_F32_Tuple,
+                                                                           Add_Mul2_TanH_Mul_Clamp,
+                                                                           ConvFwd1x1S1P0,
+                                                                           4>{});
+}
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perlayer_quantization_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perlayer_quantization_int8_instance.cpp
index ea1c953bb..192d5c9a5 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perlayer_quantization_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perlayer_quantization_int8_instance.cpp
@@ -76,6 +76,43 @@ void add_device_conv2d_dl_bias_relu_perlayer_quantization_int8_instances(
                                                                            ConvFwd1x1S1P0,
                                                                            4>{});
 }
+
+void add_device_conv2d_dl_bias_tanh_perlayer_quantization_int8_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
+                                                              GNHWC,
+                                                              GKYXC,
+                                                              GK_Tuple,
+                                                              GNHWK,
+                                                              int8_t,
+                                                              int8_t,
+                                                              I32_Tuple,
+                                                              int8_t,
+                                                              PassThrough,
+                                                              PassThrough,
+                                                              Add_Mul_TanH_Mul_Clamp>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_dl_int8_instances<GK_Tuple,
+                                                                           I32_Tuple,
+                                                                           Add_Mul_TanH_Mul_Clamp,
+                                                                           ConvFwdDefault,
+                                                                           4>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_dl_int8_instances<GK_Tuple,
+                                                                           I32_Tuple,
+                                                                           Add_Mul_TanH_Mul_Clamp,
+                                                                           ConvFwd1x1P0,
+                                                                           4>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_dl_int8_instances<GK_Tuple,
+                                                                           I32_Tuple,
+                                                                           Add_Mul_TanH_Mul_Clamp,
+                                                                           ConvFwd1x1S1P0,
+                                                                           4>{});
+}
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp
index 25e2cda9c..b6e8ee159 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp
@@ -74,6 +74,41 @@ void add_device_conv2d_xdl_bias_relu_perchannel_quantization_int8_instances(
                                                                             ConvFwd1x1S1P0,
                                                                             8>{});
 }
+
+void add_device_conv2d_xdl_bias_tanh_perchannel_quantization_int8_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
+                                                              GNHWC,
+                                                              GKYXC,
+                                                              GK_GK_Tuple,
+                                                              GNHWK,
+                                                              int8_t,
+                                                              int8_t,
+                                                              I32_F32_Tuple,
+                                                              int8_t,
+                                                              PassThrough,
+                                                              PassThrough,
+                                                              Add_Mul2_TanH_Mul_Clamp>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_xdl_int8_instances<GK_GK_Tuple,
+                                                                            I32_F32_Tuple,
+                                                                            Add_Mul2_TanH_Mul_Clamp,
+                                                                            ConvFwdDefault,
+                                                                            8>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_xdl_int8_instances<GK_GK_Tuple,
+                                                                            I32_F32_Tuple,
+                                                                            Add_Mul2_TanH_Mul_Clamp,
+                                                                            ConvFwd1x1P0,
+                                                                            8>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_xdl_int8_instances<GK_GK_Tuple,
+                                                                            I32_F32_Tuple,
+                                                                            Add_Mul2_TanH_Mul_Clamp,
+                                                                            ConvFwd1x1S1P0,
+                                                                            8>{});
+}
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp
index d598d3d38..70f92cec3 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp
@@ -76,6 +76,43 @@ void add_device_conv2d_xdl_bias_relu_perlayer_quantization_int8_instances(
                                                                             ConvFwd1x1S1P0,
                                                                             8>{});
 }
+
+void add_device_conv2d_xdl_bias_tanh_perlayer_quantization_int8_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
+                                                              GNHWC,
+                                                              GKYXC,
+                                                              GK_Tuple,
+                                                              GNHWK,
+                                                              int8_t,
+                                                              int8_t,
+                                                              I32_Tuple,
+                                                              int8_t,
+                                                              PassThrough,
+                                                              PassThrough,
+                                                              Add_Mul_TanH_Mul_Clamp>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
+                                                                            I32_Tuple,
+                                                                            Add_Mul_TanH_Mul_Clamp,
+                                                                            ConvFwdDefault,
+                                                                            8>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
+                                                                            I32_Tuple,
+                                                                            Add_Mul_TanH_Mul_Clamp,
+                                                                            ConvFwd1x1P0,
+                                                                            8>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
+                                                                            I32_Tuple,
+                                                                            Add_Mul_TanH_Mul_Clamp,
+                                                                            ConvFwd1x1S1P0,
+                                                                            8>{});
+}
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
-- 
GitLab


From dbd8f94bef7882bbba2dcabd888d5754a628772f Mon Sep 17 00:00:00 2001
From: Rostyslav Geyyer <46627076+geyyer@users.noreply.github.com>
Date: Wed, 29 Mar 2023 15:05:32 -0500
Subject: [PATCH 08/71] Add a denorm test fix (#603)

* Add type_convert implementations for bf16

* Add the fix for conv_fwd

* Add the fix for conv_bwd_data

* Add the fix for conv_bwd_weight

* Format

* Format

* Another format

* Add a macro to use workaround on MI200 only

* Format

---------

Co-authored-by: Rosty Geyyer <rosty.geyyer@amd.com>
Co-authored-by: zjing14 <zhangjing14@gmail.com>
---
 .../grouped_conv_fwd_xdl_fp16.cpp             |  2 +-
 .../device_grouped_conv_fwd_multiple_d.hpp    |  2 +-
 ...ouped_conv_fwd_multiple_d_xdl_cshuffle.hpp |  2 +-
 .../gridwise_gemm_multiple_d_xdl_cshuffle.hpp | 26 +++++---
 .../gpu/grid/gridwise_gemm_pipeline_v1.hpp    |  2 +-
 .../grid/gridwise_gemm_xdlops_bwd_weight.hpp  | 60 ++++++++++---------
 .../gpu/grid/gridwise_gemm_xdlops_v2r3.hpp    | 44 ++++++++------
 .../threadwise_tensor_slice_transfer_v3r1.hpp |  2 +-
 include/ck/utility/data_type.hpp              | 56 ++++++++++++++++-
 9 files changed, 138 insertions(+), 58 deletions(-)

diff --git a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_xdl_fp16.cpp b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_xdl_fp16.cpp
index 6de1daa3d..498eda244 100644
--- a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_xdl_fp16.cpp
+++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp
index 1e2f81915..1cc30fd9e 100644
--- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp
index 8de81285d..7bab2d040 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp
index da0b0cea2..d49c96f86 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -92,6 +92,17 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
     using GridwiseGemmPipe = remove_cvref_t<decltype(
         GridwiseGemmPipeline_Selector<PipelineVer, NumGemmKPrefetchStage, LoopSched>())>;
 
+    // denorm test fix, required to work around fp16 mfma issue
+    // we convert fp16->fp32->bf16 and execute bf16 mfma instruction
+    // when mfma is fixed, remove this section and update
+    // ABDataTypeAdjusted -> ABDataType throughout this file
+#if defined(__gfx90a__)
+    using ABDataTypeAdjusted =
+        conditional_t<is_same_v<ABDataType, ck::half_t>, ck::bhalf_t, ABDataType>;
+#else
+    using ABDataTypeAdjusted = ABDataType;
+#endif
+
     __host__ __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
     {
         // A matrix in LDS memory, dst of blockwise copy
@@ -397,7 +408,7 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
                                                 ABlockTransferThreadClusterLengths_AK0_M_AK1,
                                                 ABlockTransferThreadClusterArrangeOrder,
                                                 ABDataType,
-                                                ABDataType,
+                                                ABDataTypeAdjusted,
                                                 decltype(a_grid_desc_ak0_m_ak1),
                                                 decltype(a_block_desc_ak0_m_ak1),
                                                 ABlockTransferSrcAccessOrder,
@@ -428,7 +439,7 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
                                                 BBlockTransferThreadClusterLengths_BK0_N_BK1,
                                                 BBlockTransferThreadClusterArrangeOrder,
                                                 ABDataType,
-                                                ABDataType,
+                                                ABDataTypeAdjusted,
                                                 decltype(b_grid_desc_bk0_n_bk1),
                                                 decltype(b_block_desc_bk0_n_bk1),
                                                 BBlockTransferSrcAccessOrder,
@@ -458,11 +469,11 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
         // sanity check
         constexpr index_t KPack =
             math::max(math::lcm(AK1, BK1),
-                      MfmaSelector<ABDataType, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
+                      MfmaSelector<ABDataTypeAdjusted, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
 
         auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<
             BlockSize,
-            ABDataType,
+            ABDataTypeAdjusted,
             AccDataType,
             decltype(a_block_desc_ak0_m_ak1),
             decltype(b_block_desc_bk0_n_bk1),
@@ -480,10 +491,11 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
             a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);
 
         auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<ABDataType*>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
+            static_cast<ABDataTypeAdjusted*>(p_shared),
+            a_block_desc_ak0_m_ak1.GetElementSpaceSize());
 
         auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<ABDataType*>(p_shared) + a_block_space_size_aligned,
+            static_cast<ABDataTypeAdjusted*>(p_shared) + a_block_space_size_aligned,
             b_block_desc_bk0_n_bk1.GetElementSpaceSize());
 
         constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1, 0, 0);
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp
index e9097552c..d1209636d 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp
index 126887cba..2b66898b1 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -166,15 +166,12 @@ __global__ void
                                       const CBlockClusterAdaptor c_block_cluster_adaptor)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
-    constexpr index_t shared_block_size =
-        GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
-
-    __shared__ FloatAB p_shared_block[shared_block_size];
+    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
                                                   p_b_grid,
                                                   p_c_grid,
-                                                  p_shared_block,
+                                                  p_shared,
                                                   a_b_k0_m_k1_grid_desc,
                                                   b_b_k0_n_k1_grid_desc,
                                                   c_grid_desc_mblock_mperblock_nblock_nperblock,
@@ -183,16 +180,16 @@ __global__ void
                                                   c_element_op,
                                                   c_block_cluster_adaptor);
 #else
-    ignore = p_a_grid;
-    ignore = p_b_grid;
-    ignore = p_c_grid;
-    ignore = a_b_k0_m_k1_grid_desc;
-    ignore = b_b_k0_n_k1_grid_desc;
-    ignore = c_grid_desc_mblock_mperblock_nblock_nperblock;
-    ignore = a_element_op;
-    ignore = b_element_op;
-    ignore = c_element_op;
-    ignore = c_block_cluster_adaptor;
+    ignore                = p_a_grid;
+    ignore                = p_b_grid;
+    ignore                = p_c_grid;
+    ignore                = a_b_k0_m_k1_grid_desc;
+    ignore                = b_b_k0_n_k1_grid_desc;
+    ignore                = c_grid_desc_mblock_mperblock_nblock_nperblock;
+    ignore                = a_element_op;
+    ignore                = b_element_op;
+    ignore                = c_element_op;
+    ignore                = c_block_cluster_adaptor;
 #endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
 }
 
@@ -264,6 +261,16 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
     using GridwiseGemmPipe = remove_cvref_t<decltype(
         GridwiseGemmPipeline_Selector<PipelineVer, NumGemmKPrefetchStage>())>;
 
+    // denorm test fix, required to work around fp16 mfma issue
+    // we convert fp16->fp32->bf16 and execute bf16 mfma instruction
+    // when mfma is fixed, remove this section and update
+    // FloatABAdjusted -> FloatAB throughout this file
+#if defined(__gfx90a__)
+    using FloatABAdjusted = conditional_t<is_same_v<FloatAB, ck::half_t>, ck::bhalf_t, FloatAB>;
+#else
+    using FloatABAdjusted = FloatAB;
+#endif
+
     // M0/M1/M1Padding
     static constexpr auto M1PerBlock = Number<ABlockLdsM1PerBlock>{};
     static constexpr auto M0PerBlock = Number<ABlockLdsM0PerBlock>{};
@@ -605,7 +612,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
     __device__ static void Run(const FloatAB* __restrict__ p_a_grid,
                                const FloatAB* __restrict__ p_b_grid,
                                FloatC* __restrict__ p_c_grid,
-                               FloatAB* __restrict__ p_shared_block,
+                               void* __restrict__ p_shared,
                                const AGridDesc_B_K0_M_K1& a_b_k0_m_k1_grid_desc,
                                const BGridDesc_B_K0_N_K1& b_b_k0_n_k1_grid_desc,
                                const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock&
@@ -666,7 +673,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
                                                 ABlockTransferThreadClusterLengths_K0_M_K1,
                                                 ABlockTransferThreadClusterArrangeOrder,
                                                 FloatAB,
-                                                FloatAB,
+                                                FloatABAdjusted,
                                                 decltype(a_b_k0_m_k1_grid_desc),
                                                 decltype(a_b_k0_m_k1_block_desc),
                                                 ABlockTransferSrcAccessOrder,
@@ -696,7 +703,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
                                                 BBlockTransferThreadClusterLengths_K0_N_K1,
                                                 BBlockTransferThreadClusterArrangeOrder,
                                                 FloatAB,
-                                                FloatAB,
+                                                FloatABAdjusted,
                                                 decltype(b_b_k0_n_k1_grid_desc),
                                                 decltype(b_b_k0_n_k1_block_desc),
                                                 BBlockTransferSrcAccessOrder,
@@ -725,11 +732,11 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
         // sanity check
 
         constexpr index_t KPack =
-            math::max(K1, MfmaSelector<FloatAB, MPerXDL, NPerXDL>::selected_mfma.k_per_blk);
+            math::max(K1, MfmaSelector<FloatABAdjusted, MPerXDL, NPerXDL>::selected_mfma.k_per_blk);
 
         auto blockwise_gemm =
             BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
-                                                                FloatAB,
+                                                                FloatABAdjusted,
                                                                 FloatAcc,
                                                                 decltype(a_k0_m_k1_block_desc),
                                                                 decltype(b_k0_n_k1_block_desc),
@@ -745,16 +752,15 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
         constexpr auto a_block_space_size =
             math::integer_least_multiple(a_k0_m_k1_block_desc.GetElementSpaceSize(), max_lds_align);
 
-        FloatAB* p_a_block = p_shared_block;
-        FloatAB* p_b_block = p_shared_block + a_block_space_size;
-
         constexpr auto a_block_slice_copy_step = make_multi_index(0, K0PerBlock, 0, 0);
         constexpr auto b_block_slice_copy_step = make_multi_index(0, K0PerBlock, 0, 0);
 
         auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            p_a_block, a_k0_m_k1_block_desc.GetElementSpaceSize());
+            static_cast<FloatABAdjusted*>(p_shared), a_k0_m_k1_block_desc.GetElementSpaceSize());
+
         auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            p_b_block, b_k0_n_k1_block_desc.GetElementSpaceSize());
+            static_cast<FloatABAdjusted*>(p_shared) + a_block_space_size,
+            b_k0_n_k1_block_desc.GetElementSpaceSize());
 
         // gridwise GEMM pipeline
         const index_t K0BlockMainLoop = __builtin_amdgcn_readfirstlane(K0 / K0PerBlock);
@@ -798,8 +804,6 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
             constexpr auto c_block_desc_mblock_mperblock_nblock_nperblock =
                 GetCBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock();
 
-            void* p_shared = static_cast<void*>(p_shared_block);
-
             auto c_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
                 static_cast<FloatC*>(p_shared),
                 c_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
index d1149c0c2..02b008134 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -58,16 +58,16 @@ __global__ void
                                                   c_element_op,
                                                   block_2_ctile_map);
 #else
-    ignore = p_a_grid;
-    ignore = p_b_grid;
-    ignore = p_c_grid;
-    ignore = a_grid_desc_k0_m_k1;
-    ignore = b_grid_desc_k0_n_k1;
-    ignore = c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2;
-    ignore = a_element_op;
-    ignore = b_element_op;
-    ignore = c_element_op;
-    ignore = block_2_ctile_map;
+    ignore                = p_a_grid;
+    ignore                = p_b_grid;
+    ignore                = p_c_grid;
+    ignore                = a_grid_desc_k0_m_k1;
+    ignore                = b_grid_desc_k0_n_k1;
+    ignore                = c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2;
+    ignore                = a_element_op;
+    ignore                = b_element_op;
+    ignore                = c_element_op;
+    ignore                = block_2_ctile_map;
 #endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
 }
 
@@ -131,6 +131,16 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
     using GridwiseGemmPipe = remove_cvref_t<decltype(
         GridwiseGemmPipeline_Selector<PipelineVer, NumGemmKPrefetchStage, LoopSched>())>;
 
+    // denorm test fix, required to work around fp16 mfma issue
+    // we convert fp16->fp32->bf16 and execute bf16 mfma instruction
+    // when mfma is fixed, remove this section and update
+    // FloatABAdjusted -> FloatAB throughout this file
+#if defined(__gfx90a__)
+    using FloatABAdjusted = conditional_t<is_same_v<FloatAB, ck::half_t>, ck::bhalf_t, FloatAB>;
+#else
+    using FloatABAdjusted = FloatAB;
+#endif
+
     __host__ __device__ static constexpr auto GetABlockDescriptor_K0PerBlock_MPerBlock_K1()
     {
         constexpr auto max_lds_align = K1;
@@ -281,7 +291,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
 
         using BlockwiseGemm =
             BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
-                                                                FloatAB,
+                                                                FloatABAdjusted,
                                                                 FloatAcc,
                                                                 decltype(a_block_desc_k0_m_k1),
                                                                 decltype(b_block_desc_k0_n_k1),
@@ -367,7 +377,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
                                                 ABlockTransferThreadClusterLengths_K0_M_K1,
                                                 ABlockTransferThreadClusterArrangeOrder,
                                                 FloatAB,
-                                                FloatAB,
+                                                FloatABAdjusted,
                                                 decltype(a_grid_desc_k0_m_k1),
                                                 decltype(a_block_desc_k0_m_k1),
                                                 ABlockTransferSrcAccessOrder,
@@ -398,7 +408,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
                                                 BBlockTransferThreadClusterLengths_K0_N_K1,
                                                 BBlockTransferThreadClusterArrangeOrder,
                                                 FloatAB,
-                                                FloatAB,
+                                                FloatABAdjusted,
                                                 decltype(b_grid_desc_k0_n_k1),
                                                 decltype(b_block_desc_k0_n_k1),
                                                 BBlockTransferSrcAccessOrder,
@@ -428,7 +438,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
         // sanity check
         auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<
             BlockSize,
-            FloatAB,
+            FloatABAdjusted,
             FloatAcc,
             decltype(a_block_desc_k0_m_k1),
             decltype(b_block_desc_k0_n_k1),
@@ -446,10 +456,10 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
             math::integer_least_multiple(a_block_desc_k0_m_k1.GetElementSpaceSize(), max_lds_align);
 
         auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<FloatAB*>(p_shared), a_block_desc_k0_m_k1.GetElementSpaceSize());
+            static_cast<FloatABAdjusted*>(p_shared), a_block_desc_k0_m_k1.GetElementSpaceSize());
 
         auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<FloatAB*>(p_shared) + a_block_space_size_aligned,
+            static_cast<FloatABAdjusted*>(p_shared) + a_block_space_size_aligned,
             b_block_desc_k0_n_k1.GetElementSpaceSize());
 
         constexpr auto a_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0);
diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
index bb28c194f..cba06f8e8 100644
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/data_type.hpp b/include/ck/utility/data_type.hpp
index 519003614..079b0cb86 100644
--- a/include/ck/utility/data_type.hpp
+++ b/include/ck/utility/data_type.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -1008,6 +1008,60 @@ inline __host__ __device__ constexpr bhalf_t type_convert<bhalf_t, float>(float
     return uint16_t(u.int32 >> 16);
 }
 
+// convert bfp16 to fp16 via fp32
+template <>
+inline __host__ __device__ constexpr half_t type_convert<half_t, bhalf_t>(bhalf_t x)
+{
+    float x_fp32 = type_convert<float>(x);
+
+    return static_cast<half_t>(x_fp32);
+}
+
+// convert fp16 to bfp16 via fp32
+template <>
+inline __host__ __device__ constexpr bhalf_t type_convert<bhalf_t, half_t>(half_t x)
+{
+    float x_fp32 = static_cast<float>(x);
+
+    return type_convert<bhalf_t>(x_fp32);
+}
+
+// convert bfp16 to int32 via fp32
+template <>
+inline __host__ __device__ constexpr int32_t type_convert<int32_t, bhalf_t>(bhalf_t x)
+{
+    float x_fp32 = type_convert<float>(x);
+
+    return static_cast<int32_t>(x_fp32);
+}
+
+// convert int32 to bfp16 via fp32
+template <>
+inline __host__ __device__ constexpr bhalf_t type_convert<bhalf_t, int32_t>(int32_t x)
+{
+    float x_fp32 = static_cast<float>(x);
+
+    return type_convert<bhalf_t>(x_fp32);
+}
+
+// convert bfp16 to int8 via fp32
+template <>
+inline __host__ __device__ constexpr int8_t type_convert<int8_t, bhalf_t>(bhalf_t x)
+{
+    float x_fp32 = type_convert<float>(x);
+
+    return static_cast<int8_t>(x_fp32);
+}
+
+// convert int8 to bfp16 via fp32
+template <>
+inline __host__ __device__ constexpr bhalf_t type_convert<bhalf_t, int8_t>(int8_t x)
+{
+    float x_fp32 = static_cast<float>(x);
+
+    return type_convert<bhalf_t>(x_fp32);
+}
+
 template <typename T>
 struct NumericLimits
 {
-- 
GitLab


From bb5530af91352dca062b791313d9b77700335ae9 Mon Sep 17 00:00:00 2001
From: carlushuang <carlus.huang@amd.com>
Date: Thu, 30 Mar 2023 08:03:07 +0800
Subject: [PATCH 09/71] simplify karg in device/grid of split-k op (#644)

* simplify karg in device/grid split-k op

* fix mk_kn_mn instances

* add more instances

* use name from tensor layout
---
 .../impl/device_gemm_xdl_splitk_c_shuffle.hpp | 466 ++---------------
 .../gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp  | 484 +++++++++++++-----
 ...l_splitk_f16_f16_f16_mk_kn_mn_instance.cpp |  27 +-
 3 files changed, 438 insertions(+), 539 deletions(-)

diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp
index 0d2aeaeb7..1f08cec67 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp
@@ -73,157 +73,18 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
     static constexpr auto I2 = Number<2>{};
     static constexpr auto I3 = Number<3>{};
 
-    static constexpr auto K1Number = Number<K1>{};
-
-    static auto
-    MakeAGridDescriptor_KBatch_K0_M_K1(index_t M, index_t K, index_t StrideA, int KBatch, int KPad)
-    {
-        assert(KPad % (K1 * KBatch) == 0);
-
-        const index_t K0 = KPad / (K1 * KBatch);
-
-        const auto a_grid_desc_m_k = [&]() {
-            if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1));
-            }
-            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, ALayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA));
-            }
-        }();
-
-        const auto a_grid_desc_m_kpad = transform_tensor_descriptor(
-            a_grid_desc_m_k,
-            make_tuple(make_pass_through_transform(M), make_right_pad_transform(K, KPad - K)),
-            make_tuple(Sequence<0>{}, Sequence<1>{}),
-            make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-        if constexpr(GemmSpec == GemmSpecialization::MNPadding)
-        {
-            const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock;
-            return transform_tensor_descriptor(
-                a_grid_desc_m_kpad,
-                make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1Number)),
-                           make_right_pad_transform(M, PadM)),
-                make_tuple(Sequence<1>{}, Sequence<0>{}),
-                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
-        }
-        else
-        {
-            return transform_tensor_descriptor(
-                a_grid_desc_m_kpad,
-                make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1Number)),
-                           make_pass_through_transform(M)),
-                make_tuple(Sequence<1>{}, Sequence<0>{}),
-                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
-        }
-    }
-
-    static auto
-    MakeBGridDescriptor_KBatch_K0_N_K1(index_t K, index_t N, index_t StrideB, int KBatch, int KPad)
-    {
-        assert(KPad % (K1 * KBatch) == 0);
-
-        const index_t K0 = KPad / (K1 * KBatch);
-
-        const auto b_grid_desc_k_n = [&]() {
-            if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(StrideB, I1));
-            }
-            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(I1, StrideB));
-            }
-        }();
-
-        const auto b_grid_desc_kpad_n = transform_tensor_descriptor(
-            b_grid_desc_k_n,
-            make_tuple(make_right_pad_transform(K, KPad - K), make_pass_through_transform(N)),
-            make_tuple(Sequence<0>{}, Sequence<1>{}),
-            make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-        if constexpr(GemmSpec == GemmSpecialization::MNPadding)
-        {
-            const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock;
-            return transform_tensor_descriptor(
-                b_grid_desc_kpad_n,
-                make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1Number)),
-                           make_right_pad_transform(N, PadN)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
-        }
-        else
-        {
-            return transform_tensor_descriptor(
-                b_grid_desc_kpad_n,
-                make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1Number)),
-                           make_pass_through_transform(N)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
-        }
-    }
-
-    static auto MakeCGridDescriptor_M_N(index_t M, index_t N, index_t StrideC)
-    {
-        const auto c_grid_desc_m_n = [&]() {
-            if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1));
-            }
-            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, CLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideC));
-            }
-        }();
-
-        if constexpr(GemmSpec == GemmSpecialization::MNPadding)
-        {
-            const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock;
-            const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock;
-
-            return transform_tensor_descriptor(
-                c_grid_desc_m_n,
-                make_tuple(make_right_pad_transform(M, PadM), make_right_pad_transform(N, PadN)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else
-        {
-
-            return transform_tensor_descriptor(
-                c_grid_desc_m_n,
-                make_tuple(make_pass_through_transform(M), make_pass_through_transform(N)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-    }
-
-    static auto GetKPad(index_t K, index_t KBatch)
-    {
-        const index_t K0   = math::integer_divide_ceil(K, K1 * K0PerBlock * KBatch) * K0PerBlock;
-        const index_t KPad = KBatch * K0 * K1;
-        return KPad;
-    }
-
-    using AGridDesc_K0_M_K1 = decltype(MakeAGridDescriptor_KBatch_K0_M_K1(1, 1, 1, 1, 1));
-    using BGridDesc_K0_N_K1 = decltype(MakeBGridDescriptor_KBatch_K0_N_K1(1, 1, 1, 1, 1));
-    using CGridDesc_M_N     = decltype(MakeCGridDescriptor_M_N(1, 1, 1));
-
-    // GridwiseGemm
     using GridwiseGemm = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2<
         BlockSize,
         ADataType, // TODO: distinguish A/B datatype
         AccDataType,
         CDataType,
-        InMemoryDataOperationEnum::Set,
-        AGridDesc_K0_M_K1,
-        BGridDesc_K0_N_K1,
-        CGridDesc_M_N,
+        ALayout,
+        BLayout,
+        CLayout,
         AElementwiseOperation,
         BElementwiseOperation,
         CElementwiseOperation,
+        GemmSpec,
         MPerBlock,
         NPerBlock,
         K0PerBlock,
@@ -253,236 +114,64 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
         CBlockTransferScalarPerVector_NWaveNPerXDL,
         CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock>;
 
-    // GridwiseGemm
-    using GridwiseGemmAtomicAdd = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2<
-        BlockSize,
-        ADataType, // TODO: distinguish A/B datatype
-        AccDataType,
-        CDataType,
-        InMemoryDataOperationEnum::AtomicAdd,
-        AGridDesc_K0_M_K1,
-        BGridDesc_K0_N_K1,
-        CGridDesc_M_N,
-        AElementwiseOperation,
-        BElementwiseOperation,
-        CElementwiseOperation,
-        MPerBlock,
-        NPerBlock,
-        K0PerBlock,
-        MPerXDL,
-        NPerXDL,
-        K1,
-        MXdlPerWave,
-        NXdlPerWave,
-        ABlockTransferThreadClusterLengths_K0_M_K1,
-        ABlockTransferThreadClusterArrangeOrder,
-        ABlockTransferSrcAccessOrder,
-        ABlockTransferSrcVectorDim,
-        ABlockTransferSrcScalarPerVector,
-        ABlockTransferDstScalarPerVector_K1,
-        false, // AThreadTransferSrcResetCoordinateAfterRun,
-        ABlockLdsAddExtraM,
-        BBlockTransferThreadClusterLengths_K0_N_K1,
-        BBlockTransferThreadClusterArrangeOrder,
-        BBlockTransferSrcAccessOrder,
-        BBlockTransferSrcVectorDim,
-        BBlockTransferSrcScalarPerVector,
-        BBlockTransferDstScalarPerVector_K1,
-        false, // BThreadTransferSrcResetCoordinateAfterRun,
-        BBlockLdsAddExtraN,
-        CShuffleMRepeatPerShuffle,
-        CShuffleNRepeatPerShuffle,
-        CBlockTransferScalarPerVector_NWaveNPerXDL,
-        CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock>;
-
-    using CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
-        decltype(GridwiseGemm::MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(CGridDesc_M_N{}));
-
-    using Block2CTileMap = typename GridwiseGemm::CBlockClusterAdaptor;
-
-    // Argument
-    struct Argument : public BaseArgument
-    {
-        Argument(const ADataType* p_a_grid,
-                 const BDataType* p_b_grid,
-                 CDataType* p_c_grid,
-                 index_t M,
-                 index_t N,
-                 index_t K,
-                 index_t StrideA,
-                 index_t StrideB,
-                 index_t StrideC,
-                 index_t M01,
-                 index_t N01,
-                 AElementwiseOperation a_element_op,
-                 BElementwiseOperation b_element_op,
-                 CElementwiseOperation c_element_op,
-                 index_t k_batch)
-            : p_a_grid_{p_a_grid},
-              p_b_grid_{p_b_grid},
-              p_c_grid_{p_c_grid},
-              a_grid_desc_kbatch_k0_m_k1_{},
-              b_grid_desc_kbatch_k0_n_k1_{},
-              c_grid_desc_m_n_{},
-              c_grid_desc_mblock_mperblock_nblock_nperblock_{},
-              block_2_ctile_map_{},
-              M01_{M01},
-              N01_{N01},
-              a_element_op_{a_element_op},
-              b_element_op_{b_element_op},
-              c_element_op_{c_element_op},
-              k_batch_{k_batch}
-        {
-            int KPad = DeviceGemmXdlSplitKCShuffle::GetKPad(K, k_batch_);
-
-            a_grid_desc_kbatch_k0_m_k1_ =
-                DeviceGemmXdlSplitKCShuffle::MakeAGridDescriptor_KBatch_K0_M_K1(
-                    M, K, StrideA, k_batch_, KPad);
-            b_grid_desc_kbatch_k0_n_k1_ =
-                DeviceGemmXdlSplitKCShuffle::MakeBGridDescriptor_KBatch_K0_N_K1(
-                    K, N, StrideB, k_batch_, KPad);
-            c_grid_desc_m_n_ = DeviceGemmXdlSplitKCShuffle::MakeCGridDescriptor_M_N(M, N, StrideC);
-
-            block_2_ctile_map_ =
-                GridwiseGemm::MakeCBlockClusterAdaptor(c_grid_desc_m_n_, M01, N01, k_batch_);
-
-            if(GridwiseGemm::CheckValidity(a_grid_desc_kbatch_k0_m_k1_,
-                                           b_grid_desc_kbatch_k0_n_k1_,
-                                           c_grid_desc_m_n_,
-                                           block_2_ctile_map_))
-            {
-                c_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                    GridwiseGemm::MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(c_grid_desc_m_n_);
-            }
-        }
-
-        //  private:
-        const ADataType* p_a_grid_;
-        const BDataType* p_b_grid_;
-        CDataType* p_c_grid_;
-        AGridDesc_K0_M_K1 a_grid_desc_kbatch_k0_m_k1_;
-        BGridDesc_K0_N_K1 b_grid_desc_kbatch_k0_n_k1_;
-        CGridDesc_M_N c_grid_desc_m_n_;
-        CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock_;
-        Block2CTileMap block_2_ctile_map_;
-        index_t M01_;
-        index_t N01_;
-        AElementwiseOperation a_element_op_;
-        BElementwiseOperation b_element_op_;
-        CElementwiseOperation c_element_op_;
-        index_t k_batch_;
-    };
+    using Argument = typename GridwiseGemm::Argument; // device op reuses the gridwise argument type
 
     // Invoker
     struct Invoker : public BaseInvoker
     {
-        using Argument = DeviceGemmXdlSplitKCShuffle::Argument;
 
-        void Print(const Argument& arg)
-        {
-            std::cout << "arg.a_grid_desc_kbatch_k0_m_k1_{"
-                      << arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I0) << ", "
-                      << arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I1) << ", "
-                      << arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I2) << ", "
-                      << arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I3) << "}" << std::endl;
-
-            std::cout << "arg.b_grid_desc_kbatch_k0_n_k1_{"
-                      << arg.b_grid_desc_kbatch_k0_n_k1_.GetLength(I0) << ", "
-                      << arg.b_grid_desc_kbatch_k0_n_k1_.GetLength(I1) << ", "
-                      << arg.b_grid_desc_kbatch_k0_n_k1_.GetLength(I2) << ", "
-                      << arg.b_grid_desc_kbatch_k0_n_k1_.GetLength(I3) << "}" << std::endl;
-
-            std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", "
-                      << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
-        }
+        void Print(const Argument& karg) { karg.Print(); } // forwards to Argument::Print (std::cout)
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        float Run(const Argument& karg, const StreamConfig& stream_config = StreamConfig{})
         {
             if(stream_config.log_level_ > 0)
             {
-                Print(arg);
+                Print(karg);
             }
 
-            const auto kbatch = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I0);
+            const auto kbatch = karg.k_batch;
 
-            if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_,
-                                            arg.b_grid_desc_kbatch_k0_n_k1_,
-                                            arg.c_grid_desc_m_n_,
-                                            arg.block_2_ctile_map_))
+            if(!GridwiseGemm::CheckValidity(karg))
             {
                 throw std::runtime_error(
-                    "wrong! GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 has invalid setting");
+                    "wrong! GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 has invalid "
+                    "setting");
             }
 
-            const index_t grid_size =
-                arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_);
-
-            const auto K0 = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I1);
+            index_t gdx, gdy, gdz;
+            std::tie(gdx, gdy, gdz) = GridwiseGemm::CalculateGridSize(karg);
+            const auto K0           = karg.K0;
 
             const bool has_main_k0_block_loop = GridwiseGemm::CalculateHasMainK0BlockLoop(K0);
 
             float ave_time = 0;
 
             const auto Run = [&](const auto& kernel) {
-                hipGetErrorString(hipMemset(
-                    arg.p_c_grid_,
-                    0,
-                    arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() *
-                        sizeof(CDataType)));
-
-                ave_time =
-                    launch_and_time_kernel(stream_config,
-                                           kernel,
-                                           dim3(grid_size),
-                                           dim3(BlockSize),
-                                           0,
-                                           arg.p_a_grid_,
-                                           arg.p_b_grid_,
-                                           arg.p_c_grid_,
-                                           arg.a_grid_desc_kbatch_k0_m_k1_,
-                                           arg.b_grid_desc_kbatch_k0_n_k1_,
-                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                           arg.a_element_op_,
-                                           arg.b_element_op_,
-                                           arg.c_element_op_,
-                                           arg.block_2_ctile_map_);
+                if(kbatch > 1) // AtomicAdd accumulation needs C zeroed first, on the launch stream
+                    hipGetErrorString(hipMemsetAsync(karg.p_c_grid, 0,
+                        sizeof(CDataType) * karg.M * karg.N, stream_config.stream_id_));
+                // NOTE(review): M * N elements assume packed C (no row/column gaps) - confirm
+                ave_time = launch_and_time_kernel(
+                    stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, karg);
             };
 
             if(has_main_k0_block_loop)
             {
                 if(kbatch == 1)
                 {
-                    const auto kernel = kernel_gemm_xdlops_v2r4r2<
-                        GridwiseGemm,
-                        ADataType, // TODO: distiguish A/B datatype
-                        CDataType,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::AGridDesc_K0_M_K1>,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::BGridDesc_K0_N_K1>,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::
-                                               CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
-                        AElementwiseOperation,
-                        BElementwiseOperation,
-                        CElementwiseOperation,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::Block2CTileMap>,
-                        true>;
+                    const auto kernel =
+                        kernel_gemm_xdlops_v2r4r2_simplified<GridwiseGemm,
+                                                             true,
+                                                             InMemoryDataOperationEnum::Set>;
 
                     Run(kernel);
                 }
                 else
                 {
-                    const auto kernel = kernel_gemm_xdlops_v2r4r2<
-                        GridwiseGemmAtomicAdd,
-                        ADataType, // TODO: distiguish A/B datatype
-                        CDataType,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::AGridDesc_K0_M_K1>,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::BGridDesc_K0_N_K1>,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::
-                                               CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
-                        AElementwiseOperation,
-                        BElementwiseOperation,
-                        CElementwiseOperation,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::Block2CTileMap>,
-                        true>;
+                    const auto kernel =
+                        kernel_gemm_xdlops_v2r4r2_simplified<GridwiseGemm,
+                                                             true,
+                                                             InMemoryDataOperationEnum::AtomicAdd>;
 
                     Run(kernel);
                 }
@@ -491,37 +180,19 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
             {
                 if(kbatch == 1)
                 {
-                    const auto kernel = kernel_gemm_xdlops_v2r4r2<
-                        GridwiseGemm,
-                        ADataType, // TODO: distiguish A/B datatype
-                        CDataType,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::AGridDesc_K0_M_K1>,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::BGridDesc_K0_N_K1>,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::
-                                               CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
-                        AElementwiseOperation,
-                        BElementwiseOperation,
-                        CElementwiseOperation,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::Block2CTileMap>,
-                        false>;
+                    const auto kernel =
+                        kernel_gemm_xdlops_v2r4r2_simplified<GridwiseGemm,
+                                                             false,
+                                                             InMemoryDataOperationEnum::Set>;
 
                     Run(kernel);
                 }
                 else
                 {
-                    const auto kernel = kernel_gemm_xdlops_v2r4r2<
-                        GridwiseGemmAtomicAdd,
-                        ADataType, // TODO: distiguish A/B datatype
-                        CDataType,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::AGridDesc_K0_M_K1>,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::BGridDesc_K0_N_K1>,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::
-                                               CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
-                        AElementwiseOperation,
-                        BElementwiseOperation,
-                        CElementwiseOperation,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::Block2CTileMap>,
-                        false>;
+                    const auto kernel =
+                        kernel_gemm_xdlops_v2r4r2_simplified<GridwiseGemm,
+                                                             false,
+                                                             InMemoryDataOperationEnum::AtomicAdd>;
 
                     Run(kernel);
                 }
@@ -544,12 +215,9 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
         return true;
     }
 
-    static bool IsSupportedArgument(const Argument& arg)
+    static bool IsSupportedArgument(const Argument& karg) // true iff CheckValidity accepts karg
     {
-        return GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_,
-                                           arg.b_grid_desc_kbatch_k0_n_k1_,
-                                           arg.c_grid_desc_m_n_,
-                                           arg.block_2_ctile_map_);
+        return GridwiseGemm::CheckValidity(karg);
     }
 
     // polymorphic
@@ -567,9 +235,9 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                              index_t StrideA,
                              index_t StrideB,
                              index_t StrideC,
-                             AElementwiseOperation a_element_op,
-                             BElementwiseOperation b_element_op,
-                             CElementwiseOperation c_element_op,
+                             AElementwiseOperation,
+                             BElementwiseOperation,
+                             CElementwiseOperation,
                              index_t KBatch)
     {
         return Argument{p_a,
@@ -581,11 +249,10 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                         StrideA,
                         StrideB,
                         StrideC,
-                        1,
-                        1,
-                        a_element_op,
-                        b_element_op,
-                        c_element_op,
+                        GridwiseGemm::CalculateMPadded(M),
+                        GridwiseGemm::CalculateNPadded(N),
+                        GridwiseGemm::CalculateKPadded(K, KBatch), // must equal KBatch * K0 * K1
+                        GridwiseGemm::CalculateK0(K, KBatch),
                         KBatch};
     }
 
@@ -601,9 +268,9 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                                                       index_t StrideA,
                                                       index_t StrideB,
                                                       index_t StrideC,
-                                                      AElementwiseOperation a_element_op,
-                                                      BElementwiseOperation b_element_op,
-                                                      CElementwiseOperation c_element_op,
+                                                      AElementwiseOperation,
+                                                      BElementwiseOperation,
+                                                      CElementwiseOperation,
                                                       ck::index_t KBatch = 1) override
     {
         return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
@@ -615,11 +282,10 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                                           StrideA,
                                           StrideB,
                                           StrideC,
-                                          1,
-                                          1,
-                                          a_element_op,
-                                          b_element_op,
-                                          c_element_op,
+                                          GridwiseGemm::CalculateMPadded(M),
+                                          GridwiseGemm::CalculateNPadded(N),
+                                          GridwiseGemm::CalculateKPadded(K, KBatch), // = KBatch*K0*K1
+                                          GridwiseGemm::CalculateK0(K, KBatch),
                                           KBatch);
     }
 
@@ -630,31 +296,7 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
     }
 
     // polymorphic
-    std::string GetTypeString() const override
-    {
-        auto str = std::stringstream();
-
-        // clang-format off
-        str << "DeviceGemmXdlSplitKCShuffle"
-            << "<"
-            << BlockSize << ", "
-            << MPerBlock << ", "
-            << NPerBlock << ", "
-            << K0PerBlock << ", "
-            << K1 << ", "
-            << MPerXDL << ", "
-            << NPerXDL << ", "
-            << MXdlPerWave << ", "
-            << NXdlPerWave << ", "
-            << ABlockTransferSrcScalarPerVector << ", "
-            << ABlockTransferDstScalarPerVector_K1 << ", "
-            << BBlockTransferSrcScalarPerVector << ", "
-            << BBlockTransferDstScalarPerVector_K1
-            << ">";
-        // clang-format on
-
-        return str.str();
-    }
+    std::string GetTypeString() const override { return GridwiseGemm::GetTypeString(); } // forwarded
 };
 
 } // namespace device
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
index 190194f1e..727f180e9 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
@@ -18,60 +18,23 @@
 namespace ck {
 
 template <typename GridwiseGemm,
-          typename FloatAB,
-          typename FloatC,
-          typename AGridDesc_B_K0_M_K1,
-          typename BGridDesc_B_K0_N_K1,
-          typename CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
-          typename AElementwiseOperation,
-          typename BElementwiseOperation,
-          typename CElementwiseOperation,
-          typename CBlockClusterAdaptor,
-          bool HasMainKBlockLoop>
+          bool HasMainKBlockLoop,
+          InMemoryDataOperationEnum CGlobalMemoryDataOperation>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
     __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
-        kernel_gemm_xdlops_v2r4r2(const FloatAB* __restrict__ p_a_grid,
-                                  const FloatAB* __restrict__ p_b_grid,
-                                  FloatC* __restrict__ p_c_grid,
-                                  const AGridDesc_B_K0_M_K1 a_b_k0_m_k1_grid_desc,
-                                  const BGridDesc_B_K0_N_K1 b_b_k0_n_k1_grid_desc,
-                                  const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
-                                      c_grid_desc_mblock_mperblock_nblock_nperblock,
-                                  const AElementwiseOperation a_element_op,
-                                  const BElementwiseOperation b_element_op,
-                                  const CElementwiseOperation c_element_op,
-                                  const CBlockClusterAdaptor c_block_cluster_adaptor)
+        kernel_gemm_xdlops_v2r4r2_simplified(typename GridwiseGemm::Argument karg)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
-    constexpr index_t shared_block_size =
-        GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
-
-    __shared__ FloatAB p_shared_block[shared_block_size];
-
-    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
-                                                  p_b_grid,
-                                                  p_c_grid,
-                                                  static_cast<void*>(p_shared_block),
-                                                  a_b_k0_m_k1_grid_desc,
-                                                  b_b_k0_n_k1_grid_desc,
-                                                  c_grid_desc_mblock_mperblock_nblock_nperblock,
-                                                  a_element_op,
-                                                  b_element_op,
-                                                  c_element_op,
-                                                  c_block_cluster_adaptor);
+    // shared-memory scratch is a raw byte buffer whose size is reported by the gridwise GEMM
+    constexpr index_t scratch_bytes = GridwiseGemm::GetSharedMemoryNumberOfByte();
+    __shared__ uint8_t scratch[scratch_bytes];
+
+    void* const p_scratch = static_cast<void*>(scratch);
+    GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation>(karg, p_scratch);
 #else
-    ignore = p_a_grid;
-    ignore = p_b_grid;
-    ignore = p_c_grid;
-    ignore = a_b_k0_m_k1_grid_desc;
-    ignore = b_b_k0_n_k1_grid_desc;
-    ignore = c_grid_desc_mblock_mperblock_nblock_nperblock;
-    ignore = a_element_op;
-    ignore = b_element_op;
-    ignore = c_element_op;
-    ignore = c_block_cluster_adaptor;
+    ignore = karg;
 #endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
 }
 
@@ -79,13 +42,13 @@ template <index_t BlockSize,
           typename FloatAB,
           typename FloatAcc,
           typename FloatC,
-          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
-          typename AGridDesc_B_K0_M_K1,
-          typename BGridDesc_B_K0_N_K1,
-          typename CMNGridDesc,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout,
           typename AElementwiseOperation,
           typename BElementwiseOperation,
           typename CElementwiseOperation,
+          tensor_operation::device::GemmSpecialization GemmSpec,
           index_t MPerBlock,
           index_t NPerBlock,
           index_t K0PerBlock,
@@ -126,10 +89,238 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
     static constexpr auto I7 = Number<7>{};
 
     // K1 should be Number<...>
-    static constexpr auto K1 = Number<K1Value>{};
+    static constexpr auto K1  = Number<K1Value>{};
+    static constexpr auto M01 = 1;
+    static constexpr auto N01 = 1;
 
     using ThisThreadBlock = ThisThreadBlock<BlockSize>;
 
+    struct Argument : public ck::tensor_operation::device::BaseArgument // taken by value by the kernel
+    {
+        const FloatAB* p_a_grid; // A matrix data
+        const FloatAB* p_b_grid; // B matrix data
+        FloatC* p_c_grid;        // C (output) matrix data
+        index_t M;               // problem sizes: A is (M, K), B is (K, N), C is (M, N)
+        index_t N;
+        index_t K;
+        index_t StrideA;         // strides fed to the naive A/B/C descriptors
+        index_t StrideB;
+        index_t StrideC;
+        index_t MPadded;         // padded extents, see CalculateMPadded/NPadded/KPadded
+        index_t NPadded;
+        index_t KPadded;
+        index_t K0;              // see CalculateK0
+        index_t k_batch;         // split-K batch count (third grid dimension)
+
+        Argument(const FloatAB* p_a_grid_,
+                 const FloatAB* p_b_grid_,
+                 FloatC* p_c_grid_,
+                 index_t M_,
+                 index_t N_,
+                 index_t K_,
+                 index_t StrideA_,
+                 index_t StrideB_,
+                 index_t StrideC_,
+                 index_t MPadded_,
+                 index_t NPadded_,
+                 index_t KPadded_,
+                 index_t K0_,
+                 index_t k_batch_)
+            : p_a_grid(p_a_grid_),
+              p_b_grid(p_b_grid_),
+              p_c_grid(p_c_grid_),
+              M(M_),
+              N(N_),
+              K(K_),
+              StrideA(StrideA_),
+              StrideB(StrideB_),
+              StrideC(StrideC_),
+              MPadded(MPadded_),
+              NPadded(NPadded_),
+              KPadded(KPadded_),
+              K0(K0_),
+              k_batch(k_batch_)
+        {
+        }
+
+        void Print() const // writes every scalar field (not the pointers) to std::cout
+        {
+            std::cout << "arg {"
+                      << "M:" << M << ", "
+                      << "N:" << N << ", "
+                      << "K:" << K << ", "
+                      << "SA:" << StrideA << ", "
+                      << "SB:" << StrideB << ", "
+                      << "SC:" << StrideC << ", "
+                      << "MP:" << MPadded << ", "
+                      << "NP:" << NPadded << ", "
+                      << "KP:" << KPadded << ", "
+                      << "K0:" << K0 << ", "
+                      << "KB:" << k_batch << "}" << std::endl;
+        }
+    };
+
+    __host__ __device__ static auto CalculateGridSize(const Argument& karg)
+    {
+        const auto n_tiles = math::integer_divide_ceil(karg.N, NPerBlock); // first: tiles along N
+        const auto m_tiles = math::integer_divide_ceil(karg.M, MPerBlock); // second: tiles along M
+        return std::make_tuple(n_tiles, m_tiles, karg.k_batch);            // third: split-K batches
+    }
+
+    // rounds M up to a whole number of MPerBlock tiles; prefer this to be called on host
+    __host__ __device__ static auto CalculateMPadded(index_t M)
+    {
+        return math::integer_divide_ceil(M, MPerBlock) * MPerBlock;
+    }
+
+    __host__ __device__ static auto CalculateNPadded(index_t N)
+    {
+        return math::integer_divide_ceil(N, NPerBlock) * NPerBlock; // next multiple of NPerBlock
+    }
+
+    __host__ __device__ static auto CalculateK0(index_t K, index_t K_Batch = 1)
+    {
+        // K is covered in steps of k_batch * k0_per_block * k1; the result is a multiple of K0PerBlock
+        const auto k_per_step = K_Batch * K0PerBlock * K1;
+        return math::integer_divide_ceil(K, k_per_step) * K0PerBlock;
+    }
+
+    __host__ __device__ static auto CalculateKPadded(index_t K, index_t K_Batch = 1)
+    {
+        // padded K = k_batch * K0 * K1, with K0 taken from CalculateK0 for the same batch count
+        return K_Batch * CalculateK0(K, K_Batch) * K1;
+    }
+
+    __host__ __device__ static auto MakeAGridDescriptor_KBatch_K0_M_K1(index_t M,
+                                                                       index_t MPad, // padded M
+                                                                       index_t K,
+                                                                       index_t StrideA,
+                                                                       index_t KBatch,
+                                                                       index_t K0,
+                                                                       index_t KPad) // padded K
+    {
+        const auto a_grid_desc_m_k = [&]() { // (M, K) view of A; stride placement follows ALayout
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, ALayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA));
+            }
+        }();
+
+        const auto a_grid_desc_m_kpad = transform_tensor_descriptor( // right-pad K by KPad - K
+            a_grid_desc_m_k,
+            make_tuple(make_pass_through_transform(M), make_right_pad_transform(K, KPad - K)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+        if constexpr(GemmSpec == tensor_operation::device::GemmSpecialization::MPadding ||
+                     GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
+                     GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding ||
+                     GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding)
+        {
+            // padded K unmerges into (KBatch, K0, K1) at dims 0, 1, 3; M, padded to MPad, is dim 2
+            return transform_tensor_descriptor(
+                a_grid_desc_m_kpad,
+                make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1)),
+                           make_right_pad_transform(M, MPad - M)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
+        }
+        else // GemmSpec does not pad M: same dim order, M passed through unchanged
+        {
+            return transform_tensor_descriptor(
+                a_grid_desc_m_kpad,
+                make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1)),
+                           make_pass_through_transform(M)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
+        }
+    }
+
+    __host__ __device__ static auto MakeBGridDescriptor_KBatch_K0_N_K1(index_t K,
+                                                                       index_t NPad, // padded N
+                                                                       index_t N, // NOTE(review): NPad precedes N here, unlike (M, MPad) for A - check callers
+                                                                       index_t StrideB,
+                                                                       index_t KBatch,
+                                                                       index_t K0,
+                                                                       index_t KPad) // padded K
+    {
+        const auto b_grid_desc_k_n = [&]() { // (K, N) view of B; stride placement follows BLayout
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(StrideB, I1));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(I1, StrideB));
+            }
+        }();
+
+        const auto b_grid_desc_kpad_n = transform_tensor_descriptor( // right-pad K by KPad - K
+            b_grid_desc_k_n,
+            make_tuple(make_right_pad_transform(K, KPad - K), make_pass_through_transform(N)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+        if constexpr(GemmSpec == tensor_operation::device::GemmSpecialization::NPadding ||
+                     GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
+                     GemmSpec == tensor_operation::device::GemmSpecialization::NKPadding ||
+                     GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding)
+        {
+            // padded K unmerges into (KBatch, K0, K1) at dims 0, 1, 3; N, padded to NPad, is dim 2
+            return transform_tensor_descriptor(
+                b_grid_desc_kpad_n,
+                make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1)),
+                           make_right_pad_transform(N, NPad - N)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
+        }
+        else // GemmSpec does not pad N: same dim order, N passed through unchanged
+        {
+            return transform_tensor_descriptor(
+                b_grid_desc_kpad_n,
+                make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1)),
+                           make_pass_through_transform(N)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
+        }
+    }
+
+    __host__ __device__ static auto
+    MakeCGridDescriptor_M_N(index_t M, index_t N, index_t MPad, index_t NPad, index_t StrideC)
+    {
+        const auto c_grid_desc_m_n = [&]() { // (M, N) view of C; stride placement follows CLayout
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, CLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideC));
+            }
+        }();
+
+        if constexpr(GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding)
+        { // NOTE(review): C pads only for MNPadding, while A/B also pad for M/N/MK/NK/MNK - confirm
+            return transform_tensor_descriptor(c_grid_desc_m_n,
+                                               make_tuple(make_right_pad_transform(M, MPad - M),
+                                                          make_right_pad_transform(N, NPad - N)),
+                                               make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                               make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+        else // every other GemmSpec: M and N are passed through without padding
+        {
+            return transform_tensor_descriptor(
+                c_grid_desc_m_n,
+                make_tuple(make_pass_through_transform(M), make_pass_through_transform(N)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+    }
+
     __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
     {
         constexpr auto max_lds_align = K1;
@@ -178,45 +369,68 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
                          c_block_size * sizeof(FloatC));
     }
 
-    // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
-    template <typename Block2CTileMap>
-    __host__ __device__ static constexpr bool
-    CheckValidity(const AGridDesc_B_K0_M_K1& a_b_k0_m_k1_grid_desc,
-                  const BGridDesc_B_K0_N_K1& b_b_k0_n_k1_grid_desc,
-                  const CMNGridDesc& c_m_n_grid_desc,
-                  const Block2CTileMap& block_2_ctile_map)
+    __host__ __device__ static constexpr bool CheckValidity(const Argument& karg) // true iff karg passes every divisibility check below
     {
-        static_assert(is_known_at_compile_time<remove_cv_t<decltype(K1)>>::value,
-                      "wrong! K1 need to be known at compile-time");
-
-        static_assert((MPerBlock % (MPerXDL * MRepeat) == 0) &&
-                          (NPerBlock % (NRepeat * NPerXDL)) == 0,
-                      "Invalid tuning param!");
-
-        const auto M      = a_b_k0_m_k1_grid_desc.GetLength(I2);
-        const auto N      = b_b_k0_n_k1_grid_desc.GetLength(I2);
-        const auto K0     = a_b_k0_m_k1_grid_desc.GetLength(I1);
-        const auto KBatch = a_b_k0_m_k1_grid_desc.GetLength(I0);
+        if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::MPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding))
+        { // GemmSpec is none of the M-padding variants: M must be a whole number of MPerBlock tiles
+            if(!(karg.M % MPerBlock == 0))
+                return false;
+        }
+        if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::NPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::NKPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding))
+        { // GemmSpec is none of the N-padding variants: N must be a whole number of NPerBlock tiles
+            if(!(karg.N % NPerBlock == 0))
+                return false;
+        }
 
-        if(!(M == c_m_n_grid_desc.GetLength(I0) && N == c_m_n_grid_desc.GetLength(I1) &&
-             K0 == b_b_k0_n_k1_grid_desc.GetLength(I1) &&
-             K1 == a_b_k0_m_k1_grid_desc.GetLength(I3) &&
-             K1 == b_b_k0_n_k1_grid_desc.GetLength(I3) &&
-             KBatch == b_b_k0_n_k1_grid_desc.GetLength(I0)))
-            return false;
+        if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
+        { // RowMajor A: K must divide by ABlockTransferSrcScalarPerVector (presumably the contiguous dim - confirm)
+            if(karg.K % ABlockTransferSrcScalarPerVector != 0)
+                return false;
+        }
+        else
+        { // any other ALayout: M is checked instead
+            if(karg.M % ABlockTransferSrcScalarPerVector != 0)
+                return false;
+        }
 
-        if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K0 % K0PerBlock == 0))
-            return false;
+        if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
+        { // RowMajor B: N must divide by BBlockTransferSrcScalarPerVector
+            if(karg.N % BBlockTransferSrcScalarPerVector != 0)
+                return false;
+        }
+        else
+        { // any other BLayout: K is checked instead
+            if(karg.K % BBlockTransferSrcScalarPerVector != 0)
+                return false;
+        }
 
-        if(!block_2_ctile_map.CheckValidity(c_m_n_grid_desc))
+        if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
         {
-            return false;
+            if(karg.N % CBlockTransferScalarPerVector_NWaveNPerXDL != 0) // RowMajor C: N is checked
+                return false;
+        }
+        else
+        { // any other CLayout: M is checked instead
+            if(karg.M % CBlockTransferScalarPerVector_NWaveNPerXDL != 0)
+                return false;
         }
 
-        // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
         return true;
     }
 
+    __host__ __device__ static auto GetKPad(index_t K, index_t KBatch) // K rounded up to a multiple of K1 * K0PerBlock * KBatch
+    { // K0 below is the per-batch K0 extent: a whole number of K0PerBlock steps
+        const index_t K0   = math::integer_divide_ceil(K, K1 * K0PerBlock * KBatch) * K0PerBlock;
+        const index_t KPad = KBatch * K0 * K1; // KBatch batches of K0 * K1 elements each
+        return KPad;
+    }
+
     __host__ __device__ static constexpr bool CalculateHasMainK0BlockLoop(index_t K0)
     {
         const bool has_main_k0_block_loop = K0 > K0PerBlock;
@@ -224,8 +438,9 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
         return has_main_k0_block_loop;
     }
 
+    template <typename CGridDesc>
     __host__ __device__ static constexpr auto
-    MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(const CMNGridDesc& c_m_n_grid_desc)
+    MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(const CGridDesc& c_m_n_grid_desc)
     {
         const auto M = c_m_n_grid_desc.GetLength(I0);
         const auto N = c_m_n_grid_desc.GetLength(I1);
@@ -242,10 +457,11 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
     }
 
     // return block_id to C matrix tile idx (m0, n0) mapping
+    template <typename CGridDesc> // CGridDesc: type of the C descriptor forwarded to the tile map
     __host__ __device__ static constexpr auto MakeCBlockClusterAdaptor(
-        const CMNGridDesc& c_m_n_grid_desc, index_t /* M01 */, index_t /* N01 */, index_t KBatch)
+        const CGridDesc& c_m_n_grid_desc, index_t /* M01 */, index_t /* N01 */, index_t KBatch)
     {
-        return BlockToCTileMap_KSplit_M00_N0_M01Adapt<MPerBlock, NPerBlock, CMNGridDesc>(
+        return BlockToCTileMap_KSplit_M00_N0_M01Adapt<MPerBlock, NPerBlock, CGridDesc>( // NOTE(review): M01/N01 are ignored; a literal 8 is passed as the second argument - confirm its meaning
             c_m_n_grid_desc, 8, KBatch);
     }
 
@@ -262,24 +478,25 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
                        Number<CShuffleNRepeatPerShuffle * NWave * NPerXDL>{}));
     }
 
-    using CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
-        decltype(MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(CMNGridDesc{}));
-    using CBlockClusterAdaptor = decltype(MakeCBlockClusterAdaptor(CMNGridDesc{}, 1, 1, 1));
-
-    template <bool HasMainKBlockLoop>
-    __device__ static void Run(const FloatAB* __restrict__ p_a_grid,
-                               const FloatAB* __restrict__ p_b_grid,
-                               FloatC* __restrict__ p_c_grid,
-                               void* __restrict__ p_shared_block,
-                               const AGridDesc_B_K0_M_K1& a_b_k0_m_k1_grid_desc,
-                               const BGridDesc_B_K0_N_K1& b_b_k0_n_k1_grid_desc,
-                               const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock&
-                                   c_grid_desc_mblock_mperblock_nblock_nperblock,
-                               const AElementwiseOperation& a_element_op,
-                               const BElementwiseOperation& b_element_op,
-                               const CElementwiseOperation& c_element_op,
-                               const CBlockClusterAdaptor& c_block_cluster_adaptor)
+    template <bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation>
+    __device__ static void Run(const Argument& karg, void* __restrict__ p_shared_block)
     {
+        const FloatAB* p_a_grid          = karg.p_a_grid;
+        const FloatAB* p_b_grid          = karg.p_b_grid;
+        FloatC* p_c_grid                 = karg.p_c_grid;
+        const auto a_b_k0_m_k1_grid_desc = MakeAGridDescriptor_KBatch_K0_M_K1(
+            karg.M, karg.MPadded, karg.K, karg.StrideA, karg.k_batch, karg.K0, karg.KPadded);
+        const auto b_b_k0_n_k1_grid_desc = MakeBGridDescriptor_KBatch_K0_N_K1(
+            karg.K, karg.NPadded, karg.N, karg.StrideB, karg.k_batch, karg.K0, karg.KPadded);
+        const auto c_grid_desc_m_n =
+            MakeCGridDescriptor_M_N(karg.M, karg.N, karg.MPadded, karg.NPadded, karg.StrideC);
+
+        const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
+            MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(c_grid_desc_m_n);
+        const AElementwiseOperation a_element_op = AElementwiseOperation{};
+        const BElementwiseOperation b_element_op = BElementwiseOperation{};
+        const CElementwiseOperation c_element_op = CElementwiseOperation{};
+
         const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_a_grid, a_b_k0_m_k1_grid_desc.GetElementSpaceSize());
         const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
@@ -289,26 +506,16 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
 
         const auto K0 = a_b_k0_m_k1_grid_desc.GetLength(I1);
 
-        // divide block work by [M, N]
-        const auto block_work_idx =
-            c_block_cluster_adaptor.CalculateBottomIndex(make_multi_index(get_block_1d_id()));
-
-        if(!c_block_cluster_adaptor.ValidCTileIndex(
-               make_tuple(block_work_idx[I1], block_work_idx[I2]),
-               make_tuple(c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0),
-                          c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2))))
-        {
-            return;
-        }
-
-        const index_t k_batch_id = block_work_idx[I0];
+        const index_t block_m_id = __builtin_amdgcn_readfirstlane(blockIdx.y);
+        const index_t block_n_id = __builtin_amdgcn_readfirstlane(blockIdx.x);
+        const index_t k_batch_id = __builtin_amdgcn_readfirstlane(blockIdx.z);
 
         // HACK: this force m/n_block_data_idx_on_grid into SGPR
         const index_t m_block_data_idx_on_grid =
-            __builtin_amdgcn_readfirstlane(block_work_idx[I1] * MPerBlock);
+            __builtin_amdgcn_readfirstlane(block_m_id * MPerBlock);
 
         const index_t n_block_data_idx_on_grid =
-            __builtin_amdgcn_readfirstlane(block_work_idx[I2] * NPerBlock);
+            __builtin_amdgcn_readfirstlane(block_n_id * NPerBlock);
 
         // lds max alignment
         constexpr auto max_lds_align = K1;
@@ -444,7 +651,6 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
         //     c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
         //       register
         // sanity check
-
         auto blockwise_gemm =
             BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
                                                                 FloatAB,
@@ -647,7 +853,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
                 {c_block_desc_mblock_mperblock_nblock_nperblock,
                  make_multi_index(0, 0, 0, 0),
                  c_grid_desc_mblock_mperblock_nblock_nperblock,
-                 make_multi_index(block_work_idx[I1], 0, block_work_idx[I2], 0),
+                 make_multi_index(block_m_id, 0, block_n_id, 0),
                  c_element_op};
 
             constexpr auto mxdlperwave_forward_step =
@@ -716,6 +922,48 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
             });
         }
     }
+
+    template <typename Layout> // Maps a layout tag type to a one-letter code; any unlisted layout maps to ""
+    struct LStr
+    { // NOTE(review): GetTypeString below uses ALayout::name[0] rather than LStr - possibly unused, confirm
+        static std::string Get() { return ""; }
+    };
+
+    template <> // NOTE(review): explicit specialization at class scope (CWG 727) - confirm every supported compiler accepts it
+    struct LStr<ck::tensor_layout::gemm::RowMajor>
+    {
+        static std::string Get() { return "R"; }
+    };
+
+    template <>
+    struct LStr<ck::tensor_layout::gemm::ColumnMajor>
+    {
+        static std::string Get() { return "C"; }
+    };
+
+    static std::string GetTypeString() // Human-readable name built from this instantiation's template parameters
+    { // Format: GemmXdlSplitKCShuffle_<GemmSpec>_<A><B><C>_B<BlockSize>_Vec<a>x<b>x<c>_<MPerBlock>x<NPerBlock>x<K0PerBlock>x<K1>
+        auto str = std::stringstream();
+
+        // clang-format off
+        str << "GemmXdlSplitKCShuffle_"
+            << getGemmSpecializationString(GemmSpec) << "_"
+            << std::string(ALayout::name)[0] // <A><B><C>: first character of each layout's name
+            << std::string(BLayout::name)[0]
+            << std::string(CLayout::name)[0]
+            << "_"
+            << "B" << BlockSize << "_"
+            << "Vec" << ABlockTransferSrcScalarPerVector << "x" // <a>x<b>x<c>: A-source, B-source and C scalar-per-vector settings
+            << BBlockTransferSrcScalarPerVector << "x"
+            << CBlockTransferScalarPerVector_NWaveNPerXDL << "_"
+            << MPerBlock << "x"
+            << NPerBlock << "x"
+            << K0PerBlock << "x"
+            << K1 ;
+        // clang-format on
+
+        return str.str();
+    }
 };
 
 } // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp
index 9b5ff4048..c4680db83 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp
@@ -26,7 +26,8 @@ using S = ck::Sequence<Is...>;
 
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
-static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+// static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
 
 // Compilation parameters for a[m, k] * b[k, n] = c[m, n]
 using device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances = std::tuple<
@@ -35,14 +36,22 @@ using device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances = std::tuple<
         //#########################| Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
         //#########################|     |      |      |        |        |        |        |   Operation|   Operation|   Operation|              |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
         //#########################|     |      |      |        |        |        |        |            |            |            |              |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
-        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,   256,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8>,
-        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,   128,   256,     4,  8,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8>,
-        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   128,   128,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,      true,           1,           1,                   S<1, 16, 1, 8>,               8>,
-        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,   128,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8>,
-        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   128,   128,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 32, 1, 4>,               8>,
-        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   128,    64,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,      true,           1,           1,                   S<1, 16, 1, 8>,               8>,
-        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,   128,    64,     4,  8,   32,   32,    2,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,      true,           1,           1,                   S<1, 16, 1, 4>,               8>,
-        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,    64,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8>
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,   256,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,   128,   256,     4,  8,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,   128,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,      true,           1,           1,                   S<1, 16, 1, 8>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,    64,   192,     4,  8,   32,   32,    1,    3,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 48, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,   192,    64,     4,  8,   32,   32,    3,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,   128,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,   128,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 32, 1, 4>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,    64,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,      true,           1,           1,                   S<1, 16, 1, 8>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,   128,    64,     4,  8,   32,   32,    2,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,      true,           1,           1,                   S<1, 16, 1, 4>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,    64,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,    32,   192,     4,  8,   32,   32,    1,    3,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 24, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              8,              8,      true,           1,           1,                   S<1, 16, 1, 8>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,   192,    32,     4,  8,   32,   32,    3,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,      true,           1,           1,                   S<1, 32, 1, 4>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,    32,    64,     4,  8,   32,   32,    1,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 16, 1, 8>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,    64,    32,     4,  8,   32,   32,    1,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,      true,           1,           1,                   S<1, 32, 1, 4>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,    32,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,      true,           1,           1,                   S<1, 16, 1, 8>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,      true,           1,           1,                   S<1, 32, 1, 4>,               8>
     // clang-format on
     >;
 
-- 
GitLab


From 091570f594e6e6b7109e290ce878f4f9b8ad1e9f Mon Sep 17 00:00:00 2001
From: Haocong WANG <haocwang@amd.com>
Date: Thu, 30 Mar 2023 08:03:55 +0800
Subject: [PATCH 10/71] fix 3rd dword of buffer source descriptor (#659)

---
 include/ck/ck.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp
index 5009dec5e..9853f19a4 100644
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -36,7 +36,7 @@
 #elif defined(__gfx1030__) // for GPU code
 #define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
 #elif defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) // for GPU code
-#define CK_BUFFER_RESOURCE_3RD_DWORD 0x10020000
+#define CK_BUFFER_RESOURCE_3RD_DWORD 0x31004000
 #endif
 
 // FMA instruction
-- 
GitLab


From fde6d2742bc7fe56bb86864ff960f8ea26082dfc Mon Sep 17 00:00:00 2001
From: zjing14 <zhangjing14@gmail.com>
Date: Thu, 30 Mar 2023 13:30:43 -0500
Subject: [PATCH 11/71] add fp64 instances (#658)

Co-authored-by: root <root@ctr-ubbsmc15.amd.com>
---
 client_example/04_contraction/CMakeLists.txt  |  14 +-
 ...near.cpp => contraction_bilinear_fp32.cpp} |   0
 .../contraction_bilinear_fp64.cpp             | 281 ++++++++++++++++++
 ...n_scale.cpp => contraction_scale_fp32.cpp} |   0
 .../04_contraction/contraction_scale_fp64.cpp | 270 +++++++++++++++++
 .../device_operation_instance_factory.hpp     |   1 +
 .../gpu/contraction_bilinear.hpp              |  66 ++++
 .../gpu/contraction_scale.hpp                 |  66 ++++
 .../gpu/contraction_bilinear/CMakeLists.txt   |   6 +
 ..._shuffle_f64_f64_f64_f64_kknn_instance.cpp |  76 +++++
 ..._shuffle_f64_f64_f64_f64_knnn_instance.cpp |  76 +++++
 ..._shuffle_f64_f64_f64_f64_mknn_instance.cpp |  76 +++++
 ..._shuffle_f64_f64_f64_f64_mnnn_instance.cpp |  76 +++++
 .../gpu/contraction_scale/CMakeLists.txt      |   6 +
 ...xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp |  75 +++++
 ...xdl_c_shuffle_f64_f64_f64_knn_instance.cpp |  75 +++++
 ...xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp |  75 +++++
 ...xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp |  75 +++++
 18 files changed, 1310 insertions(+), 4 deletions(-)
 rename client_example/04_contraction/{contraction_bilinear.cpp => contraction_bilinear_fp32.cpp} (100%)
 create mode 100644 client_example/04_contraction/contraction_bilinear_fp64.cpp
 rename client_example/04_contraction/{contraction_scale.cpp => contraction_scale_fp32.cpp} (100%)
 create mode 100644 client_example/04_contraction/contraction_scale_fp64.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp

diff --git a/client_example/04_contraction/CMakeLists.txt b/client_example/04_contraction/CMakeLists.txt
index 971d5d9f1..7ffedfeef 100644
--- a/client_example/04_contraction/CMakeLists.txt
+++ b/client_example/04_contraction/CMakeLists.txt
@@ -1,8 +1,14 @@
-add_executable(client_contraction_scale contraction_scale.cpp)
-target_link_libraries(client_contraction_scale PRIVATE composable_kernel::device_operations)
+add_executable(client_contraction_scale_fp32 contraction_scale_fp32.cpp)
+target_link_libraries(client_contraction_scale_fp32 PRIVATE composable_kernel::device_operations)
 
-add_executable(client_contraction_bilinear contraction_bilinear.cpp)
-target_link_libraries(client_contraction_bilinear PRIVATE composable_kernel::device_operations)
+add_executable(client_contraction_bilinear_fp32 contraction_bilinear_fp32.cpp)
+target_link_libraries(client_contraction_bilinear_fp32 PRIVATE composable_kernel::device_operations)
+
+add_executable(client_contraction_scale_fp64 contraction_scale_fp64.cpp)
+target_link_libraries(client_contraction_scale_fp64 PRIVATE composable_kernel::device_operations)
+
+add_executable(client_contraction_bilinear_fp64 contraction_bilinear_fp64.cpp)
+target_link_libraries(client_contraction_bilinear_fp64 PRIVATE composable_kernel::device_operations)
 
 add_executable(contraction_g1m2n3k1_add_xdl_fp16 contraction_g1m2n3k1_add_xdl_fp16.cpp)
 target_link_libraries(contraction_g1m2n3k1_add_xdl_fp16 PRIVATE composable_kernel::device_operations)
diff --git a/client_example/04_contraction/contraction_bilinear.cpp b/client_example/04_contraction/contraction_bilinear_fp32.cpp
similarity index 100%
rename from client_example/04_contraction/contraction_bilinear.cpp
rename to client_example/04_contraction/contraction_bilinear_fp32.cpp
diff --git a/client_example/04_contraction/contraction_bilinear_fp64.cpp b/client_example/04_contraction/contraction_bilinear_fp64.cpp
new file mode 100644
index 000000000..9238e4cd8
--- /dev/null
+++ b/client_example/04_contraction/contraction_bilinear_fp64.cpp
@@ -0,0 +1,281 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iomanip>
+#include <numeric>
+#include <vector>
+#include <iostream>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp"
+#include "ck/library/utility/numeric.hpp"
+
+using F64 = double;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Bilinear    = ck::tensor_operation::element_wise::Bilinear;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = Bilinear;
+
+using ADataType        = F64;
+using BDataType        = F64;
+using AccDataType      = F64;
+using CShuffleDataType = F64;
+using DDataType        = F64;
+using DsDataType       = ck::Tuple<DDataType>;
+using EDataType        = F64;
+
+static constexpr ck::index_t NumDimM = 2;
+static constexpr ck::index_t NumDimN = 2;
+static constexpr ck::index_t NumDimK = 2;
+
+// Owns one device allocation: hipMalloc in the constructor, hipFree in the destructor.
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem()                       = delete;
+    SimpleDeviceMem(const SimpleDeviceMem&) = delete; // a copy would hipFree p_mem_ twice
+    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;
+    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size); // status deliberately ignored
+    }
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* GetDeviceBuffer() { return p_mem_; } // raw device pointer, still owned by this object
+    void* p_mem_;
+};
+
+// Profiles every fp64 contraction + bilinear instance on one problem and reports the fastest.
+// Usage: no arguments for the default problem, or the 24 arguments listed in the help text.
+int main(int argc, char* argv[])
+{
+// kknn
+#if 1
+    // A[M0, M1, K0, K1]
+    std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> a_ms_ks_strides{524288, 4096, 128, 1};
+    // B[N0, N1, K0, K1]
+    std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
+    std::vector<ck::index_t> b_ns_ks_strides{524288, 4096, 128, 1};
+    // D[M0, M1, N0, N1]
+    std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
+    // E[M0, M1, N0, N1]
+    std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
+// knnn
+#elif 0
+    // A[M0, M1, K0, K1]
+    std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> a_ms_ks_strides{524288, 4096, 128, 1};
+    // B[N0, N1, K0, K1]
+    std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
+    std::vector<ck::index_t> b_ns_ks_strides{64, 1, 131072, 2048};
+    // D[M0, M1, N0, N1]
+    std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
+    // E[M0, M1, N0, N1]
+    std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
+// mknn
+#elif 0
+    // A[M0, M1, K0, K1]
+    std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> a_ms_ks_strides{128, 1, 245760, 3840};
+    // B[N0, N1, K0, K1]
+    std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
+    std::vector<ck::index_t> b_ns_ks_strides{524288, 4096, 128, 1};
+    // D[M0, M1, N0, N1]
+    std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
+    // E[M0, M1, N0, N1]
+    std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
+// mnnn
+#elif 0
+    // A[M0, M1, K0, K1]
+    std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> a_ms_ks_strides{128, 1, 245760, 3840};
+    // B[N0, N1, K0, K1]
+    std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
+    std::vector<ck::index_t> b_ns_ks_strides{64, 1, 131072, 2048};
+    // D[M0, M1, N0, N1]
+    std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
+    // E[M0, M1, N0, N1]
+    std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
+#endif
+
+    float alpha = 1.f;
+    float beta  = 1.f;
+
+    if(argc == 25)
+    {
+        const ck::index_t M0 = std::stoi(argv[1]);
+        const ck::index_t M1 = std::stoi(argv[2]);
+
+        const ck::index_t N0 = std::stoi(argv[3]);
+        const ck::index_t N1 = std::stoi(argv[4]);
+
+        const ck::index_t K0 = std::stoi(argv[5]);
+        const ck::index_t K1 = std::stoi(argv[6]);
+
+        a_ms_ks_lengths = {M0, M1, K0, K1};
+        a_ms_ks_strides = {
+            std::stoi(argv[7]), std::stoi(argv[8]), std::stoi(argv[9]), std::stoi(argv[10])};
+
+        b_ns_ks_lengths = {N0, N1, K0, K1};
+        b_ns_ks_strides = {
+            std::stoi(argv[11]), std::stoi(argv[12]), std::stoi(argv[13]), std::stoi(argv[14])};
+
+        d_ms_ns_lengths = {M0, M1, N0, N1};
+        d_ms_ns_strides = {
+            std::stoi(argv[15]), std::stoi(argv[16]), std::stoi(argv[17]), std::stoi(argv[18])};
+
+        e_ms_ns_lengths = {M0, M1, N0, N1};
+        e_ms_ns_strides = {
+            std::stoi(argv[19]), std::stoi(argv[20]), std::stoi(argv[21]), std::stoi(argv[22])};
+
+        alpha = std::stof(argv[23]);
+        beta  = std::stof(argv[24]);
+    }
+    else if(argc != 1) // argc == 1 keeps the default problem defined above
+    {
+        printf("arg1 to 6: M0, M1, N0, N1, K0, K1\n");
+        printf("arg7 to 10: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1\n");
+        printf("arg11 to 14: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1\n");
+        printf("arg15 to 18: Stride_D_M0, Stride_D_M1, Stride_D_N0, Stride_D_N1\n");
+        printf("arg19 to 22: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1\n");
+        printf("arg23 to 24: alpha, beta\n");
+        exit(0);
+    }
+    // Element count spanned by a strided tensor; multiply in std::size_t to avoid int overflow.
+    auto f_tensor_space_size = [](const auto& lengths, const auto& strides) {
+        std::size_t space_size = 1;
+        for(std::size_t i = 0; i < lengths.size(); ++i)
+        {
+            space_size += static_cast<std::size_t>(lengths[i] - 1) * strides[i];
+        }
+        return space_size;
+    };
+
+    SimpleDeviceMem a_device_buf(sizeof(ADataType) *
+                                 f_tensor_space_size(a_ms_ks_lengths, a_ms_ks_strides));
+    SimpleDeviceMem b_device_buf(sizeof(BDataType) *
+                                 f_tensor_space_size(b_ns_ks_lengths, b_ns_ks_strides));
+    SimpleDeviceMem d_device_buf(sizeof(DDataType) *
+                                 f_tensor_space_size(d_ms_ns_lengths, d_ms_ns_strides));
+    SimpleDeviceMem e_device_buf(sizeof(EDataType) *
+                                 f_tensor_space_size(e_ms_ns_lengths, e_ms_ns_strides));
+
+    using DeviceOp = ck::tensor_operation::device::DeviceContractionMultipleD<
+        NumDimM,
+        NumDimN,
+        NumDimK,
+        ADataType,
+        BDataType,
+        ck::Tuple<DDataType>,
+        EDataType,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::Bilinear>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    const auto a_element_op   = AElementOp{};
+    const auto b_element_op   = BElementOp{};
+    const auto cde_element_op = CDEElementOp{alpha, beta};
+
+    std::string best_op_name;
+    bool found            = false;
+    float best_ave_time   = 0;
+    float best_tflops     = 0;
+    float best_gb_per_sec = 0;
+
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+
+    for(const auto& op_ptr : op_ptrs)
+    {
+        auto argument_ptr =
+            op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+                                        b_device_buf.GetDeviceBuffer(),
+                                        std::array<const void*, 1>{d_device_buf.GetDeviceBuffer()},
+                                        e_device_buf.GetDeviceBuffer(),
+                                        a_ms_ks_lengths,
+                                        a_ms_ks_strides,
+                                        b_ns_ks_lengths,
+                                        b_ns_ks_strides,
+                                        std::array<std::vector<ck::index_t>, 1>{d_ms_ns_lengths},
+                                        std::array<std::vector<ck::index_t>, 1>{d_ms_ns_strides},
+                                        e_ms_ns_lengths,
+                                        e_ms_ns_strides,
+                                        a_element_op,
+                                        b_element_op,
+                                        cde_element_op);
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        std::string op_name = op_ptr->GetTypeString();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+            ck::index_t M = ck::accumulate_n<ck::index_t>(
+                e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{});
+
+            ck::index_t N = ck::accumulate_n<ck::index_t>(
+                e_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{});
+
+            ck::index_t K = ck::accumulate_n<ck::index_t>(
+                a_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{});
+
+            std::size_t flop      = std::size_t(2) * M * N * K;
+            std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
+                                    sizeof(DDataType) * M * N + sizeof(EDataType) * M * N;
+
+            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+            float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+                      << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+            if(tflops > best_tflops)
+            {
+                found           = true;
+                best_op_name    = op_name;
+                best_tflops     = tflops;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+
+    if(!found) // nothing ran, so there is no meaningful "best" line to print
+    {
+        std::cout << "no instance supports this problem" << std::endl;
+        return 0;
+    }
+
+    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
+              << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+    return 0;
+}
diff --git a/client_example/04_contraction/contraction_scale.cpp b/client_example/04_contraction/contraction_scale_fp32.cpp
similarity index 100%
rename from client_example/04_contraction/contraction_scale.cpp
rename to client_example/04_contraction/contraction_scale_fp32.cpp
diff --git a/client_example/04_contraction/contraction_scale_fp64.cpp b/client_example/04_contraction/contraction_scale_fp64.cpp
new file mode 100644
index 000000000..3c36aa21e
--- /dev/null
+++ b/client_example/04_contraction/contraction_scale_fp64.cpp
@@ -0,0 +1,270 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iomanip>
+#include <numeric>
+#include <vector>
+#include <iostream>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/contraction_scale.hpp"
+#include "ck/library/utility/numeric.hpp"
+
+using F64 = double;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Scale       = ck::tensor_operation::element_wise::Scale;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = Scale;
+
+using ADataType        = F64;
+using BDataType        = F64;
+using AccDataType      = F64;
+using CShuffleDataType = F64;
+using DsDataType       = ck::Tuple<>;
+using EDataType        = F64;
+
+static constexpr ck::index_t NumDimM = 2;
+static constexpr ck::index_t NumDimN = 2;
+static constexpr ck::index_t NumDimK = 2;
+
+// Owns one device allocation: hipMalloc in the constructor, hipFree in the destructor.
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem()                       = delete;
+    SimpleDeviceMem(const SimpleDeviceMem&) = delete; // a copy would hipFree p_mem_ twice
+    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;
+    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size); // status deliberately ignored
+    }
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* GetDeviceBuffer() { return p_mem_; } // raw device pointer, still owned by this object
+    void* p_mem_;
+};
+
+// Profiles every fp64 contraction + scale instance on one problem and reports the fastest.
+// Usage: no arguments for the default problem, or the 19 arguments listed in the help text.
+int main(int argc, char* argv[])
+{
+// kkn
+#if 1
+    // A[M0, M1, K0, K1]
+    std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> a_ms_ks_strides{524288, 4096, 128, 1};
+    // B[N0, N1, K0, K1]
+    std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
+    std::vector<ck::index_t> b_ns_ks_strides{524288, 4096, 128, 1};
+    // D[M0, M1, N0, N1]
+    std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
+    // E[M0, M1, N0, N1]
+    std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
+// knn
+#elif 0
+    // A[M0, M1, K0, K1]
+    std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> a_ms_ks_strides{524288, 4096, 128, 1};
+    // B[N0, N1, K0, K1]
+    std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
+    std::vector<ck::index_t> b_ns_ks_strides{64, 1, 131072, 2048};
+    // D[M0, M1, N0, N1]
+    std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
+    // E[M0, M1, N0, N1]
+    std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
+// mkn
+#elif 0
+    // A[M0, M1, K0, K1]
+    std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> a_ms_ks_strides{128, 1, 245760, 3840};
+    // B[N0, N1, K0, K1]
+    std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
+    std::vector<ck::index_t> b_ns_ks_strides{524288, 4096, 128, 1};
+    // D[M0, M1, N0, N1]
+    std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
+    // E[M0, M1, N0, N1]
+    std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
+// mnn
+#elif 0
+    // A[M0, M1, K0, K1]
+    std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> a_ms_ks_strides{128, 1, 245760, 3840};
+    // B[N0, N1, K0, K1]
+    std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
+    std::vector<ck::index_t> b_ns_ks_strides{64, 1, 131072, 2048};
+    // D[M0, M1, N0, N1]
+    std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
+    // E[M0, M1, N0, N1]
+    std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
+#endif
+
+    float scale = 1.f;
+
+    if(argc == 20)
+    {
+        const ck::index_t M0 = std::stoi(argv[1]);
+        const ck::index_t M1 = std::stoi(argv[2]);
+
+        const ck::index_t N0 = std::stoi(argv[3]);
+        const ck::index_t N1 = std::stoi(argv[4]);
+
+        const ck::index_t K0 = std::stoi(argv[5]);
+        const ck::index_t K1 = std::stoi(argv[6]);
+
+        a_ms_ks_lengths = {M0, M1, K0, K1};
+        a_ms_ks_strides = {
+            std::stoi(argv[7]), std::stoi(argv[8]), std::stoi(argv[9]), std::stoi(argv[10])};
+
+        b_ns_ks_lengths = {N0, N1, K0, K1};
+        b_ns_ks_strides = {
+            std::stoi(argv[11]), std::stoi(argv[12]), std::stoi(argv[13]), std::stoi(argv[14])};
+
+        e_ms_ns_lengths = {M0, M1, N0, N1};
+        e_ms_ns_strides = {
+            std::stoi(argv[15]), std::stoi(argv[16]), std::stoi(argv[17]), std::stoi(argv[18])};
+
+        scale = std::stof(argv[19]);
+    }
+    else if(argc != 1) // argc == 1 keeps the default problem defined above
+    {
+        printf("arg1 to 6: M0, M1, N0, N1, K0, K1\n");
+        printf("arg7 to 10: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1\n");
+        printf("arg11 to 14: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1\n");
+        printf("arg15 to 18: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1\n");
+        printf("arg19: scale\n");
+        exit(0);
+    }
+    // Element count spanned by a strided tensor; multiply in std::size_t to avoid int overflow.
+    auto f_tensor_space_size = [](const auto& lengths, const auto& strides) {
+        std::size_t space_size = 1;
+        for(std::size_t i = 0; i < lengths.size(); ++i)
+        {
+            space_size += static_cast<std::size_t>(lengths[i] - 1) * strides[i];
+        }
+        return space_size;
+    };
+
+    SimpleDeviceMem a_device_buf(sizeof(ADataType) *
+                                 f_tensor_space_size(a_ms_ks_lengths, a_ms_ks_strides));
+    SimpleDeviceMem b_device_buf(sizeof(BDataType) *
+                                 f_tensor_space_size(b_ns_ks_lengths, b_ns_ks_strides));
+    SimpleDeviceMem e_device_buf(sizeof(EDataType) *
+                                 f_tensor_space_size(e_ms_ns_lengths, e_ms_ns_strides));
+
+    using DeviceOp = ck::tensor_operation::device::DeviceContractionMultipleD<
+        NumDimM,
+        NumDimN,
+        NumDimK,
+        ADataType,
+        BDataType,
+        ck::Tuple<>,
+        EDataType,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::Scale>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    const auto a_element_op   = AElementOp{};
+    const auto b_element_op   = BElementOp{};
+    const auto cde_element_op = CDEElementOp{scale};
+
+    std::string best_op_name;
+    bool found            = false;
+    float best_ave_time   = 0;
+    float best_tflops     = 0;
+    float best_gb_per_sec = 0;
+
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+
+    for(const auto& op_ptr : op_ptrs)
+    {
+        auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+                                                        b_device_buf.GetDeviceBuffer(),
+                                                        std::array<const void*, 0>{},
+                                                        e_device_buf.GetDeviceBuffer(),
+                                                        a_ms_ks_lengths,
+                                                        a_ms_ks_strides,
+                                                        b_ns_ks_lengths,
+                                                        b_ns_ks_strides,
+                                                        std::array<std::vector<ck::index_t>, 0>{},
+                                                        std::array<std::vector<ck::index_t>, 0>{},
+                                                        e_ms_ns_lengths,
+                                                        e_ms_ns_strides,
+                                                        a_element_op,
+                                                        b_element_op,
+                                                        cde_element_op);
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        std::string op_name = op_ptr->GetTypeString();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+            ck::index_t M = ck::accumulate_n<ck::index_t>(
+                e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{});
+
+            ck::index_t N = ck::accumulate_n<ck::index_t>(
+                e_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{});
+
+            ck::index_t K = ck::accumulate_n<ck::index_t>(
+                a_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{});
+
+            std::size_t flop = std::size_t(2) * M * N * K;
+            std::size_t num_btype =
+                sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
+
+            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+            float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+                      << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+            if(tflops > best_tflops)
+            {
+                found           = true;
+                best_op_name    = op_name;
+                best_tflops     = tflops;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+
+    if(!found) // nothing ran, so there is no meaningful "best" line to print
+    {
+        std::cout << "no instance supports this problem" << std::endl;
+        return 0;
+    }
+
+    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
+              << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+    return 0;
+}
diff --git a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
index 0bde4919a..f176cb91e 100644
--- a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
+++ b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
@@ -26,6 +26,7 @@ using Empty_Tuple = ck::Tuple<>;
 using F16_Tuple     = ck::Tuple<F16>;
 using F16_F16_Tuple = ck::Tuple<F16, F16>;
 
+using F64_Tuple     = ck::Tuple<F64>;
 using F32_Tuple     = ck::Tuple<F32>;
 using I32_Tuple     = ck::Tuple<I32>;
 using I32_F32_Tuple = ck::Tuple<I32, F32>;
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp b/library/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp
index a0cea7e39..c116d999d 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp
@@ -19,6 +19,7 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
+// float
 void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance(
     std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
                                                            2,
@@ -67,6 +68,55 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn
                                                            PassThrough,
                                                            Bilinear>>>& instances);
 
+// double
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F64,
+                                                           F64,
+                                                           F64_Tuple,
+                                                           F64,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear>>>& instances);
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F64,
+                                                           F64,
+                                                           F64_Tuple,
+                                                           F64,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear>>>& instances);
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F64,
+                                                           F64,
+                                                           F64_Tuple,
+                                                           F64,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear>>>& instances);
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F64,
+                                                           F64,
+                                                           F64_Tuple,
+                                                           F64,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear>>>& instances);
+
 // Contraction + Bilinear
 template <index_t NumDimM,
           index_t NumDimN,
@@ -118,6 +168,22 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
             }
         }
 
+        if constexpr(is_same_v<ADataType, double> && is_same_v<BDataType, double> &&
+                     is_same_v<DDataType, double> && is_same_v<EDataType, double>)
+        {
+            if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2)
+            {
+                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance(
+                    op_ptrs);
+                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance(
+                    op_ptrs);
+                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance(
+                    op_ptrs);
+                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance(
+                    op_ptrs);
+            }
+        }
+
         return op_ptrs;
     }
 };
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp b/library/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp
index e921ecd47..e3f07606c 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp
@@ -19,6 +19,7 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
+// float
 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance(
     std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
                                                            2,
@@ -67,6 +68,55 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instanc
                                                            PassThrough,
                                                            Scale>>>& instances);
 
+// double
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F64,
+                                                           F64,
+                                                           Empty_Tuple,
+                                                           F64,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale>>>& instances);
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F64,
+                                                           F64,
+                                                           Empty_Tuple,
+                                                           F64,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale>>>& instances);
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F64,
+                                                           F64,
+                                                           Empty_Tuple,
+                                                           F64,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale>>>& instances);
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F64,
+                                                           F64,
+                                                           Empty_Tuple,
+                                                           F64,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale>>>& instances);
+
 // Contraction + Scale
 template <index_t NumDimM,
           index_t NumDimN,
@@ -117,6 +167,22 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
             }
         }
 
+        if constexpr(is_same_v<ADataType, double> && is_same_v<BDataType, double> &&
+                     is_same_v<EDataType, double>)
+        {
+            if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2)
+            {
+                add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance(
+                    op_ptrs);
+                add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance(
+                    op_ptrs);
+                add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance(
+                    op_ptrs);
+                add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance(
+                    op_ptrs);
+            }
+        }
+
         return op_ptrs;
     }
 };
diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/contraction_bilinear/CMakeLists.txt
index ffd6a6a7b..d2a0a3d0f 100644
--- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/CMakeLists.txt
@@ -1,7 +1,13 @@
 add_instance_library(device_contraction_bilinear_instance
+    #float
     device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp
     device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp
     device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp
     device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp
+    #double
+    device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp
+    device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp
+    device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp
+    device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp
 )
 
diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp
new file mode 100644
index 000000000..093b2f0e9
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
+// setting. Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F64       = double;
+using F64_Tuple = ck::Tuple<F64>;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Bilinear    = ck::tensor_operation::element_wise::Bilinear;
+
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+// k/k/n/n are the fast-changing dimensions for A/B/D/E, respectively
+using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance = std::tuple<
+    // clang-format off
+        //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle|    DsData| EData|            A|           B|         CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //#####################################|        |        |        |  Type|  Type|    Type| DataType|      Type|  Type|  Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //#####################################|        |        |        |      |      |        |         |          |      |    Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //#####################################|        |        |        |      |      |        |         |          |      |             |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, F64_Tuple,   F64,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   256,   128,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, F64_Tuple,   F64,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   128,   128,    64,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1, 16, 1,  8>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, F64_Tuple,   F64,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   128,    64,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1,  8, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, F64_Tuple,   F64,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,    64,    64,    64,    16,   2,   2,   16,   16,    4,    4,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1,  8, 1,  8>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, F64_Tuple,   F64,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   256,   128,    64,    16,   2,   2,   16,   16,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, F64_Tuple,   F64,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   256,    64,   128,    16,   2,   2,   16,   16,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, F64_Tuple,   F64,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   128,   128,    32,    16,   2,   2,   16,   16,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1, 16, 1,  8>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, F64_Tuple,   F64,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   128,    32,   128,    16,   2,   2,   16,   16,    2,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1,  8, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, F64_Tuple,   F64,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,    64,    64,    32,    16,   2,   2,   16,   16,    4,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1,  8, 1,  8>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, F64_Tuple,   F64,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,    64,    32,    64,    16,   2,   2,   16,   16,    2,    4,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1,  8, 1,  8>,               1>
+    // clang-format on
+    >;
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F64,
+                                                           F64,
+                                                           F64_Tuple,
+                                                           F64,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp
new file mode 100644
index 000000000..0f683e5c2
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
+// setting. Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F64       = double;
+using F64_Tuple = ck::Tuple<F64>;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Bilinear    = ck::tensor_operation::element_wise::Bilinear;
+
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+// k/n/n/n are the fast-changing dimensions for A/B/D/E, respectively
+using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance = std::tuple<
+    // clang-format off
+        //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle|    DsData| EData|            A|           B|         CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //#####################################|        |        |        |  Type|  Type|    Type| DataType|      Type|  Type|  Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //#####################################|        |        |        |      |      |        |         |          |      |    Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //#####################################|        |        |        |      |      |        |         |          |      |             |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, F64_Tuple,   F64,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   256,   128,   128,    16,   2,   1,   16,   16,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, F64_Tuple,   F64,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   256,   128,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, F64_Tuple,   F64,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   128,   128,    64,    16,   2,   1,   16,   16,    4,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1,  8>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, F64_Tuple,   F64,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   128,   128,    64,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1,  8>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, F64_Tuple,   F64,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   128,    64,   128,    16,   2,   1,   16,   16,    4,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1,  8, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, F64_Tuple,   F64,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   128,    64,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1,  8, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, F64_Tuple,   F64,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   256,   128,    64,    16,   2,   1,   16,   16,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, F64_Tuple,   F64,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   256,   128,    64,    16,   2,   2,   16,   16,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, F64_Tuple,   F64,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   256,    64,   128,    16,   2,   1,   16,   16,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, F64_Tuple,   F64,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   256,    64,   128,    16,   2,   2,   16,   16,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1>
+    // clang-format on
+    >;
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F64,
+                                                           F64,
+                                                           F64_Tuple,
+                                                           F64,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp
new file mode 100644
index 000000000..e384993ae
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
+// setting. Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F64       = double;
+using F64_Tuple = ck::Tuple<F64>;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Bilinear    = ck::tensor_operation::element_wise::Bilinear;
+
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+// m/k/n/n are the fast-changing dimensions for A/B/D/E, respectively
+using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance = std::tuple<
+    // clang-format off
+        //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle|    DsData| EData|            A|           B|         CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|  CBlockTransferClusterLengths|  CBlockTransfer|
+        //#####################################|        |        |        |  Type|  Type|    Type| DataType|      Type|  Type|  Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|          _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //#####################################|        |        |        |      |      |        |         |          |      |    Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|          _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //#####################################|        |        |        |      |      |        |         |          |      |             |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                              |                |
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, F64_Tuple,   F64,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   256,   128,   128,    16,   1,   2,   16,   16,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, F64_Tuple,   F64,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   256,   128,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, F64_Tuple,   F64,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   128,   128,    64,    16,   1,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1,  8>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, F64_Tuple,   F64,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   128,   128,    64,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1,  8>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, F64_Tuple,   F64,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   128,    64,   128,    16,   1,   2,   16,   16,    4,    4,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1,  8, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, F64_Tuple,   F64,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   128,    64,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1,  8, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, F64_Tuple,   F64,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   256,   128,    64,    16,   1,   2,   16,   16,    4,    2,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, F64_Tuple,   F64,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   256,   128,    64,    16,   2,   2,   16,   16,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, F64_Tuple,   F64,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   256,    64,   128,    16,   1,   2,   16,   16,    2,    4,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, F64_Tuple,   F64,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   256,    64,   128,    16,   2,   2,   16,   16,    2,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 16>,               1>
+    // clang-format on
+    >;
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F64,
+                                                           F64,
+                                                           F64_Tuple,
+                                                           F64,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp
new file mode 100644
index 000000000..92e39c173
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+// This #define is a hack to use customized behavior for buffer load rather than using the default
+// setting. Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F64       = double;
+using F64_Tuple = ck::Tuple<F64>;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Bilinear    = ck::tensor_operation::element_wise::Bilinear;
+
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+// m/n/n/n are the fast changing dimensions for A/B/D/E; each row below is one instantiation
+using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance = std::tuple<
+    // clang-format off
+        //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle|    DsData| EData|            A|           B|         CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //#####################################|        |        |        |  Type|  Type|    Type| DataType|      Type|  Type|  Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //#####################################|        |        |        |      |      |        |         |          |      |    Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //#####################################|        |        |        |      |      |        |         |          |      |             |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, F64_Tuple,   F64,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   256,   128,   128,    16,   1,   1,   16,   16,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, F64_Tuple,   F64,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   256,   128,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, F64_Tuple,   F64,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   128,   128,    64,    16,   1,   1,   16,   16,    4,    4,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1,  8>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, F64_Tuple,   F64,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   128,   128,    64,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1,  8>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, F64_Tuple,   F64,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   128,    64,   128,    16,   1,   1,   16,   16,    4,    4,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1,  8, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, F64_Tuple,   F64,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   128,    64,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1,  8, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, F64_Tuple,   F64,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   256,   128,    64,    16,   1,   1,   16,   16,    4,    2,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, F64_Tuple,   F64,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   256,   128,    64,    16,   2,   2,   16,   16,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, F64_Tuple,   F64,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   256,    64,   128,    16,   1,   1,   16,   16,    2,    4,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, F64_Tuple,   F64,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   256,    64,   128,    16,   2,   2,   16,   16,    2,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1>
+    // clang-format on
+    >;
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F64,
+                                                           F64,
+                                                           F64_Tuple,
+                                                           F64,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear>>>& instances)
+{
+    // Pass the caller's vector and a value-initialized instance tuple to the shared helper.
+    using Ops = device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance;
+    add_device_operation_instances(instances, Ops{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/contraction_scale/CMakeLists.txt
index 7ad660548..31f6a0fcd 100644
--- a/library/src/tensor_operation_instance/gpu/contraction_scale/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/contraction_scale/CMakeLists.txt
@@ -1,7 +1,13 @@
 add_instance_library(device_contraction_scale_instance
+    # float (f32) instances
     device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp
     device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp
     device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp
     device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp
+    # double (f64) instances
+    device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp
+    device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp
+    device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp
+    device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp
 )
 
diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp
new file mode 100644
index 000000000..0aa927155
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+// This #define is a hack to use customized behavior for buffer load rather than using the default
+// setting. Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F64         = double;
+using Empty_Tuple = ck::Tuple<>;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Scale       = ck::tensor_operation::element_wise::Scale;
+
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+// E[m0, m1, n0, n1] = Scale(A[m0, m1, k0, k1] * B[n0, n1, k0, k1]); Ds is empty, so there is no D
+// k/k/n are the fast changing dimensions for A/B/E
+using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance = std::tuple<
+    // clang-format off
+        //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle|      DsData| EData|            A|           B|         CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //#####################################|        |        |        |  Type|  Type|    Type| DataType|        Type|  Type|  Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //#####################################|        |        |        |      |      |        |         |            |      |    Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //#####################################|        |        |        |      |      |        |         |            |      |             |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, Empty_Tuple,   F64,  PassThrough, PassThrough,       Scale, GemmMNKPadding,        1,   256,   128,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, Empty_Tuple,   F64,  PassThrough, PassThrough,       Scale, GemmMNKPadding,        1,   128,   128,    64,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1, 16, 1,  8>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, Empty_Tuple,   F64,  PassThrough, PassThrough,       Scale, GemmMNKPadding,        1,   128,    64,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1,  8, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, Empty_Tuple,   F64,  PassThrough, PassThrough,       Scale, GemmMNKPadding,        1,    64,    64,    64,    16,   2,   2,   16,   16,    4,    4,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1,  8, 1,  8>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, Empty_Tuple,   F64,  PassThrough, PassThrough,       Scale, GemmMNKPadding,        1,   256,   128,    64,    16,   2,   2,   16,   16,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, Empty_Tuple,   F64,  PassThrough, PassThrough,       Scale, GemmMNKPadding,        1,   256,    64,   128,    16,   2,   2,   16,   16,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, Empty_Tuple,   F64,  PassThrough, PassThrough,       Scale, GemmMNKPadding,        1,   128,   128,    32,    16,   2,   2,   16,   16,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1, 16, 1,  8>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, Empty_Tuple,   F64,  PassThrough, PassThrough,       Scale, GemmMNKPadding,        1,   128,    32,   128,    16,   2,   2,   16,   16,    2,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1,  8, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, Empty_Tuple,   F64,  PassThrough, PassThrough,       Scale, GemmMNKPadding,        1,    64,    64,    32,    16,   2,   2,   16,   16,    4,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1,  8, 1,  8>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, Empty_Tuple,   F64,  PassThrough, PassThrough,       Scale, GemmMNKPadding,        1,    64,    32,    64,    16,   2,   2,   16,   16,    2,    4,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1,  8, 1,  8>,               1>
+    // clang-format on
+    >;
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F64,
+                                                           F64,
+                                                           Empty_Tuple,
+                                                           F64,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale>>>& instances)
+{
+    using Ops = device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance;
+    add_device_operation_instances(instances, Ops{}); // pass a value-initialized instance tuple
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp
new file mode 100644
index 000000000..b84ea274c
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+// This #define is a hack to use customized behavior for buffer load rather than using the default
+// setting. Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F64         = double;
+using Empty_Tuple = ck::Tuple<>;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Scale       = ck::tensor_operation::element_wise::Scale;
+
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+// E[m0, m1, n0, n1] = Scale(A[m0, m1, k0, k1] * B[n0, n1, k0, k1]); Ds is empty, so there is no D
+// k/n/n are the fast changing dimensions for A/B/E
+using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance = std::tuple<
+    // clang-format off
+        //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle|      DsData| EData|            A|           B|         CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //#####################################|        |        |        |  Type|  Type|    Type| DataType|        Type|  Type|  Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //#####################################|        |        |        |      |      |        |         |            |      |    Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //#####################################|        |        |        |      |      |        |         |            |      |             |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, Empty_Tuple,   F64,  PassThrough, PassThrough,       Scale, GemmMNKPadding,        1,   256,   128,   128,    16,   2,   1,   16,   16,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, Empty_Tuple,   F64,  PassThrough, PassThrough,       Scale, GemmMNKPadding,        1,   256,   128,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, Empty_Tuple,   F64,  PassThrough, PassThrough,       Scale, GemmMNKPadding,        1,   128,   128,    64,    16,   2,   1,   16,   16,    4,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1,  8>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, Empty_Tuple,   F64,  PassThrough, PassThrough,       Scale, GemmMNKPadding,        1,   128,   128,    64,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1,  8>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, Empty_Tuple,   F64,  PassThrough, PassThrough,       Scale, GemmMNKPadding,        1,   128,    64,   128,    16,   2,   1,   16,   16,    4,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1,  8, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, Empty_Tuple,   F64,  PassThrough, PassThrough,       Scale, GemmMNKPadding,        1,   128,    64,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1,  8, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, Empty_Tuple,   F64,  PassThrough, PassThrough,       Scale, GemmMNKPadding,        1,   256,   128,    64,    16,   2,   1,   16,   16,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, Empty_Tuple,   F64,  PassThrough, PassThrough,       Scale, GemmMNKPadding,        1,   256,   128,    64,    16,   2,   2,   16,   16,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, Empty_Tuple,   F64,  PassThrough, PassThrough,       Scale, GemmMNKPadding,        1,   256,    64,   128,    16,   2,   1,   16,   16,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, Empty_Tuple,   F64,  PassThrough, PassThrough,       Scale, GemmMNKPadding,        1,   256,    64,   128,    16,   2,   2,   16,   16,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1>
+    // clang-format on
+    >;
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F64,
+                                                           F64,
+                                                           Empty_Tuple,
+                                                           F64,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale>>>& instances)
+{
+    using Ops = device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance;
+    add_device_operation_instances(instances, Ops{}); // pass a value-initialized instance tuple
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp
new file mode 100644
index 000000000..578469997
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+// This (#define) is a hack to use customized behavior for buffer load rather than using the
+// default setting. Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F64         = double;
+using Empty_Tuple = ck::Tuple<>;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Scale       = ck::tensor_operation::element_wise::Scale;
+
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+// Scale(A[m0, m1, k0, k1] * B[n0, n1, k0, k1]) = E[m0, m1, n0, n1]; DsData is empty (no D)
+// m/k/n are the fast changing dimension for A/B/E
+using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance = std::tuple<
+    // clang-format off
+        //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle|      DsData| EData|            A|           B|         CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|  CBlockTransferClusterLengths|  CBlockTransfer|
+        //#####################################|        |        |        |  Type|  Type|    Type| DataType|        Type|  Type|  Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|          _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //#####################################|        |        |        |      |      |        |         |            |      |    Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|          _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //#####################################|        |        |        |      |      |        |         |            |      |             |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                              |                |
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, Empty_Tuple,   F64,  PassThrough, PassThrough,       Scale, GemmMNKPadding,        1,   256,   128,   128,    16,   1,   2,   16,   16,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, Empty_Tuple,   F64,  PassThrough, PassThrough,       Scale, GemmMNKPadding,        1,   256,   128,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, Empty_Tuple,   F64,  PassThrough, PassThrough,       Scale, GemmMNKPadding,        1,   128,   128,    64,    16,   1,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1,  8>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, Empty_Tuple,   F64,  PassThrough, PassThrough,       Scale, GemmMNKPadding,        1,   128,   128,    64,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1,  8>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, Empty_Tuple,   F64,  PassThrough, PassThrough,       Scale, GemmMNKPadding,        1,   128,    64,   128,    16,   1,   2,   16,   16,    4,    4,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1,  8, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, Empty_Tuple,   F64,  PassThrough, PassThrough,       Scale, GemmMNKPadding,        1,   128,    64,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1,  8, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, Empty_Tuple,   F64,  PassThrough, PassThrough,       Scale, GemmMNKPadding,        1,   256,   128,    64,    16,   1,   2,   16,   16,    4,    2,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, Empty_Tuple,   F64,  PassThrough, PassThrough,       Scale, GemmMNKPadding,        1,   256,   128,    64,    16,   2,   2,   16,   16,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, Empty_Tuple,   F64,  PassThrough, PassThrough,       Scale, GemmMNKPadding,        1,   256,    64,   128,    16,   1,   2,   16,   16,    2,    4,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, Empty_Tuple,   F64,  PassThrough, PassThrough,       Scale, GemmMNKPadding,        1,   256,    64,   128,    16,   2,   2,   16,   16,    2,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 16>,               1>
+    // clang-format on
+    >;
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F64,
+                                                           F64,
+                                                           Empty_Tuple,
+                                                           F64,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale>>>& instances)
+{
+    add_device_operation_instances(
+        instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp
new file mode 100644
index 000000000..8e5a19313
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+// This (#define) is a hack to use customized behavior for buffer load rather than using the
+// default setting. Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F64         = double;
+using Empty_Tuple = ck::Tuple<>;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Scale       = ck::tensor_operation::element_wise::Scale;
+
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+// Scale(A[m0, m1, k0, k1] * B[n0, n1, k0, k1]) = E[m0, m1, n0, n1]; DsData is empty (no D)
+// m/n/n are the fast changing dimension for A/B/E
+using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance = std::tuple<
+    // clang-format off
+        //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle|      DsData| EData|            A|           B|         CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //#####################################|        |        |        |  Type|  Type|    Type| DataType|        Type|  Type|  Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //#####################################|        |        |        |      |      |        |         |            |      |    Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //#####################################|        |        |        |      |      |        |         |            |      |             |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, Empty_Tuple,   F64,  PassThrough, PassThrough,       Scale, GemmMNKPadding,        1,   256,   128,   128,    16,   1,   1,   16,   16,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, Empty_Tuple,   F64,  PassThrough, PassThrough,       Scale, GemmMNKPadding,        1,   256,   128,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, Empty_Tuple,   F64,  PassThrough, PassThrough,       Scale, GemmMNKPadding,        1,   128,   128,    64,    16,   1,   1,   16,   16,    4,    4,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1,  8>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, Empty_Tuple,   F64,  PassThrough, PassThrough,       Scale, GemmMNKPadding,        1,   128,   128,    64,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1,  8>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, Empty_Tuple,   F64,  PassThrough, PassThrough,       Scale, GemmMNKPadding,        1,   128,    64,   128,    16,   1,   1,   16,   16,    4,    4,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1,  8, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, Empty_Tuple,   F64,  PassThrough, PassThrough,       Scale, GemmMNKPadding,        1,   128,    64,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1,  8, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, Empty_Tuple,   F64,  PassThrough, PassThrough,       Scale, GemmMNKPadding,        1,   256,   128,    64,    16,   1,   1,   16,   16,    4,    2,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, Empty_Tuple,   F64,  PassThrough, PassThrough,       Scale, GemmMNKPadding,        1,   256,   128,    64,    16,   2,   2,   16,   16,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, Empty_Tuple,   F64,  PassThrough, PassThrough,       Scale, GemmMNKPadding,        1,   256,    64,   128,    16,   1,   1,   16,   16,    2,    4,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,   F64,   F64,     F64,      F64, Empty_Tuple,   F64,  PassThrough, PassThrough,       Scale, GemmMNKPadding,        1,   256,    64,   128,    16,   2,   2,   16,   16,    2,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1>
+    // clang-format on
+    >;
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F64,
+                                                           F64,
+                                                           Empty_Tuple,
+                                                           F64,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale>>>& instances)
+{
+    add_device_operation_instances(
+        instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
-- 
GitLab


From 3248387bbbe27dcc059eb87bad1874e126420f03 Mon Sep 17 00:00:00 2001
From: Jun Liu <Liu.Jun@amd.com>
Date: Thu, 6 Apr 2023 17:14:11 -0700
Subject: [PATCH 12/71] Issue #666: Revert "simplify karg in device/grid of
 split-k op (#644)" (#665)

This reverts commit bb5530af91352dca062b791313d9b77700335ae9.
---
 .../impl/device_gemm_xdl_splitk_c_shuffle.hpp | 466 +++++++++++++++--
 .../gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp  | 484 +++++-------------
 ...l_splitk_f16_f16_f16_mk_kn_mn_instance.cpp |  27 +-
 3 files changed, 539 insertions(+), 438 deletions(-)

diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp
index 1f08cec67..0d2aeaeb7 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp
@@ -73,18 +73,157 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
     static constexpr auto I2 = Number<2>{};
     static constexpr auto I3 = Number<3>{};
 
+    static constexpr auto K1Number = Number<K1>{};
+
+    static auto
+    MakeAGridDescriptor_KBatch_K0_M_K1(index_t M, index_t K, index_t StrideA, int KBatch, int KPad)
+    {
+        assert(KPad % (K1 * KBatch) == 0);
+
+        const index_t K0 = KPad / (K1 * KBatch);
+
+        const auto a_grid_desc_m_k = [&]() {
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, ALayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA));
+            }
+        }();
+
+        const auto a_grid_desc_m_kpad = transform_tensor_descriptor(
+            a_grid_desc_m_k,
+            make_tuple(make_pass_through_transform(M), make_right_pad_transform(K, KPad - K)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+        if constexpr(GemmSpec == GemmSpecialization::MNPadding)
+        {
+            const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock;
+            return transform_tensor_descriptor(
+                a_grid_desc_m_kpad,
+                make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1Number)),
+                           make_right_pad_transform(M, PadM)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
+        }
+        else
+        {
+            return transform_tensor_descriptor(
+                a_grid_desc_m_kpad,
+                make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1Number)),
+                           make_pass_through_transform(M)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
+        }
+    }
+
+    static auto
+    MakeBGridDescriptor_KBatch_K0_N_K1(index_t K, index_t N, index_t StrideB, int KBatch, int KPad)
+    {
+        assert(KPad % (K1 * KBatch) == 0);
+
+        const index_t K0 = KPad / (K1 * KBatch);
+
+        const auto b_grid_desc_k_n = [&]() {
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(StrideB, I1));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(I1, StrideB));
+            }
+        }();
+
+        const auto b_grid_desc_kpad_n = transform_tensor_descriptor(
+            b_grid_desc_k_n,
+            make_tuple(make_right_pad_transform(K, KPad - K), make_pass_through_transform(N)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+        if constexpr(GemmSpec == GemmSpecialization::MNPadding)
+        {
+            const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock;
+            return transform_tensor_descriptor(
+                b_grid_desc_kpad_n,
+                make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1Number)),
+                           make_right_pad_transform(N, PadN)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
+        }
+        else
+        {
+            return transform_tensor_descriptor(
+                b_grid_desc_kpad_n,
+                make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1Number)),
+                           make_pass_through_transform(N)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
+        }
+    }
+
+    static auto MakeCGridDescriptor_M_N(index_t M, index_t N, index_t StrideC)
+    {
+        const auto c_grid_desc_m_n = [&]() {
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, CLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideC));
+            }
+        }();
+
+        if constexpr(GemmSpec == GemmSpecialization::MNPadding)
+        {
+            const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock;
+            const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock;
+
+            return transform_tensor_descriptor(
+                c_grid_desc_m_n,
+                make_tuple(make_right_pad_transform(M, PadM), make_right_pad_transform(N, PadN)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+        else
+        {
+
+            return transform_tensor_descriptor(
+                c_grid_desc_m_n,
+                make_tuple(make_pass_through_transform(M), make_pass_through_transform(N)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+    }
+
+    static auto GetKPad(index_t K, index_t KBatch)
+    {
+        const index_t K0   = math::integer_divide_ceil(K, K1 * K0PerBlock * KBatch) * K0PerBlock;
+        const index_t KPad = KBatch * K0 * K1;
+        return KPad;
+    }
+
+    using AGridDesc_K0_M_K1 = decltype(MakeAGridDescriptor_KBatch_K0_M_K1(1, 1, 1, 1, 1));
+    using BGridDesc_K0_N_K1 = decltype(MakeBGridDescriptor_KBatch_K0_N_K1(1, 1, 1, 1, 1));
+    using CGridDesc_M_N     = decltype(MakeCGridDescriptor_M_N(1, 1, 1));
+
+    // GridwiseGemm instantiated with InMemoryDataOperationEnum::Set
     using GridwiseGemm = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2<
         BlockSize,
         ADataType, // TODO: distinguish A/B datatype
         AccDataType,
         CDataType,
-        ALayout,
-        BLayout,
-        CLayout,
+        InMemoryDataOperationEnum::Set,
+        AGridDesc_K0_M_K1,
+        BGridDesc_K0_N_K1,
+        CGridDesc_M_N,
         AElementwiseOperation,
         BElementwiseOperation,
         CElementwiseOperation,
-        GemmSpec,
         MPerBlock,
         NPerBlock,
         K0PerBlock,
@@ -114,64 +253,236 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
         CBlockTransferScalarPerVector_NWaveNPerXDL,
         CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock>;
 
-    using Argument = typename GridwiseGemm::Argument;
+    // GridwiseGemm
+    using GridwiseGemmAtomicAdd = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2<
+        BlockSize,
+        ADataType, // TODO: distinguish A/B datatype
+        AccDataType,
+        CDataType,
+        InMemoryDataOperationEnum::AtomicAdd,
+        AGridDesc_K0_M_K1,
+        BGridDesc_K0_N_K1,
+        CGridDesc_M_N,
+        AElementwiseOperation,
+        BElementwiseOperation,
+        CElementwiseOperation,
+        MPerBlock,
+        NPerBlock,
+        K0PerBlock,
+        MPerXDL,
+        NPerXDL,
+        K1,
+        MXdlPerWave,
+        NXdlPerWave,
+        ABlockTransferThreadClusterLengths_K0_M_K1,
+        ABlockTransferThreadClusterArrangeOrder,
+        ABlockTransferSrcAccessOrder,
+        ABlockTransferSrcVectorDim,
+        ABlockTransferSrcScalarPerVector,
+        ABlockTransferDstScalarPerVector_K1,
+        false, // AThreadTransferSrcResetCoordinateAfterRun,
+        ABlockLdsAddExtraM,
+        BBlockTransferThreadClusterLengths_K0_N_K1,
+        BBlockTransferThreadClusterArrangeOrder,
+        BBlockTransferSrcAccessOrder,
+        BBlockTransferSrcVectorDim,
+        BBlockTransferSrcScalarPerVector,
+        BBlockTransferDstScalarPerVector_K1,
+        false, // BThreadTransferSrcResetCoordinateAfterRun,
+        BBlockLdsAddExtraN,
+        CShuffleMRepeatPerShuffle,
+        CShuffleNRepeatPerShuffle,
+        CBlockTransferScalarPerVector_NWaveNPerXDL,
+        CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock>;
+
+    using CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
+        decltype(GridwiseGemm::MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(CGridDesc_M_N{}));
+
+    using Block2CTileMap = typename GridwiseGemm::CBlockClusterAdaptor;
+
+    // Argument
+    struct Argument : public BaseArgument
+    {
+        Argument(const ADataType* p_a_grid,
+                 const BDataType* p_b_grid,
+                 CDataType* p_c_grid,
+                 index_t M,
+                 index_t N,
+                 index_t K,
+                 index_t StrideA,
+                 index_t StrideB,
+                 index_t StrideC,
+                 index_t M01,
+                 index_t N01,
+                 AElementwiseOperation a_element_op,
+                 BElementwiseOperation b_element_op,
+                 CElementwiseOperation c_element_op,
+                 index_t k_batch)
+            : p_a_grid_{p_a_grid},
+              p_b_grid_{p_b_grid},
+              p_c_grid_{p_c_grid},
+              a_grid_desc_kbatch_k0_m_k1_{},
+              b_grid_desc_kbatch_k0_n_k1_{},
+              c_grid_desc_m_n_{},
+              c_grid_desc_mblock_mperblock_nblock_nperblock_{},
+              block_2_ctile_map_{},
+              M01_{M01},
+              N01_{N01},
+              a_element_op_{a_element_op},
+              b_element_op_{b_element_op},
+              c_element_op_{c_element_op},
+              k_batch_{k_batch}
+        {
+            int KPad = DeviceGemmXdlSplitKCShuffle::GetKPad(K, k_batch_);
+
+            a_grid_desc_kbatch_k0_m_k1_ =
+                DeviceGemmXdlSplitKCShuffle::MakeAGridDescriptor_KBatch_K0_M_K1(
+                    M, K, StrideA, k_batch_, KPad);
+            b_grid_desc_kbatch_k0_n_k1_ =
+                DeviceGemmXdlSplitKCShuffle::MakeBGridDescriptor_KBatch_K0_N_K1(
+                    K, N, StrideB, k_batch_, KPad);
+            c_grid_desc_m_n_ = DeviceGemmXdlSplitKCShuffle::MakeCGridDescriptor_M_N(M, N, StrideC);
+
+            block_2_ctile_map_ =
+                GridwiseGemm::MakeCBlockClusterAdaptor(c_grid_desc_m_n_, M01, N01, k_batch_);
+
+            if(GridwiseGemm::CheckValidity(a_grid_desc_kbatch_k0_m_k1_,
+                                           b_grid_desc_kbatch_k0_n_k1_,
+                                           c_grid_desc_m_n_,
+                                           block_2_ctile_map_))
+            {
+                c_grid_desc_mblock_mperblock_nblock_nperblock_ =
+                    GridwiseGemm::MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(c_grid_desc_m_n_);
+            }
+        }
+
+        //  private:
+        const ADataType* p_a_grid_;
+        const BDataType* p_b_grid_;
+        CDataType* p_c_grid_;
+        AGridDesc_K0_M_K1 a_grid_desc_kbatch_k0_m_k1_;
+        BGridDesc_K0_N_K1 b_grid_desc_kbatch_k0_n_k1_;
+        CGridDesc_M_N c_grid_desc_m_n_;
+        CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock_;
+        Block2CTileMap block_2_ctile_map_;
+        index_t M01_;
+        index_t N01_;
+        AElementwiseOperation a_element_op_;
+        BElementwiseOperation b_element_op_;
+        CElementwiseOperation c_element_op_;
+        index_t k_batch_;
+    };
 
     // Invoker
     struct Invoker : public BaseInvoker
     {
+        using Argument = DeviceGemmXdlSplitKCShuffle::Argument;
 
-        void Print(const Argument& karg) { karg.Print(); }
+        void Print(const Argument& arg)
+        {
+            std::cout << "arg.a_grid_desc_kbatch_k0_m_k1_{"
+                      << arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I0) << ", "
+                      << arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I1) << ", "
+                      << arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I2) << ", "
+                      << arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I3) << "}" << std::endl;
+
+            std::cout << "arg.b_grid_desc_kbatch_k0_n_k1_{"
+                      << arg.b_grid_desc_kbatch_k0_n_k1_.GetLength(I0) << ", "
+                      << arg.b_grid_desc_kbatch_k0_n_k1_.GetLength(I1) << ", "
+                      << arg.b_grid_desc_kbatch_k0_n_k1_.GetLength(I2) << ", "
+                      << arg.b_grid_desc_kbatch_k0_n_k1_.GetLength(I3) << "}" << std::endl;
+
+            std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", "
+                      << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
+        }
 
-        float Run(const Argument& karg, const StreamConfig& stream_config = StreamConfig{})
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             if(stream_config.log_level_ > 0)
             {
-                Print(karg);
+                Print(arg);
             }
 
-            const auto kbatch = karg.k_batch;
+            const auto kbatch = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I0);
 
-            if(!GridwiseGemm::CheckValidity(karg))
+            if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_,
+                                            arg.b_grid_desc_kbatch_k0_n_k1_,
+                                            arg.c_grid_desc_m_n_,
+                                            arg.block_2_ctile_map_))
             {
                 throw std::runtime_error(
-                    "wrong! GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 has invalid "
-                    "setting");
+                    "wrong! GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 has invalid setting");
             }
 
-            index_t gdx, gdy, gdz;
-            std::tie(gdx, gdy, gdz) = GridwiseGemm::CalculateGridSize(karg);
-            const auto K0           = karg.K0;
+            const index_t grid_size =
+                arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_);
+
+            const auto K0 = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I1);
 
             const bool has_main_k0_block_loop = GridwiseGemm::CalculateHasMainK0BlockLoop(K0);
 
             float ave_time = 0;
 
             const auto Run = [&](const auto& kernel) {
-                if(kbatch > 1)
-                    hipGetErrorString(
-                        hipMemset(karg.p_c_grid, 0, karg.M * karg.N * sizeof(CDataType)));
-
-                ave_time = launch_and_time_kernel(
-                    stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, karg);
+                hipGetErrorString(hipMemset(
+                    arg.p_c_grid_,
+                    0,
+                    arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() *
+                        sizeof(CDataType)));
+
+                ave_time =
+                    launch_and_time_kernel(stream_config,
+                                           kernel,
+                                           dim3(grid_size),
+                                           dim3(BlockSize),
+                                           0,
+                                           arg.p_a_grid_,
+                                           arg.p_b_grid_,
+                                           arg.p_c_grid_,
+                                           arg.a_grid_desc_kbatch_k0_m_k1_,
+                                           arg.b_grid_desc_kbatch_k0_n_k1_,
+                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                           arg.a_element_op_,
+                                           arg.b_element_op_,
+                                           arg.c_element_op_,
+                                           arg.block_2_ctile_map_);
             };
 
             if(has_main_k0_block_loop)
             {
                 if(kbatch == 1)
                 {
-                    const auto kernel =
-                        kernel_gemm_xdlops_v2r4r2_simplified<GridwiseGemm,
-                                                             true,
-                                                             InMemoryDataOperationEnum::Set>;
+                    const auto kernel = kernel_gemm_xdlops_v2r4r2<
+                        GridwiseGemm,
+                        ADataType, // TODO: distiguish A/B datatype
+                        CDataType,
+                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::AGridDesc_K0_M_K1>,
+                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::BGridDesc_K0_N_K1>,
+                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::
+                                               CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                        AElementwiseOperation,
+                        BElementwiseOperation,
+                        CElementwiseOperation,
+                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::Block2CTileMap>,
+                        true>;
 
                     Run(kernel);
                 }
                 else
                 {
-                    const auto kernel =
-                        kernel_gemm_xdlops_v2r4r2_simplified<GridwiseGemm,
-                                                             true,
-                                                             InMemoryDataOperationEnum::AtomicAdd>;
+                    const auto kernel = kernel_gemm_xdlops_v2r4r2<
+                        GridwiseGemmAtomicAdd,
+                        ADataType, // TODO: distiguish A/B datatype
+                        CDataType,
+                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::AGridDesc_K0_M_K1>,
+                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::BGridDesc_K0_N_K1>,
+                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::
+                                               CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                        AElementwiseOperation,
+                        BElementwiseOperation,
+                        CElementwiseOperation,
+                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::Block2CTileMap>,
+                        true>;
 
                     Run(kernel);
                 }
@@ -180,19 +491,37 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
             {
                 if(kbatch == 1)
                 {
-                    const auto kernel =
-                        kernel_gemm_xdlops_v2r4r2_simplified<GridwiseGemm,
-                                                             false,
-                                                             InMemoryDataOperationEnum::Set>;
+                    const auto kernel = kernel_gemm_xdlops_v2r4r2<
+                        GridwiseGemm,
+                        ADataType, // TODO: distiguish A/B datatype
+                        CDataType,
+                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::AGridDesc_K0_M_K1>,
+                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::BGridDesc_K0_N_K1>,
+                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::
+                                               CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                        AElementwiseOperation,
+                        BElementwiseOperation,
+                        CElementwiseOperation,
+                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::Block2CTileMap>,
+                        false>;
 
                     Run(kernel);
                 }
                 else
                 {
-                    const auto kernel =
-                        kernel_gemm_xdlops_v2r4r2_simplified<GridwiseGemm,
-                                                             false,
-                                                             InMemoryDataOperationEnum::AtomicAdd>;
+                    const auto kernel = kernel_gemm_xdlops_v2r4r2<
+                        GridwiseGemmAtomicAdd,
+                        ADataType, // TODO: distiguish A/B datatype
+                        CDataType,
+                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::AGridDesc_K0_M_K1>,
+                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::BGridDesc_K0_N_K1>,
+                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::
+                                               CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                        AElementwiseOperation,
+                        BElementwiseOperation,
+                        CElementwiseOperation,
+                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::Block2CTileMap>,
+                        false>;
 
                     Run(kernel);
                 }
@@ -215,9 +544,12 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
         return true;
     }
 
-    static bool IsSupportedArgument(const Argument& karg)
+    static bool IsSupportedArgument(const Argument& arg)
     {
-        return GridwiseGemm::CheckValidity(karg);
+        return GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_,
+                                           arg.b_grid_desc_kbatch_k0_n_k1_,
+                                           arg.c_grid_desc_m_n_,
+                                           arg.block_2_ctile_map_);
     }
 
     // polymorphic
@@ -235,9 +567,9 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                              index_t StrideA,
                              index_t StrideB,
                              index_t StrideC,
-                             AElementwiseOperation,
-                             BElementwiseOperation,
-                             CElementwiseOperation,
+                             AElementwiseOperation a_element_op,
+                             BElementwiseOperation b_element_op,
+                             CElementwiseOperation c_element_op,
                              index_t KBatch)
     {
         return Argument{p_a,
@@ -249,10 +581,11 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                         StrideA,
                         StrideB,
                         StrideC,
-                        GridwiseGemm::CalculateMPadded(M),
-                        GridwiseGemm::CalculateNPadded(N),
-                        GridwiseGemm::CalculateKPadded(K),
-                        GridwiseGemm::CalculateK0(K, KBatch),
+                        1,
+                        1,
+                        a_element_op,
+                        b_element_op,
+                        c_element_op,
                         KBatch};
     }
 
@@ -268,9 +601,9 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                                                       index_t StrideA,
                                                       index_t StrideB,
                                                       index_t StrideC,
-                                                      AElementwiseOperation,
-                                                      BElementwiseOperation,
-                                                      CElementwiseOperation,
+                                                      AElementwiseOperation a_element_op,
+                                                      BElementwiseOperation b_element_op,
+                                                      CElementwiseOperation c_element_op,
                                                       ck::index_t KBatch = 1) override
     {
         return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
@@ -282,10 +615,11 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                                           StrideA,
                                           StrideB,
                                           StrideC,
-                                          GridwiseGemm::CalculateMPadded(M),
-                                          GridwiseGemm::CalculateNPadded(N),
-                                          GridwiseGemm::CalculateKPadded(K),
-                                          GridwiseGemm::CalculateK0(K, KBatch),
+                                          1,
+                                          1,
+                                          a_element_op,
+                                          b_element_op,
+                                          c_element_op,
                                           KBatch);
     }
 
@@ -296,7 +630,31 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
     }
 
     // polymorphic
-    std::string GetTypeString() const override { return GridwiseGemm::GetTypeString(); }
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        // clang-format off
+        str << "DeviceGemmXdlSplitKCShuffle"
+            << "<"
+            << BlockSize << ", "
+            << MPerBlock << ", "
+            << NPerBlock << ", "
+            << K0PerBlock << ", "
+            << K1 << ", "
+            << MPerXDL << ", "
+            << NPerXDL << ", "
+            << MXdlPerWave << ", "
+            << NXdlPerWave << ", "
+            << ABlockTransferSrcScalarPerVector << ", "
+            << ABlockTransferDstScalarPerVector_K1 << ", "
+            << BBlockTransferSrcScalarPerVector << ", "
+            << BBlockTransferDstScalarPerVector_K1
+            << ">";
+        // clang-format on
+
+        return str.str();
+    }
 };
 
 } // namespace device
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
index 727f180e9..190194f1e 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
@@ -18,23 +18,60 @@
 namespace ck {
 
 template <typename GridwiseGemm,
-          bool HasMainKBlockLoop,
-          InMemoryDataOperationEnum CGlobalMemoryDataOperation>
+          typename FloatAB,
+          typename FloatC,
+          typename AGridDesc_B_K0_M_K1,
+          typename BGridDesc_B_K0_N_K1,
+          typename CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation,
+          typename CBlockClusterAdaptor,
+          bool HasMainKBlockLoop>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
     __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
-        kernel_gemm_xdlops_v2r4r2_simplified(typename GridwiseGemm::Argument karg)
+        kernel_gemm_xdlops_v2r4r2(const FloatAB* __restrict__ p_a_grid,
+                                  const FloatAB* __restrict__ p_b_grid,
+                                  FloatC* __restrict__ p_c_grid,
+                                  const AGridDesc_B_K0_M_K1 a_b_k0_m_k1_grid_desc,
+                                  const BGridDesc_B_K0_N_K1 b_b_k0_n_k1_grid_desc,
+                                  const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
+                                      c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                  const AElementwiseOperation a_element_op,
+                                  const BElementwiseOperation b_element_op,
+                                  const CElementwiseOperation c_element_op,
+                                  const CBlockClusterAdaptor c_block_cluster_adaptor)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
-    constexpr index_t shared_size = GridwiseGemm::GetSharedMemoryNumberOfByte();
-
-    __shared__ uint8_t p_shared[shared_size];
-
-    GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation>(
-        karg, static_cast<void*>(p_shared));
+    constexpr index_t shared_block_size =
+        GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
+
+    __shared__ FloatAB p_shared_block[shared_block_size];
+
+    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
+                                                  p_b_grid,
+                                                  p_c_grid,
+                                                  static_cast<void*>(p_shared_block),
+                                                  a_b_k0_m_k1_grid_desc,
+                                                  b_b_k0_n_k1_grid_desc,
+                                                  c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                                  a_element_op,
+                                                  b_element_op,
+                                                  c_element_op,
+                                                  c_block_cluster_adaptor);
 #else
-    ignore = karg;
+    ignore = p_a_grid;
+    ignore = p_b_grid;
+    ignore = p_c_grid;
+    ignore = a_b_k0_m_k1_grid_desc;
+    ignore = b_b_k0_n_k1_grid_desc;
+    ignore = c_grid_desc_mblock_mperblock_nblock_nperblock;
+    ignore = a_element_op;
+    ignore = b_element_op;
+    ignore = c_element_op;
+    ignore = c_block_cluster_adaptor;
 #endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
 }
 
@@ -42,13 +79,13 @@ template <index_t BlockSize,
           typename FloatAB,
           typename FloatAcc,
           typename FloatC,
-          typename ALayout,
-          typename BLayout,
-          typename CLayout,
+          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+          typename AGridDesc_B_K0_M_K1,
+          typename BGridDesc_B_K0_N_K1,
+          typename CMNGridDesc,
           typename AElementwiseOperation,
           typename BElementwiseOperation,
           typename CElementwiseOperation,
-          tensor_operation::device::GemmSpecialization GemmSpec,
           index_t MPerBlock,
           index_t NPerBlock,
           index_t K0PerBlock,
@@ -89,238 +126,10 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
     static constexpr auto I7 = Number<7>{};
 
     // K1 should be Number<...>
-    static constexpr auto K1  = Number<K1Value>{};
-    static constexpr auto M01 = 1;
-    static constexpr auto N01 = 1;
+    static constexpr auto K1 = Number<K1Value>{};
 
     using ThisThreadBlock = ThisThreadBlock<BlockSize>;
 
-    struct Argument : public ck::tensor_operation::device::BaseArgument
-    {
-        const FloatAB* p_a_grid;
-        const FloatAB* p_b_grid;
-        FloatC* p_c_grid;
-        index_t M;
-        index_t N;
-        index_t K;
-        index_t StrideA;
-        index_t StrideB;
-        index_t StrideC;
-        index_t MPadded;
-        index_t NPadded;
-        index_t KPadded;
-        index_t K0;
-        index_t k_batch;
-
-        Argument(const FloatAB* p_a_grid_,
-                 const FloatAB* p_b_grid_,
-                 FloatC* p_c_grid_,
-                 index_t M_,
-                 index_t N_,
-                 index_t K_,
-                 index_t StrideA_,
-                 index_t StrideB_,
-                 index_t StrideC_,
-                 index_t MPadded_,
-                 index_t NPadded_,
-                 index_t KPadded_,
-                 index_t K0_,
-                 index_t k_batch_)
-            : p_a_grid(p_a_grid_),
-              p_b_grid(p_b_grid_),
-              p_c_grid(p_c_grid_),
-              M(M_),
-              N(N_),
-              K(K_),
-              StrideA(StrideA_),
-              StrideB(StrideB_),
-              StrideC(StrideC_),
-              MPadded(MPadded_),
-              NPadded(NPadded_),
-              KPadded(KPadded_),
-              K0(K0_),
-              k_batch(k_batch_)
-        {
-        }
-
-        void Print() const
-        {
-            std::cout << "arg {"
-                      << "M:" << M << ", "
-                      << "N:" << N << ", "
-                      << "K:" << K << ", "
-                      << "SA:" << StrideA << ", "
-                      << "SB:" << StrideB << ", "
-                      << "SC:" << StrideC << ", "
-                      << "MP:" << MPadded << ", "
-                      << "NP:" << NPadded << ", "
-                      << "KP:" << KPadded << ", "
-                      << "K0:" << K0 << ", "
-                      << "KB:" << k_batch << "}" << std::endl;
-        }
-    };
-
-    __host__ __device__ static auto CalculateGridSize(const Argument& karg)
-    {
-        return std::make_tuple(math::integer_divide_ceil(karg.N, NPerBlock),
-                               math::integer_divide_ceil(karg.M, MPerBlock),
-                               karg.k_batch);
-    }
-
-    // prefer this to be called on host
-    __host__ __device__ static auto CalculateMPadded(index_t M)
-    {
-        return (M + MPerBlock - 1) / MPerBlock * MPerBlock;
-    }
-
-    __host__ __device__ static auto CalculateNPadded(index_t N)
-    {
-        return (N + NPerBlock - 1) / NPerBlock * NPerBlock;
-    }
-
-    __host__ __device__ static auto CalculateK0(index_t K, index_t K_Batch = 1)
-    {
-        // k_batch * k0 * k0_per_block * k1
-        auto K_t = K_Batch * K0PerBlock * K1;
-        return (K + K_t - 1) / K_t * K0PerBlock;
-    }
-
-    __host__ __device__ static auto CalculateKPadded(index_t K, index_t K_Batch = 1)
-    {
-        auto K0 = CalculateK0(K, K_Batch);
-        return K_Batch * K0 * K1;
-    }
-
-    __host__ __device__ static auto MakeAGridDescriptor_KBatch_K0_M_K1(index_t M,
-                                                                       index_t MPad,
-                                                                       index_t K,
-                                                                       index_t StrideA,
-                                                                       index_t KBatch,
-                                                                       index_t K0,
-                                                                       index_t KPad)
-    {
-        const auto a_grid_desc_m_k = [&]() {
-            if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1));
-            }
-            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, ALayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA));
-            }
-        }();
-
-        const auto a_grid_desc_m_kpad = transform_tensor_descriptor(
-            a_grid_desc_m_k,
-            make_tuple(make_pass_through_transform(M), make_right_pad_transform(K, KPad - K)),
-            make_tuple(Sequence<0>{}, Sequence<1>{}),
-            make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-        if constexpr(GemmSpec == tensor_operation::device::GemmSpecialization::MPadding ||
-                     GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
-                     GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding ||
-                     GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding)
-        {
-            // const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock;
-            return transform_tensor_descriptor(
-                a_grid_desc_m_kpad,
-                make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1)),
-                           make_right_pad_transform(M, MPad - M)),
-                make_tuple(Sequence<1>{}, Sequence<0>{}),
-                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
-        }
-        else
-        {
-            return transform_tensor_descriptor(
-                a_grid_desc_m_kpad,
-                make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1)),
-                           make_pass_through_transform(M)),
-                make_tuple(Sequence<1>{}, Sequence<0>{}),
-                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
-        }
-    }
-
-    __host__ __device__ static auto MakeBGridDescriptor_KBatch_K0_N_K1(index_t K,
-                                                                       index_t NPad,
-                                                                       index_t N,
-                                                                       index_t StrideB,
-                                                                       index_t KBatch,
-                                                                       index_t K0,
-                                                                       index_t KPad)
-    {
-        const auto b_grid_desc_k_n = [&]() {
-            if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(StrideB, I1));
-            }
-            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(I1, StrideB));
-            }
-        }();
-
-        const auto b_grid_desc_kpad_n = transform_tensor_descriptor(
-            b_grid_desc_k_n,
-            make_tuple(make_right_pad_transform(K, KPad - K), make_pass_through_transform(N)),
-            make_tuple(Sequence<0>{}, Sequence<1>{}),
-            make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-        if constexpr(GemmSpec == tensor_operation::device::GemmSpecialization::NPadding ||
-                     GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
-                     GemmSpec == tensor_operation::device::GemmSpecialization::NKPadding ||
-                     GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding)
-        {
-            // const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock;
-            return transform_tensor_descriptor(
-                b_grid_desc_kpad_n,
-                make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1)),
-                           make_right_pad_transform(N, NPad - N)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
-        }
-        else
-        {
-            return transform_tensor_descriptor(
-                b_grid_desc_kpad_n,
-                make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1)),
-                           make_pass_through_transform(N)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
-        }
-    }
-
-    __host__ __device__ static auto
-    MakeCGridDescriptor_M_N(index_t M, index_t N, index_t MPad, index_t NPad, index_t StrideC)
-    {
-        const auto c_grid_desc_m_n = [&]() {
-            if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1));
-            }
-            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, CLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideC));
-            }
-        }();
-
-        if constexpr(GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding)
-        {
-            return transform_tensor_descriptor(c_grid_desc_m_n,
-                                               make_tuple(make_right_pad_transform(M, MPad - M),
-                                                          make_right_pad_transform(N, NPad - N)),
-                                               make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                               make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else
-        {
-            return transform_tensor_descriptor(
-                c_grid_desc_m_n,
-                make_tuple(make_pass_through_transform(M), make_pass_through_transform(N)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-    }
-
     __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
     {
         constexpr auto max_lds_align = K1;
@@ -369,68 +178,45 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
                          c_block_size * sizeof(FloatC));
     }
 
-    __host__ __device__ static constexpr bool CheckValidity(const Argument& karg)
+    // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
+    template <typename Block2CTileMap>
+    __host__ __device__ static constexpr bool
+    CheckValidity(const AGridDesc_B_K0_M_K1& a_b_k0_m_k1_grid_desc,
+                  const BGridDesc_B_K0_N_K1& b_b_k0_n_k1_grid_desc,
+                  const CMNGridDesc& c_m_n_grid_desc,
+                  const Block2CTileMap& block_2_ctile_map)
     {
-        if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::MPadding ||
-                       GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
-                       GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding ||
-                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding))
-        {
-            if(!(karg.M % MPerBlock == 0))
-                return false;
-        }
-        if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::NPadding ||
-                       GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
-                       GemmSpec == tensor_operation::device::GemmSpecialization::NKPadding ||
-                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding))
-        {
-            if(!(karg.N % NPerBlock == 0))
-                return false;
-        }
+        static_assert(is_known_at_compile_time<remove_cv_t<decltype(K1)>>::value,
+                      "wrong! K1 need to be known at compile-time");
 
-        if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
-        {
-            if(karg.K % ABlockTransferSrcScalarPerVector != 0)
-                return false;
-        }
-        else
-        {
-            if(karg.M % ABlockTransferSrcScalarPerVector != 0)
-                return false;
-        }
+        static_assert((MPerBlock % (MPerXDL * MRepeat) == 0) &&
+                          (NPerBlock % (NRepeat * NPerXDL)) == 0,
+                      "Invalid tuning param!");
 
-        if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
-        {
-            if(karg.N % BBlockTransferSrcScalarPerVector != 0)
-                return false;
-        }
-        else
-        {
-            if(karg.K % BBlockTransferSrcScalarPerVector != 0)
-                return false;
-        }
+        const auto M      = a_b_k0_m_k1_grid_desc.GetLength(I2);
+        const auto N      = b_b_k0_n_k1_grid_desc.GetLength(I2);
+        const auto K0     = a_b_k0_m_k1_grid_desc.GetLength(I1);
+        const auto KBatch = a_b_k0_m_k1_grid_desc.GetLength(I0);
 
-        if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
-        {
-            if(karg.N % CBlockTransferScalarPerVector_NWaveNPerXDL != 0)
-                return false;
-        }
-        else
+        if(!(M == c_m_n_grid_desc.GetLength(I0) && N == c_m_n_grid_desc.GetLength(I1) &&
+             K0 == b_b_k0_n_k1_grid_desc.GetLength(I1) &&
+             K1 == a_b_k0_m_k1_grid_desc.GetLength(I3) &&
+             K1 == b_b_k0_n_k1_grid_desc.GetLength(I3) &&
+             KBatch == b_b_k0_n_k1_grid_desc.GetLength(I0)))
+            return false;
+
+        if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K0 % K0PerBlock == 0))
+            return false;
+
+        if(!block_2_ctile_map.CheckValidity(c_m_n_grid_desc))
         {
-            if(karg.M % CBlockTransferScalarPerVector_NWaveNPerXDL != 0)
-                return false;
+            return false;
         }
 
+        // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
         return true;
     }
 
-    __host__ __device__ static auto GetKPad(index_t K, index_t KBatch)
-    {
-        const index_t K0   = math::integer_divide_ceil(K, K1 * K0PerBlock * KBatch) * K0PerBlock;
-        const index_t KPad = KBatch * K0 * K1;
-        return KPad;
-    }
-
     __host__ __device__ static constexpr bool CalculateHasMainK0BlockLoop(index_t K0)
     {
         const bool has_main_k0_block_loop = K0 > K0PerBlock;
@@ -438,9 +224,8 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
         return has_main_k0_block_loop;
     }
 
-    template <typename CGridDesc>
     __host__ __device__ static constexpr auto
-    MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(const CGridDesc& c_m_n_grid_desc)
+    MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(const CMNGridDesc& c_m_n_grid_desc)
     {
         const auto M = c_m_n_grid_desc.GetLength(I0);
         const auto N = c_m_n_grid_desc.GetLength(I1);
@@ -457,11 +242,10 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
     }
 
     // return block_id to C matrix tile idx (m0, n0) mapping
-    template <typename CGridDesc>
     __host__ __device__ static constexpr auto MakeCBlockClusterAdaptor(
-        const CGridDesc& c_m_n_grid_desc, index_t /* M01 */, index_t /* N01 */, index_t KBatch)
+        const CMNGridDesc& c_m_n_grid_desc, index_t /* M01 */, index_t /* N01 */, index_t KBatch)
     {
-        return BlockToCTileMap_KSplit_M00_N0_M01Adapt<MPerBlock, NPerBlock, CGridDesc>(
+        return BlockToCTileMap_KSplit_M00_N0_M01Adapt<MPerBlock, NPerBlock, CMNGridDesc>(
             c_m_n_grid_desc, 8, KBatch);
     }
 
@@ -478,25 +262,24 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
                        Number<CShuffleNRepeatPerShuffle * NWave * NPerXDL>{}));
     }
 
-    template <bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation>
-    __device__ static void Run(const Argument& karg, void* __restrict__ p_shared_block)
+    using CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
+        decltype(MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(CMNGridDesc{}));
+    using CBlockClusterAdaptor = decltype(MakeCBlockClusterAdaptor(CMNGridDesc{}, 1, 1, 1));
+
+    template <bool HasMainKBlockLoop>
+    __device__ static void Run(const FloatAB* __restrict__ p_a_grid,
+                               const FloatAB* __restrict__ p_b_grid,
+                               FloatC* __restrict__ p_c_grid,
+                               void* __restrict__ p_shared_block,
+                               const AGridDesc_B_K0_M_K1& a_b_k0_m_k1_grid_desc,
+                               const BGridDesc_B_K0_N_K1& b_b_k0_n_k1_grid_desc,
+                               const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock&
+                                   c_grid_desc_mblock_mperblock_nblock_nperblock,
+                               const AElementwiseOperation& a_element_op,
+                               const BElementwiseOperation& b_element_op,
+                               const CElementwiseOperation& c_element_op,
+                               const CBlockClusterAdaptor& c_block_cluster_adaptor)
     {
-        const FloatAB* p_a_grid          = karg.p_a_grid;
-        const FloatAB* p_b_grid          = karg.p_b_grid;
-        FloatC* p_c_grid                 = karg.p_c_grid;
-        const auto a_b_k0_m_k1_grid_desc = MakeAGridDescriptor_KBatch_K0_M_K1(
-            karg.M, karg.MPadded, karg.K, karg.StrideA, karg.k_batch, karg.K0, karg.KPadded);
-        const auto b_b_k0_n_k1_grid_desc = MakeBGridDescriptor_KBatch_K0_N_K1(
-            karg.K, karg.NPadded, karg.N, karg.StrideB, karg.k_batch, karg.K0, karg.KPadded);
-        const auto c_grid_desc_m_n =
-            MakeCGridDescriptor_M_N(karg.M, karg.N, karg.MPadded, karg.NPadded, karg.StrideC);
-
-        const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
-            MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(c_grid_desc_m_n);
-        const AElementwiseOperation a_element_op = AElementwiseOperation{};
-        const BElementwiseOperation b_element_op = BElementwiseOperation{};
-        const CElementwiseOperation c_element_op = CElementwiseOperation{};
-
         const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_a_grid, a_b_k0_m_k1_grid_desc.GetElementSpaceSize());
         const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
@@ -506,16 +289,26 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
 
         const auto K0 = a_b_k0_m_k1_grid_desc.GetLength(I1);
 
-        const index_t block_m_id = __builtin_amdgcn_readfirstlane(blockIdx.y);
-        const index_t block_n_id = __builtin_amdgcn_readfirstlane(blockIdx.x);
-        const index_t k_batch_id = __builtin_amdgcn_readfirstlane(blockIdx.z);
+        // divide block work by [M, N]
+        const auto block_work_idx =
+            c_block_cluster_adaptor.CalculateBottomIndex(make_multi_index(get_block_1d_id()));
+
+        if(!c_block_cluster_adaptor.ValidCTileIndex(
+               make_tuple(block_work_idx[I1], block_work_idx[I2]),
+               make_tuple(c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0),
+                          c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2))))
+        {
+            return;
+        }
+
+        const index_t k_batch_id = block_work_idx[I0];
 
         // HACK: this force m/n_block_data_idx_on_grid into SGPR
         const index_t m_block_data_idx_on_grid =
-            __builtin_amdgcn_readfirstlane(block_m_id * MPerBlock);
+            __builtin_amdgcn_readfirstlane(block_work_idx[I1] * MPerBlock);
 
         const index_t n_block_data_idx_on_grid =
-            __builtin_amdgcn_readfirstlane(block_n_id * NPerBlock);
+            __builtin_amdgcn_readfirstlane(block_work_idx[I2] * NPerBlock);
 
         // lds max alignment
         constexpr auto max_lds_align = K1;
@@ -651,6 +444,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
         //     c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
         //       register
         // sanity check
+
         auto blockwise_gemm =
             BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
                                                                 FloatAB,
@@ -853,7 +647,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
                 {c_block_desc_mblock_mperblock_nblock_nperblock,
                  make_multi_index(0, 0, 0, 0),
                  c_grid_desc_mblock_mperblock_nblock_nperblock,
-                 make_multi_index(block_m_id, 0, block_n_id, 0),
+                 make_multi_index(block_work_idx[I1], 0, block_work_idx[I2], 0),
                  c_element_op};
 
             constexpr auto mxdlperwave_forward_step =
@@ -922,48 +716,6 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
             });
         }
     }
-
-    template <typename Layout>
-    struct LStr
-    {
-        static std::string Get() { return ""; }
-    };
-
-    template <>
-    struct LStr<ck::tensor_layout::gemm::RowMajor>
-    {
-        static std::string Get() { return "R"; }
-    };
-
-    template <>
-    struct LStr<ck::tensor_layout::gemm::ColumnMajor>
-    {
-        static std::string Get() { return "C"; }
-    };
-
-    static std::string GetTypeString()
-    {
-        auto str = std::stringstream();
-
-        // clang-format off
-        str << "GemmXdlSplitKCShuffle_"
-            << getGemmSpecializationString(GemmSpec) << "_"
-            << std::string(ALayout::name)[0]
-            << std::string(BLayout::name)[0]
-            << std::string(CLayout::name)[0]
-            << "_"
-            << "B" << BlockSize << "_"
-            << "Vec" << ABlockTransferSrcScalarPerVector << "x"
-            << BBlockTransferSrcScalarPerVector << "x"
-            << CBlockTransferScalarPerVector_NWaveNPerXDL << "_"
-            << MPerBlock << "x"
-            << NPerBlock << "x"
-            << K0PerBlock << "x"
-            << K1 ;
-        // clang-format on
-
-        return str.str();
-    }
 };
 
 } // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp
index c4680db83..9b5ff4048 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp
@@ -26,8 +26,7 @@ using S = ck::Sequence<Is...>;
 
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
-// static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
-static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
 
 // Compilation parameters for a[m, k] * b[k, n] = c[m, n]
 using device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances = std::tuple<
@@ -36,22 +35,14 @@ using device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances = std::tuple<
         //#########################| Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
         //#########################|     |      |      |        |        |        |        |   Operation|   Operation|   Operation|              |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
         //#########################|     |      |      |        |        |        |        |            |            |            |              |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
-        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,   256,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8>,
-        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,   128,   256,     4,  8,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8>,
-        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,   128,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,      true,           1,           1,                   S<1, 16, 1, 8>,               8>,
-        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,    64,   192,     4,  8,   32,   32,    1,    3,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 48, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8>,
-        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,   192,    64,     4,  8,   32,   32,    3,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8>,
-        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,   128,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8>,
-        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,   128,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 32, 1, 4>,               8>,
-        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,    64,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,      true,           1,           1,                   S<1, 16, 1, 8>,               8>,
-        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,   128,    64,     4,  8,   32,   32,    2,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,      true,           1,           1,                   S<1, 16, 1, 4>,               8>,
-        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,    64,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8>,
-        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,    32,   192,     4,  8,   32,   32,    1,    3,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 24, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              8,              8,      true,           1,           1,                   S<1, 16, 1, 8>,               8>,
-        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,   192,    32,     4,  8,   32,   32,    3,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,      true,           1,           1,                   S<1, 32, 1, 4>,               8>,
-        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,    32,    64,     4,  8,   32,   32,    1,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 16, 1, 8>,               8>,
-        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,    64,    32,     4,  8,   32,   32,    1,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,      true,           1,           1,                   S<1, 32, 1, 4>,               8>,
-        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,    32,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,      true,           1,           1,                   S<1, 16, 1, 8>,               8>,
-        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,      true,           1,           1,                   S<1, 32, 1, 4>,               8>
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,   256,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,   128,   256,     4,  8,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   128,   128,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,      true,           1,           1,                   S<1, 16, 1, 8>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,   128,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   128,   128,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 32, 1, 4>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   128,    64,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,      true,           1,           1,                   S<1, 16, 1, 8>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,   128,    64,     4,  8,   32,   32,    2,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,      true,           1,           1,                   S<1, 16, 1, 4>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,    64,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8>
     // clang-format on
     >;
 
-- 
GitLab


From ed3a2e52265e11daa366f47b082141a652b67c58 Mon Sep 17 00:00:00 2001
From: rocking5566 <ChunYu.Lai@amd.com>
Date: Mon, 10 Apr 2023 21:02:17 +0800
Subject: [PATCH 13/71] Groupnorm + swish external api (#668)

* Rename to proper naming

* Add example of groupnorm + swish

* Extract duplicate code in example

* Add groupnorm + swish instances

* Refactor instance generation, split into multiple cpp files

* Add external api and client example

* Refine profiler message

* Use ck math version of exp

* Refine problem size in example

* Add host version of exp
---
 client_example/18_groupnorm/CMakeLists.txt    |   2 +
 .../18_groupnorm/groupnorm_swish.cpp          | 169 ++++++++++++++++++
 example/42_groupnorm/CMakeLists.txt           |   3 +-
 example/42_groupnorm/common.hpp               |  23 +++
 .../groupnorm_sigmoid_mul_fp16.cpp            |  56 ++++++
 example/42_groupnorm/groupnorm_swish_fp16.cpp |  40 +++++
 ...oid_fp16.cpp => run_groupnorm_example.inc} |  79 +-------
 .../element/unary_element_wise_operation.hpp  |  19 +-
 include/ck/utility/math.hpp                   |   4 +
 .../device_operation_instance_factory.hpp     |   1 +
 .../gpu/normalization_swish.hpp               |  81 +++++++++
 .../gpu/normalization/CMakeLists.txt          |  10 +-
 .../device_groupnorm_f16_instance.cpp         |  23 +++
 .../device_groupnorm_f32_instance.cpp         |  23 +++
 .../device_groupnorm_swish_f16_instance.cpp   |  23 +++
 .../device_groupnorm_swish_f32_instance.cpp   |  23 +++
 .../device_layernorm2d_f16_instance.cpp       |  23 +++
 .../device_layernorm2d_f32_instance.cpp       |  23 +++
 .../device_layernorm4d_f16_instance.cpp       |  23 +++
 .../device_layernorm4d_f32_instance.cpp       |  23 +++
 .../device_normalization_f16_instance.cpp     |  70 --------
 ....cpp => normalization_instance_common.hpp} |  52 +++---
 .../profiler/profile_groupnorm_impl.hpp       |   6 +-
 23 files changed, 626 insertions(+), 173 deletions(-)
 create mode 100644 client_example/18_groupnorm/CMakeLists.txt
 create mode 100644 client_example/18_groupnorm/groupnorm_swish.cpp
 create mode 100644 example/42_groupnorm/common.hpp
 create mode 100644 example/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp
 create mode 100644 example/42_groupnorm/groupnorm_swish_fp16.cpp
 rename example/42_groupnorm/{groupnorm_sigmoid_fp16.cpp => run_groupnorm_example.inc} (54%)
 create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/normalization_swish.hpp
 create mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp
 rename library/src/tensor_operation_instance/gpu/normalization/{device_normalization_f32_instance.cpp => normalization_instance_common.hpp} (53%)

diff --git a/client_example/18_groupnorm/CMakeLists.txt b/client_example/18_groupnorm/CMakeLists.txt
new file mode 100644
index 000000000..17c88cb61
--- /dev/null
+++ b/client_example/18_groupnorm/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_executable(client_groupnorm_swish groupnorm_swish.cpp)
+target_link_libraries(client_groupnorm_swish PRIVATE composable_kernel::device_operations)
diff --git a/client_example/18_groupnorm/groupnorm_swish.cpp b/client_example/18_groupnorm/groupnorm_swish.cpp
new file mode 100644
index 000000000..8a873e6ac
--- /dev/null
+++ b/client_example/18_groupnorm/groupnorm_swish.cpp
@@ -0,0 +1,169 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iomanip>
+#include <vector>
+#include <iostream>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_normalization.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/normalization_swish.hpp"
+
+using XDataType       = ck::half_t;
+using GammaDataType   = ck::half_t;
+using BetaDataType    = ck::half_t;
+using YDataType       = ck::half_t;
+using ComputeDataType = float;
+using Swish           = ck::tensor_operation::element_wise::Swish;
+
+constexpr int Rank         = 5;
+constexpr int NumReduceDim = 3;
+
+struct SimpleDeviceMem // owns one hipMalloc'ed buffer and frees it on destruction
+{
+    SimpleDeviceMem()                                  = delete;
+    SimpleDeviceMem(const SimpleDeviceMem&)            = delete; // a copy would hipFree twice
+    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;
+
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size); // result deliberately ignored
+    }
+    void* GetDeviceBuffer() { return p_mem_; }
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* p_mem_;
+};
+
+int main(int argc, char* argv[])
+{
+    ck::index_t N = 32;
+    ck::index_t H = 16;
+    ck::index_t W = 16;
+    ck::index_t G = 64;
+    ck::index_t C = 128;
+    // Times every available instance on this fixed problem, then re-runs the fastest one.
+    std::size_t xy_size         = N * H * W * G * C;
+    std::size_t gamma_beta_size = G * C;
+    // x/y use packed strides over {N, H, W, G, C}; gamma/beta only vary along (G, C)
+    std::vector<ck::index_t> xy_strides         = {H * W * G * C, W * G * C, G * C, C, 1};
+    std::vector<ck::index_t> gamma_beta_strides = {0, 0, 0, C, 1};
+
+    SimpleDeviceMem x_device_buf(sizeof(XDataType) * xy_size);
+    SimpleDeviceMem gamma_device_buf(sizeof(GammaDataType) * gamma_beta_size);
+    SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * gamma_beta_size);
+    SimpleDeviceMem y_device_buf(sizeof(YDataType) * xy_size);
+
+    using DeviceOp = ck::tensor_operation::device::DeviceNormalization<XDataType,
+                                                                       GammaDataType,
+                                                                       BetaDataType,
+                                                                       ComputeDataType,
+                                                                       YDataType,
+                                                                       Swish,
+                                                                       Rank,
+                                                                       NumReduceDim>;
+
+    // get the device op instances the factory provides for this DeviceOp signature
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    std::string best_op_name;
+    bool found            = false; // becomes true once any instance accepts the problem
+    int best_op_id        = -1;    // index into op_ptrs; only meaningful when found is true
+    float best_ave_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+
+    for(int i = 0; i < static_cast<int>(op_ptrs.size()); ++i)
+    {
+        auto& op_ptr      = op_ptrs[i];
+        auto argument_ptr = op_ptr->MakeArgumentPointer({N, H, W, G, C},    // lengths
+                                                        xy_strides,         // xStrides
+                                                        gamma_beta_strides, // gammaStrides
+                                                        gamma_beta_strides, // betaStrides
+                                                        xy_strides,         // yStrides
+                                                        {1, 2, 4},          // reduceDims (H, W, C)
+                                                        1e-6,
+                                                        x_device_buf.GetDeviceBuffer(),
+                                                        gamma_device_buf.GetDeviceBuffer(),
+                                                        beta_device_buf.GetDeviceBuffer(),
+                                                        y_device_buf.GetDeviceBuffer(),
+                                                        nullptr,
+                                                        nullptr,
+                                                        Swish{});
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        std::string op_name = op_ptr->GetTypeString();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+            std::size_t num_byte =
+                sizeof(XDataType) * xy_size + sizeof(GammaDataType) * gamma_beta_size +
+                sizeof(BetaDataType) * gamma_beta_size + sizeof(YDataType) * xy_size;
+            // bytes / 1e6 / milliseconds == GB/s
+            float gb_per_sec = num_byte / 1.E6 / ave_time;
+
+            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
+                      << op_name << std::endl;
+
+            if(ave_time < best_ave_time)
+            {
+                found           = true;
+                best_op_id      = i;
+                best_op_name    = op_name;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+
+    if(found) // nothing to report or re-run when no instance supports the problem
+    {
+        std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
+                  << best_op_name << std::endl;
+
+        auto& op_ptr = op_ptrs[best_op_id];
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+
+        auto argument_ptr = op_ptr->MakeArgumentPointer({N, H, W, G, C},    // lengths
+                                                        xy_strides,         // xStrides
+                                                        gamma_beta_strides, // gammaStrides
+                                                        gamma_beta_strides, // betaStrides
+                                                        xy_strides,         // yStrides
+                                                        {1, 2, 4},          // reduceDims (H, W, C)
+                                                        1e-6,
+                                                        x_device_buf.GetDeviceBuffer(),
+                                                        gamma_device_buf.GetDeviceBuffer(),
+                                                        beta_device_buf.GetDeviceBuffer(),
+                                                        y_device_buf.GetDeviceBuffer(),
+                                                        nullptr,
+                                                        nullptr,
+                                                        Swish{});
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+
+        std::cout << "Done" << std::endl;
+    }
+
+    return 0;
+}
diff --git a/example/42_groupnorm/CMakeLists.txt b/example/42_groupnorm/CMakeLists.txt
index c3b7b8259..a9990c5d8 100644
--- a/example/42_groupnorm/CMakeLists.txt
+++ b/example/42_groupnorm/CMakeLists.txt
@@ -1 +1,2 @@
-add_example_executable(example_groupnorm_sigmoid_fp16 groupnorm_sigmoid_fp16.cpp)
+add_example_executable(example_groupnorm_sigmoid_mul_fp16 groupnorm_sigmoid_mul_fp16.cpp)
+add_example_executable(example_groupnorm_swish_fp16 groupnorm_swish_fp16.cpp)
diff --git a/example/42_groupnorm/common.hpp b/example/42_groupnorm/common.hpp
new file mode 100644
index 000000000..e159abf3e
--- /dev/null
+++ b/example/42_groupnorm/common.hpp
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <getopt.h>
+
+#include "ck/ck.hpp"
+#include "ck/utility/reduction_enums.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp"
+#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
+
+#include "ck/library/utility/fill.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_common_util.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp"
diff --git a/example/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp b/example/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp
new file mode 100644
index 000000000..b07a26c4c
--- /dev/null
+++ b/example/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+constexpr int Rank         = 5;
+constexpr int NumReduceDim = 3;
+
+using XDataType       = ck::half_t;
+using GammaDataType   = ck::half_t;
+using BetaDataType    = ck::half_t;
+using YDataType       = ck::half_t;
+using ComputeDataType = float;
+
+// Element-wise output functor handed to DeviceNormalizationImpl: y = x * sigmoid(x).
+struct YElementOp
+{
+    template <typename T>
+    __host__ __device__ void operator()(T& y, const T& x) const
+    {
+        static_assert(ck::is_same<T, float>::value || ck::is_same<T, double>::value ||
+                          ck::is_same<T, ck::half_t>::value,
+                      "Data type is not supported by this operation!");
+
+        T sigmoid_of_x; // written by the Sigmoid functor on the next line
+        ck::tensor_operation::element_wise::Sigmoid{}(sigmoid_of_x, x);
+
+        y = x * sigmoid_of_x;
+    }
+};
+
+using DeviceInstance = // concrete normalization kernel configuration used by this example
+    ck::tensor_operation::device::DeviceNormalizationImpl<XDataType,
+                                                          GammaDataType,
+                                                          BetaDataType,
+                                                          ComputeDataType,
+                                                          YDataType,
+                                                          YElementOp,
+                                                          Rank,
+                                                          NumReduceDim,
+                                                          1024, // BlockSize (= ClusterM * ClusterK)
+                                                          1,    // ClusterM
+                                                          1024, // ClusterK
+                                                          1,    // SliceM
+                                                          32,   // SliceK
+                                                          1,    // SrcVecDim (0=M, 1=K)
+                                                          2,    // SrcScalarPerVector
+                                                          1,    // GammaVecDim (0=M, 1=K)
+                                                          2,    // GammaScalarPerVector
+                                                          1,    // BetaVecDim (0=M, 1=K)
+                                                          2,    // BetaScalarPerVector
+                                                          2>;   // OutScalarPerVector
+
+#include "run_groupnorm_example.inc"
+// Forward the example's int result as the process exit status instead of always exiting 0.
+int main(int argc, char* argv[]) { return run_groupnorm_example(argc, argv); }
diff --git a/example/42_groupnorm/groupnorm_swish_fp16.cpp b/example/42_groupnorm/groupnorm_swish_fp16.cpp
new file mode 100644
index 000000000..c52243bfb
--- /dev/null
+++ b/example/42_groupnorm/groupnorm_swish_fp16.cpp
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+constexpr int Rank         = 5;
+constexpr int NumReduceDim = 3;
+
+using XDataType       = ck::half_t;
+using GammaDataType   = ck::half_t;
+using BetaDataType    = ck::half_t;
+using YDataType       = ck::half_t;
+using ComputeDataType = float;
+using YElementOp      = ck::tensor_operation::element_wise::Swish;
+
+using DeviceInstance = // concrete normalization kernel configuration used by this example
+    ck::tensor_operation::device::DeviceNormalizationImpl<XDataType,
+                                                          GammaDataType,
+                                                          BetaDataType,
+                                                          ComputeDataType,
+                                                          YDataType,
+                                                          YElementOp,
+                                                          Rank,
+                                                          NumReduceDim,
+                                                          1024, // BlockSize (= ClusterM * ClusterK)
+                                                          1,    // ClusterM
+                                                          1024, // ClusterK
+                                                          1,    // SliceM
+                                                          32,   // SliceK
+                                                          1,    // SrcVecDim (0=M, 1=K)
+                                                          2,    // SrcScalarPerVector
+                                                          1,    // GammaVecDim (0=M, 1=K)
+                                                          2,    // GammaScalarPerVector
+                                                          1,    // BetaVecDim (0=M, 1=K)
+                                                          2,    // BetaScalarPerVector
+                                                          2>;   // OutScalarPerVector
+
+#include "run_groupnorm_example.inc"
+// Forward the example's int result as the process exit status instead of always exiting 0.
+int main(int argc, char* argv[]) { return run_groupnorm_example(argc, argv); }
diff --git a/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp b/example/42_groupnorm/run_groupnorm_example.inc
similarity index 54%
rename from example/42_groupnorm/groupnorm_sigmoid_fp16.cpp
rename to example/42_groupnorm/run_groupnorm_example.inc
index 35c7c054e..bd7eb98ca 100644
--- a/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp
+++ b/example/42_groupnorm/run_groupnorm_example.inc
@@ -1,80 +1,15 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 
-#include <iostream>
-#include <numeric>
-#include <initializer_list>
-#include <cstdlib>
-#include <getopt.h>
-
-#include "ck/ck.hpp"
-#include "ck/utility/reduction_enums.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp"
-#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
-
-#include "ck/library/utility/fill.hpp"
-#include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/device_memory.hpp"
-#include "ck/library/utility/host_common_util.hpp"
-#include "ck/library/utility/host_tensor.hpp"
-#include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp"
-
-constexpr int Rank         = 5;
-constexpr int NumReduceDim = 3;
-
-using XDataType       = ck::half_t;
-using GammaDataType   = ck::half_t;
-using BetaDataType    = ck::half_t;
-using YDataType       = ck::half_t;
-using ComputeDataType = float;
-
-struct YElementOp
-{
-    template <typename T>
-    __host__ __device__ void operator()(T& y, const T& x) const
-    {
-        static_assert(ck::is_same<T, float>::value || ck::is_same<T, double>::value ||
-                          ck::is_same<T, ck::half_t>::value,
-                      "Data type is not supported by this operation!");
-
-        T a;
+#pragma once
 
-        ck::tensor_operation::element_wise::Sigmoid{}(a, x);
-
-        y = x * a;
-    };
-};
-
-using DeviceInstance =
-    ck::tensor_operation::device::DeviceNormalizationImpl<XDataType,
-                                                          GammaDataType,
-                                                          BetaDataType,
-                                                          ComputeDataType,
-                                                          YDataType,
-                                                          YElementOp,
-                                                          Rank,
-                                                          NumReduceDim,
-                                                          1024, // BlockSize
-                                                          1,    // ClusterM
-                                                          1024, // ClusterK
-                                                          1,    // SliceM
-                                                          32,   // SliceK
-                                                          1,    // SrcVecDim (0=M, 1=K)
-                                                          2,    // SrcScalarPerVector
-                                                          1,    // GammaVecDim (0=M, 1=K)
-                                                          2,    // GammaScalarPerVector
-                                                          1,    // BetaVecDim (0=M, 1=K)
-                                                          2,    // BetaScalarPerVector
-                                                          2>;   // OutScalarPerVector
-
-int main(int argc, char* argv[])
+int run_groupnorm_example(int argc, char* argv[])
 {
-    ck::index_t N = 2;
-    ck::index_t H = 32;
-    ck::index_t W = 32;
-    ck::index_t G = 32;
-    ck::index_t C = 30;
+    ck::index_t N = 32;
+    ck::index_t H = 16;
+    ck::index_t W = 16;
+    ck::index_t G = 64;
+    ck::index_t C = 128;
 
     if(argc == 1)
     {
diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
index f1f3042ad..2987def02 100644
--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -316,8 +316,6 @@ struct Sigmoid
 
         y = 1 / (ck::type_convert<T>(1) + exp(-x));
     };
-
-    int32_t divider_ = 1;
 };
 
 struct TanH
@@ -333,6 +331,23 @@ struct TanH
     };
 };
 
+// Swish activation: y = x / (1 + exp(-beta * x)), with beta fixed at construction (default 1).
+struct Swish
+{
+    Swish(float beta = 1.0f) : beta_(beta) {}
+
+    template <typename T>
+    __host__ __device__ void operator()(T& y, const T& x) const
+    {
+        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
+                          is_same<T, ck::half_t>::value,
+                      "Data type is not supported by this operation!");
+        const auto denominator = ck::type_convert<T>(1) + ck::math::exp(-beta_ * x);
+        y                      = x / denominator;
+    }
+    float beta_ = 1.0f;
+};
+
 } // namespace element_wise
 } // namespace tensor_operation
 } // namespace ck
diff --git a/include/ck/utility/math.hpp b/include/ck/utility/math.hpp
index 12203bd7f..72071992f 100644
--- a/include/ck/utility/math.hpp
+++ b/include/ck/utility/math.hpp
@@ -168,6 +168,10 @@ __device__ double exp<double>(double x)
     return exp(x);
 }
 
+// Host-only overloads; std::exp picks the float/double overload from the argument type.
+static inline __host__ float exp(float x) { return std::exp(x); }
+static inline __host__ double exp(double x) { return std::exp(x); }
+
 // greatest common divisor, aka highest common factor
 __host__ __device__ constexpr index_t gcd(index_t x, index_t y)
 {
diff --git a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
index f176cb91e..188643952 100644
--- a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
+++ b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
@@ -96,6 +96,7 @@ using FastGelu       = ck::tensor_operation::element_wise::FastGelu;
 using AddMultiply    = ck::tensor_operation::element_wise::AddMultiply;
 using ScaleAdd       = ck::tensor_operation::element_wise::ScaleAdd;
 using Gelu           = ck::tensor_operation::element_wise::Gelu;
+using Swish          = ck::tensor_operation::element_wise::Swish;
 
 template <typename Activation>
 using Activation_Mul_Clamp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<Activation>;
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/normalization_swish.hpp b/library/include/ck/library/tensor_operation_instance/gpu/normalization_swish.hpp
new file mode 100644
index 000000000..c04a54455
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/normalization_swish.hpp
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_normalization.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// FP16, rank 5 / 3 reduced dims; defined in device_groupnorm_swish_f16_instance.cpp
+void add_device_normalization_rank_5_3_swish_f16_instances(
+    std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Swish, 5, 3>>>&);
+
+// FP32, rank 5 / 3 reduced dims; defined in device_groupnorm_swish_f32_instance.cpp
+void add_device_normalization_rank_5_3_swish_f32_instances(
+    std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Swish, 5, 3>>>&);
+
+template <typename XDataType,
+          typename GammaDataType,
+          typename BetaDataType,
+          typename YDataType,
+          index_t Rank,
+          index_t NumReduceDim>
+struct DeviceOperationInstanceFactory<
+    ck::tensor_operation::device::DeviceNormalization<XDataType,
+                                                      GammaDataType,
+                                                      BetaDataType,
+                                                      F32,
+                                                      YDataType,
+                                                      ck::tensor_operation::element_wise::Swish,
+                                                      Rank,
+                                                      NumReduceDim>>
+{
+    using DeviceOp = DeviceNormalization<XDataType,
+                                         GammaDataType,
+                                         BetaDataType,
+                                         F32,
+                                         YDataType,
+                                         ck::tensor_operation::element_wise::Swish,
+                                         Rank,
+                                         NumReduceDim>;
+
+    // Returns the instances registered for the requested data types and rank; the vector is
+    // left empty for any combination other than all-F16 or all-F32 with Rank 5 / 3 reduced dims.
+    static auto GetInstances()
+    {
+        constexpr bool kRank5Reduce3 = (Rank == 5 && NumReduceDim == 3);
+        constexpr bool kAllF16 = is_same_v<XDataType, F16> && is_same_v<GammaDataType, F16> &&
+                                 is_same_v<BetaDataType, F16> && is_same_v<YDataType, F16>;
+        constexpr bool kAllF32 = is_same_v<XDataType, F32> && is_same_v<GammaDataType, F32> &&
+                                 is_same_v<BetaDataType, F32> && is_same_v<YDataType, F32>;
+
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+
+        if constexpr(kRank5Reduce3 && kAllF16)
+        {
+            add_device_normalization_rank_5_3_swish_f16_instances(op_ptrs);
+        }
+        else if constexpr(kRank5Reduce3 && kAllF32)
+        {
+            add_device_normalization_rank_5_3_swish_f32_instances(op_ptrs);
+        }
+
+        return op_ptrs;
+    }
+};
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt
index aa0cc1148..6bed36e35 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt
@@ -1,4 +1,10 @@
 add_instance_library(device_normalization_instance
-    device_normalization_f16_instance.cpp
-    device_normalization_f32_instance.cpp
+    device_layernorm2d_f16_instance.cpp
+    device_layernorm2d_f32_instance.cpp
+    device_layernorm4d_f16_instance.cpp
+    device_layernorm4d_f32_instance.cpp
+    device_groupnorm_f16_instance.cpp
+    device_groupnorm_f32_instance.cpp
+    device_groupnorm_swish_f16_instance.cpp
+    device_groupnorm_swish_f32_instance.cpp
 )
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp
new file mode 100644
index 000000000..e9c2112e1
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "normalization_instance_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using Pass = ck::tensor_operation::element_wise::PassThrough;
+// Group norm, F16 data, PassThrough output op: registers the rank-5 / 3-reduced-dim instances.
+void add_device_normalization_rank_5_3_f16_instances(
+    std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Pass, 5, 3>>>&
+        instances)
+{
+    add_device_operation_instances(instances, device_normalization_f16_instances<Pass, 5, 3>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp
new file mode 100644
index 000000000..79dde38fc
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "normalization_instance_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using Pass = ck::tensor_operation::element_wise::PassThrough;
+// Group norm, F32 data, PassThrough output op: registers the rank-5 / 3-reduced-dim instances.
+void add_device_normalization_rank_5_3_f32_instances(
+    std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Pass, 5, 3>>>&
+        instances)
+{
+    add_device_operation_instances(instances, device_normalization_f32_instances<Pass, 5, 3>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp
new file mode 100644
index 000000000..6241e0338
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "normalization_instance_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using Swish = ck::tensor_operation::element_wise::Swish;
+// Group norm, F16 data, Swish output op: registers the rank-5 / 3-reduced-dim instances.
+void add_device_normalization_rank_5_3_swish_f16_instances(
+    std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Swish, 5, 3>>>&
+        instances)
+{
+    add_device_operation_instances(instances, device_normalization_f16_instances<Swish, 5, 3>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp
new file mode 100644
index 000000000..b64328d5d
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "normalization_instance_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using Swish = ck::tensor_operation::element_wise::Swish;
+// Group norm, F32 data, Swish output op: registers the rank-5 / 3-reduced-dim instances.
+void add_device_normalization_rank_5_3_swish_f32_instances(
+    std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Swish, 5, 3>>>&
+        instances)
+{
+    add_device_operation_instances(instances, device_normalization_f32_instances<Swish, 5, 3>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp
new file mode 100644
index 000000000..d6a2f6f2c
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "normalization_instance_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using Pass = ck::tensor_operation::element_wise::PassThrough;
+// 2D layer norm, F16 data, PassThrough output op: registers the rank-2 / 1-reduced-dim instances.
+void add_device_normalization_rank_2_1_f16_instances(
+    std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Pass, 2, 1>>>&
+        instances)
+{
+    add_device_operation_instances(instances, device_normalization_f16_instances<Pass, 2, 1>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp
new file mode 100644
index 000000000..73097828e
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "normalization_instance_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using Pass = ck::tensor_operation::element_wise::PassThrough;
+// 2D layer norm, F32 data, PassThrough output op: registers the rank-2 / 1-reduced-dim instances.
+void add_device_normalization_rank_2_1_f32_instances(
+    std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Pass, 2, 1>>>&
+        instances)
+{
+    add_device_operation_instances(instances, device_normalization_f32_instances<Pass, 2, 1>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp
new file mode 100644
index 000000000..507a683ee
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "normalization_instance_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using Pass = ck::tensor_operation::element_wise::PassThrough;
+// 4D layer norm, F16 data, PassThrough output op: registers the rank-4 / 3-reduced-dim instances.
+void add_device_normalization_rank_4_3_f16_instances(
+    std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Pass, 4, 3>>>&
+        instances)
+{
+    add_device_operation_instances(instances, device_normalization_f16_instances<Pass, 4, 3>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp
new file mode 100644
index 000000000..ca1aa0c25
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "normalization_instance_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using Pass = ck::tensor_operation::element_wise::PassThrough;
+
+void add_device_normalization_rank_4_3_f32_instances(
+    std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Pass, 4, 3>>>&
+        instances)
+{
+    add_device_operation_instances(instances, device_normalization_f32_instances<Pass, 4, 3>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp
deleted file mode 100644
index beeaa3aa2..000000000
--- a/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp
+++ /dev/null
@@ -1,70 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp"
-#include "ck/utility/data_type.hpp"
-
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-using F16 = ck::half_t;
-using F32 = float;
-
-using Pass = ck::tensor_operation::element_wise::PassThrough;
-
-template <typename OutElementwise, index_t Rank, index_t Reduce>
-// clang-format off
-using device_normalization_f16_instances =
-    std::tuple <
-        // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize>
-        DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 128, 1, 128, 1, 1, 1, 1, 1, 1, 1, 1, 1>,   // irregular size
-        DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 1, 1, 1, 1, 1, 1, 1, 1>,   // irregular size
-        DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 512, 1, 512, 1, 1, 1, 1, 1, 1, 1, 1, 1>,   // irregular size
-        DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 1024, 1, 1024, 1, 1, 1, 1, 1, 1, 1, 1, 1>, // irregular size
-        DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 2, 1, 2, 1, 2, 1, 2, 2>,   // irregular size
-        DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 4, 1, 4, 1, 4, 1, 4, 4>,   // irregular size
-        DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 64, 1, 64, 1, 8, 1, 8, 1, 8, 1, 8, 8>,
-        DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 128, 1, 128, 1, 8, 1, 8, 1, 8, 1, 8, 8>,
-        DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 128, 1, 128, 1, 16, 1, 8, 1, 8, 1, 8, 8>,
-        DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 128, 1, 128, 1, 32, 1, 8, 1, 8, 1, 8, 8>,
-        DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 8, 1, 8, 1, 8, 1, 8, 8>,
-        DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 16, 1, 8, 1, 8, 1, 8, 8>,
-        DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 2, 16, 1, 8, 1, 8, 1, 8, 8>,
-        DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 32, 1, 8, 1, 8, 1, 8, 8>,
-        DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 512, 1, 512, 1, 8, 1, 8, 1, 8, 1, 8, 8>,
-        DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 512, 1, 512, 1, 16, 1, 8, 1, 8, 1, 8, 8>,
-        DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 1024, 1, 1024, 1, 8, 1, 8, 1, 8, 1, 8, 8>,
-        DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 1024, 1, 1024, 1, 16, 1, 8, 1, 8, 1, 8, 8>
-    >;
-// clang-format on
-
-void add_device_normalization_rank_2_1_f16_instances(
-    std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Pass, 2, 1>>>&
-        instances)
-{
-    add_device_operation_instances(instances, device_normalization_f16_instances<Pass, 2, 1>{});
-}
-
-void add_device_normalization_rank_4_3_f16_instances(
-    std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Pass, 4, 3>>>&
-        instances)
-{
-    add_device_operation_instances(instances, device_normalization_f16_instances<Pass, 4, 3>{});
-}
-
-void add_device_normalization_rank_5_3_f16_instances(
-    std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Pass, 5, 3>>>&
-        instances)
-{
-    add_device_operation_instances(instances, device_normalization_f16_instances<Pass, 5, 3>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp
similarity index 53%
rename from library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp
rename to library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp
index 4d236fb63..a58fb6ca3 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp
@@ -1,6 +1,8 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 
+#pragma once
+
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp"
 #include "ck/utility/data_type.hpp"
@@ -12,12 +14,37 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
+using F16 = ck::half_t;
 using F32 = float;
 
-using Pass = ck::tensor_operation::element_wise::PassThrough;
+template <typename OutElementwise, index_t Rank, index_t Reduce>
+using device_normalization_f16_instances =
+    // clang-format off
+    std::tuple <
+        // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, OutElementwise, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize>
+        DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 128, 1, 128, 1, 1, 1, 1, 1, 1, 1, 1, 1>,   // irregular size
+        DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 1, 1, 1, 1, 1, 1, 1, 1>,   // irregular size
+        DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 512, 1, 512, 1, 1, 1, 1, 1, 1, 1, 1, 1>,   // irregular size
+        DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 1024, 1, 1024, 1, 1, 1, 1, 1, 1, 1, 1, 1>, // irregular size
+        DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 2, 1, 2, 1, 2, 1, 2, 2>,   // irregular size
+        DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 4, 1, 4, 1, 4, 1, 4, 4>,   // irregular size
+        DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 64, 1, 64, 1, 8, 1, 8, 1, 8, 1, 8, 8>,
+        DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 128, 1, 128, 1, 8, 1, 8, 1, 8, 1, 8, 8>,
+        DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 128, 1, 128, 1, 16, 1, 8, 1, 8, 1, 8, 8>,
+        DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 128, 1, 128, 1, 32, 1, 8, 1, 8, 1, 8, 8>,
+        DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 8, 1, 8, 1, 8, 1, 8, 8>,
+        DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 16, 1, 8, 1, 8, 1, 8, 8>,
+        DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 2, 16, 1, 8, 1, 8, 1, 8, 8>,
+        DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 32, 1, 8, 1, 8, 1, 8, 8>,
+        DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 512, 1, 512, 1, 8, 1, 8, 1, 8, 1, 8, 8>,
+        DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 512, 1, 512, 1, 16, 1, 8, 1, 8, 1, 8, 8>,
+        DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 1024, 1, 1024, 1, 8, 1, 8, 1, 8, 1, 8, 8>,
+        DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 1024, 1, 1024, 1, 16, 1, 8, 1, 8, 1, 8, 8>
+        // clang-format on
+        >;
 
 template <typename OutElementwise, index_t Rank, index_t Reduce>
-using device_layernorm_f32_instances = std::tuple<
+using device_normalization_f32_instances = std::tuple<
     // clang-format off
         // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize>
         DeviceNormalizationImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 128, 1, 128, 1, 1, 1, 1, 1, 1, 1, 1, 1>,   // irregular size
@@ -42,27 +69,6 @@ using device_layernorm_f32_instances = std::tuple<
     // clang-format on
     >;
 
-void add_device_normalization_rank_2_1_f32_instances(
-    std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Pass, 2, 1>>>&
-        instances)
-{
-    add_device_operation_instances(instances, device_layernorm_f32_instances<Pass, 2, 1>{});
-}
-
-void add_device_normalization_rank_4_3_f32_instances(
-    std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Pass, 4, 3>>>&
-        instances)
-{
-    add_device_operation_instances(instances, device_layernorm_f32_instances<Pass, 4, 3>{});
-}
-
-void add_device_normalization_rank_5_3_f32_instances(
-    std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Pass, 5, 3>>>&
-        instances)
-{
-    add_device_operation_instances(instances, device_layernorm_f32_instances<Pass, 5, 3>{});
-}
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/profiler/include/profiler/profile_groupnorm_impl.hpp b/profiler/include/profiler/profile_groupnorm_impl.hpp
index 81fec5590..73343f6be 100644
--- a/profiler/include/profiler/profile_groupnorm_impl.hpp
+++ b/profiler/include/profiler/profile_groupnorm_impl.hpp
@@ -190,9 +190,9 @@ bool profile_groupnorm_impl(int do_verification,
 
     if(time_kernel)
     {
-        LogRange(std::cout << "length = ", length, ",") << ", ";
-        std::cout << "num_kernel = " << num_kernel << ", best perf = " << best_avg_time << " ms, "
-                  << best_gb_per_sec << " GB/s, " << best_instance_name << std::endl;
+        LogRange(std::cout << "length = ", length, ",") << std::endl;
+        std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
+                  << best_instance_name << std::endl;
     }
 
     if(num_kernel == 0)
-- 
GitLab


From c54f8bcc25ace7b8d9ee86ddeb72738c87f908bb Mon Sep 17 00:00:00 2001
From: zjing14 <zhangjing14@gmail.com>
Date: Tue, 11 Apr 2023 07:44:43 -0500
Subject: [PATCH 14/71] add a macro to turn on/off denorm fix (off by default)
 (#673)

* add a macro to turn off denorm fix by default

* expose the macro

---------

Co-authored-by: root <root@ctr-ubbsmc15.amd.com>
---
 include/ck/ck.hpp                                            | 5 +++++
 .../gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp       | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp
index 9853f19a4..036ca24a4 100644
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -168,6 +168,11 @@
 // flag to enable (1) or disable (0) the debugging output in some kernels
 #define DEBUG_LOG 0
 
+// denorm test fix: set to 1 to enable the workaround for the denorm issue (0 = off, the default)
+#ifndef CK_WORKAROUND_DENORM_FIX
+#define CK_WORKAROUND_DENORM_FIX 0
+#endif
+
 namespace ck {
 
 enum struct InMemoryDataOperationEnum
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp
index d49c96f86..98a71a7c2 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp
@@ -96,7 +96,7 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
     // we convert fp16->fp32->bf16 and execute bf16 mfma instruction
     // when mfma if fixed, remove this section and update
     // ABDataTypeAdjusted -> ABDataType throughout this file
-#if defined(__gfx90a__)
+#if CK_WORKAROUND_DENORM_FIX && defined(__gfx90a__)
     using ABDataTypeAdjusted =
         conditional_t<is_same_v<ABDataType, ck::half_t>, ck::bhalf_t, ABDataType>;
 #else
-- 
GitLab


From c203bf67117ca06b1dbd45b1f88e49c9b8a41db9 Mon Sep 17 00:00:00 2001
From: zjing14 <zhangjing14@gmail.com>
Date: Tue, 11 Apr 2023 07:46:46 -0500
Subject: [PATCH 15/71] fixed quant example (#672)

Co-authored-by: root <root@ctr-ubbsmc15.amd.com>
---
 .../conv2d_fwd_bias_relu_perchannel_quantization.cpp          | 4 ++--
 .../conv2d_fwd_bias_tanh_perchannel_quantization.cpp          | 4 ++--
 .../09_quantization/conv2d_fwd_perchannel_quantization.cpp    | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp b/client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp
index cf6807f0d..a10dd3e00 100644
--- a/client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp
@@ -73,7 +73,7 @@ int main(int argc, char* argv[])
     SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
     SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
     SimpleDeviceMem bias(sizeof(BiasDataType) * K * Y * X * C);
-    SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * K * Y * X * C);
+    SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * K);
     SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
 
     using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
@@ -203,4 +203,4 @@ int main(int argc, char* argv[])
     }
 
     return 0;
-}
\ No newline at end of file
+}
diff --git a/client_example/09_quantization/conv2d_fwd_bias_tanh_perchannel_quantization.cpp b/client_example/09_quantization/conv2d_fwd_bias_tanh_perchannel_quantization.cpp
index 7a216f027..a0e1865d3 100644
--- a/client_example/09_quantization/conv2d_fwd_bias_tanh_perchannel_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_bias_tanh_perchannel_quantization.cpp
@@ -76,7 +76,7 @@ int main(int argc, char* argv[])
     SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
     SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
     SimpleDeviceMem bias(sizeof(BiasDataType) * K * Y * X * C);
-    SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * K * Y * X * C);
+    SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * K);
     SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
 
     using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
@@ -206,4 +206,4 @@ int main(int argc, char* argv[])
     }
 
     return 0;
-}
\ No newline at end of file
+}
diff --git a/client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp b/client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp
index c1c5a651e..6439c22e7 100644
--- a/client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp
@@ -69,7 +69,7 @@ int main(int argc, char* argv[])
 
     SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
     SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
-    SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * K * Y * X * C);
+    SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * K);
     SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
 
     using DeviceOp =
@@ -196,4 +196,4 @@ int main(int argc, char* argv[])
     }
 
     return 0;
-}
\ No newline at end of file
+}
-- 
GitLab


From fd497f0e796f702b36385567bbc259cbd184bdf7 Mon Sep 17 00:00:00 2001
From: Sam Wu <sam.wu2@amd.com>
Date: Tue, 11 Apr 2023 09:18:38 -0600
Subject: [PATCH 16/71] Add dependabot config and pin rocm-docs-core (#663)

---
 .github/dependabot.yml        | 12 ++++++++++++
 docs/.sphinx/requirements.in  |  2 +-
 docs/.sphinx/requirements.txt | 30 +++++++++++++-----------------
 3 files changed, 26 insertions(+), 18 deletions(-)
 create mode 100644 .github/dependabot.yml

diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 000000000..ada22f1b5
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,12 @@
+# To get started with Dependabot version updates, you'll need to specify which
+# package ecosystems to update and where the package manifests are located.
+# Please see the documentation for all configuration options:
+# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
+
+version: 2
+updates:
+  - package-ecosystem: "pip" # See documentation for possible values
+    directory: "/" # Location of package manifests
+    open-pull-requests-limit: 10
+    schedule:
+      interval: "daily"
diff --git a/docs/.sphinx/requirements.in b/docs/.sphinx/requirements.in
index 36a9a4577..1905de6e6 100644
--- a/docs/.sphinx/requirements.in
+++ b/docs/.sphinx/requirements.in
@@ -1,2 +1,2 @@
-git+https://github.com/RadeonOpenCompute/rocm-docs-core.git
+rocm-docs-core==0.2.0
 sphinxcontrib-bibtex==2.5.0
diff --git a/docs/.sphinx/requirements.txt b/docs/.sphinx/requirements.txt
index 8618920ea..d1698b285 100644
--- a/docs/.sphinx/requirements.txt
+++ b/docs/.sphinx/requirements.txt
@@ -2,9 +2,9 @@
 # This file is autogenerated by pip-compile with Python 3.10
 # by the following command:
 #
-#    pip-compile requirements.in
+#    pip-compile .sphinx/requirements.in
 #
-accessible-pygments==0.0.4
+accessible-pygments==0.0.3
     # via pydata-sphinx-theme
 alabaster==0.7.13
     # via sphinx
@@ -20,7 +20,7 @@ babel==2.12.1
     #   sphinx
 backcall==0.2.0
     # via ipython
-beautifulsoup4==4.12.0
+beautifulsoup4==4.11.2
     # via pydata-sphinx-theme
 breathe==4.34.0
     # via rocm-docs-core
@@ -34,7 +34,7 @@ click==8.1.3
     # via
     #   jupyter-cache
     #   sphinx-external-toc
-comm==0.1.3
+comm==0.1.2
     # via ipykernel
 debugpy==1.6.6
     # via ipykernel
@@ -65,13 +65,11 @@ idna==3.4
     # via requests
 imagesize==1.4.1
     # via sphinx
-importlib-metadata==6.1.0
+importlib-metadata==6.0.0
     # via
     #   jupyter-cache
     #   myst-nb
-importlib-resources==5.10.4
-    # via rocm-docs-core
-ipykernel==6.22.0
+ipykernel==6.21.3
     # via myst-nb
 ipython==8.11.0
     # via
@@ -87,7 +85,7 @@ jsonschema==4.17.3
     # via nbformat
 jupyter-cache==0.5.0
     # via myst-nb
-jupyter-client==8.1.0
+jupyter-client==8.0.3
     # via
     #   ipykernel
     #   nbclient
@@ -124,7 +122,7 @@ nbclient==0.5.13
     # via
     #   jupyter-cache
     #   myst-nb
-nbformat==5.8.0
+nbformat==5.7.3
     # via
     #   jupyter-cache
     #   myst-nb
@@ -187,7 +185,7 @@ pyyaml==6.0
     #   myst-parser
     #   pybtex
     #   sphinx-external-toc
-pyzmq==25.0.2
+pyzmq==25.0.1
     # via
     #   ipykernel
     #   jupyter-client
@@ -195,8 +193,8 @@ requests==2.28.2
     # via
     #   pygithub
     #   sphinx
-rocm-docs-core @ git+https://github.com/RadeonOpenCompute/rocm-docs-core.git
-    # via -r requirements.in
+rocm-docs-core==0.2.0
+    # via -r .sphinx/requirements.in
 six==1.16.0
     # via
     #   asttokens
@@ -235,9 +233,7 @@ sphinx-notfound-page==0.8.3
 sphinxcontrib-applehelp==1.0.4
     # via sphinx
 sphinxcontrib-bibtex==2.5.0
-    # via
-    #   -r requirements.in
-    #   rocm-docs-core
+    # via -r .sphinx/requirements.in
 sphinxcontrib-devhelp==1.0.2
     # via sphinx
 sphinxcontrib-htmlhelp==2.0.1
@@ -248,7 +244,7 @@ sphinxcontrib-qthelp==1.0.3
     # via sphinx
 sphinxcontrib-serializinghtml==1.1.5
     # via sphinx
-sqlalchemy==1.4.47
+sqlalchemy==1.4.46
     # via jupyter-cache
 stack-data==0.6.2
     # via ipython
-- 
GitLab


From f5329887133aef6a690aa6dc5e6c0f92d000d8f8 Mon Sep 17 00:00:00 2001
From: Jun Liu <Liu.Jun@amd.com>
Date: Tue, 11 Apr 2023 13:41:49 -0700
Subject: [PATCH 17/71] [gtest] suppress unsafe buffer warn (#670)

ref: https://github.com/ROCmSoftwarePlatform/MIOpen/pull/1912
---
 cmake/googletest.cmake | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cmake/googletest.cmake b/cmake/googletest.cmake
index 3c6cb56cc..d6577ac33 100644
--- a/cmake/googletest.cmake
+++ b/cmake/googletest.cmake
@@ -21,6 +21,7 @@ list(APPEND GTEST_CMAKE_CXX_FLAGS
      -Wno-comma
      -Wno-old-style-cast
      -Wno-deprecated
+     -Wno-unsafe-buffer-usage
 )
 message(STATUS "Suppressing googltest warnings with flags: ${GTEST_CMAKE_CXX_FLAGS}")
 
-- 
GitLab


From e85178b4ca892a78344271ae64103c9d4d1bfc40 Mon Sep 17 00:00:00 2001
From: Haocong WANG <haocwang@amd.com>
Date: Wed, 12 Apr 2023 04:42:47 +0800
Subject: [PATCH 18/71] Add memory index guard in wmma device ops (#667)

---
 .../gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp  | 9 +++++++++
 .../ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp  | 7 +++++++
 2 files changed, 16 insertions(+)

diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp
index 38edace19..d3f81566e 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp
@@ -505,6 +505,15 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle
         }
 
         // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
+        constexpr long_index_t TwoGB = (long_index_t{1} << 31);
+
+        if(!(a_grid_desc_k0_m_k1.GetElementSpaceSize() * sizeof(ADataType) <= TwoGB &&
+             b_grid_desc_k0_n_k1.GetElementSpaceSize() * sizeof(BDataType) <= TwoGB &&
+             e_grid_desc_m_n.GetElementSpaceSize() * sizeof(EDataType) <= TwoGB))
+        {
+            return false;
+        }
+
         return true;
     }
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp
index 1fee302c3..2694aaf6f 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp
@@ -264,6 +264,13 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma
         }
 
         // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
+        constexpr long_index_t TwoGB = (long_index_t{1} << 31);
+
+        if(!(a_grid_desc_k0_m_k1.GetElementSpaceSize() * sizeof(ADataType) <= TwoGB &&
+             b_grid_desc_k0_n_k1.GetElementSpaceSize() * sizeof(BDataType) <= TwoGB))
+        {
+            return false;
+        }
         return true;
     }
 
-- 
GitLab


From 03eaee6ae6a162950c8c9afd665878de2c7d1dd1 Mon Sep 17 00:00:00 2001
From: Rostyslav Geyyer <46627076+geyyer@users.noreply.github.com>
Date: Sat, 15 Apr 2023 21:56:07 -0500
Subject: [PATCH 19/71] Add more macros to turn on/off denorm fix (#678)

Co-authored-by: Rosty Geyyer <rosty.geyyer@amd.com>
---
 .../gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp                | 2 +-
 .../ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp
index 2b66898b1..2da92466b 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp
@@ -265,7 +265,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
     // we convert fp16->fp32->bf16 and execute bf16 mfma instruction
     // when mfma if fixed, remove this section and update
     // FloatABAdjusted -> FloatAB throughout this file
-#if defined(__gfx90a__)
+#if CK_WORKAROUND_DENORM_FIX && defined(__gfx90a__)
     using FloatABAdjusted = conditional_t<is_same_v<FloatAB, ck::half_t>, ck::bhalf_t, FloatAB>;
 #else
     using FloatABAdjusted = FloatAB;
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
index 02b008134..51c578385 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
@@ -135,7 +135,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
     // we convert fp16->fp32->bf16 and execute bf16 mfma instruction
     // when mfma if fixed, remove this section and update
     // FloatABAdjusted -> FloatAB throughout this file
-#if defined(__gfx90a__)
+#if CK_WORKAROUND_DENORM_FIX && defined(__gfx90a__)
     using FloatABAdjusted = conditional_t<is_same_v<FloatAB, ck::half_t>, ck::bhalf_t, FloatAB>;
 #else
     using FloatABAdjusted = FloatAB;
-- 
GitLab


From fc26d42a2e1f5f34c21f8f5e7aaa2fb381b2aceb Mon Sep 17 00:00:00 2001
From: Haocong WANG <haocwang@amd.com>
Date: Sun, 16 Apr 2023 10:57:34 +0800
Subject: [PATCH 20/71] Fix a typo (#676)

---
 include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp
index 2694aaf6f..397ae1c1b 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp
@@ -266,8 +266,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma
         // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
         constexpr long_index_t TwoGB = (long_index_t{1} << 31);
 
-        if(!(a_grid_desc_k0_m_k1.GetElementSpaceSize() * sizeof(ADataType) <= TwoGB &&
-             b_grid_desc_k0_n_k1.GetElementSpaceSize() * sizeof(BDataType) <= TwoGB))
+        if(!(a_grid_desc_k0_m_k1.GetElementSpaceSize() * sizeof(FloatA) <= TwoGB &&
+             b_grid_desc_k0_n_k1.GetElementSpaceSize() * sizeof(FloatB) <= TwoGB))
         {
             return false;
         }
-- 
GitLab


From fd11a4a12a318dee7e0f4e602223846af7934226 Mon Sep 17 00:00:00 2001
From: rocking5566 <ChunYu.Lai@amd.com>
Date: Mon, 17 Apr 2023 23:12:10 +0800
Subject: [PATCH 21/71] Add (#677)

---
 .../18_groupnorm/groupnorm_swish.cpp          |  4 +--
 .../gpu/normalization_swish.hpp               | 12 +++++++++
 .../gpu/normalization/CMakeLists.txt          |  1 +
 ...oupnorm_swish_f16_f32_f32_f16_instance.cpp | 24 +++++++++++++++++
 .../normalization_instance_common.hpp         | 26 +++++++++++++++++++
 5 files changed, 65 insertions(+), 2 deletions(-)
 create mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp

diff --git a/client_example/18_groupnorm/groupnorm_swish.cpp b/client_example/18_groupnorm/groupnorm_swish.cpp
index 8a873e6ac..a79630c23 100644
--- a/client_example/18_groupnorm/groupnorm_swish.cpp
+++ b/client_example/18_groupnorm/groupnorm_swish.cpp
@@ -13,8 +13,8 @@
 #include "ck/library/tensor_operation_instance/gpu/normalization_swish.hpp"
 
 using XDataType       = ck::half_t;
-using GammaDataType   = ck::half_t;
-using BetaDataType    = ck::half_t;
+using GammaDataType   = float;
+using BetaDataType    = float;
 using YDataType       = ck::half_t;
 using ComputeDataType = float;
 using Swish           = ck::tensor_operation::element_wise::Swish;
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/normalization_swish.hpp b/library/include/ck/library/tensor_operation_instance/gpu/normalization_swish.hpp
index c04a54455..367180dea 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/normalization_swish.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/normalization_swish.hpp
@@ -25,6 +25,10 @@ void add_device_normalization_rank_5_3_swish_f16_instances(
 void add_device_normalization_rank_5_3_swish_f32_instances(
     std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Swish, 5, 3>>>&);
 
+// [x, gamma, beta, y] = [f16, f32, f32, f16]
+void add_device_normalization_rank_5_3_swish_f16_f32_f32_f16_instances(
+    std::vector<std::unique_ptr<DeviceNormalization<F16, F32, F32, F32, F16, Swish, 5, 3>>>&);
+
 template <typename XDataType,
           typename GammaDataType,
           typename BetaDataType,
@@ -70,6 +74,14 @@ struct DeviceOperationInstanceFactory<
                 add_device_normalization_rank_5_3_swish_f32_instances(op_ptrs);
             }
         }
+        else if constexpr(is_same_v<XDataType, F16> && is_same_v<GammaDataType, F32> &&
+                          is_same_v<BetaDataType, F32> && is_same_v<YDataType, F16>)
+        {
+            if constexpr(Rank == 5 && NumReduceDim == 3)
+            {
+                add_device_normalization_rank_5_3_swish_f16_f32_f32_f16_instances(op_ptrs);
+            }
+        }
 
         return op_ptrs;
     }
diff --git a/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt
index 6bed36e35..176fb2fbe 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt
@@ -7,4 +7,5 @@ add_instance_library(device_normalization_instance
     device_groupnorm_f32_instance.cpp
     device_groupnorm_swish_f16_instance.cpp
     device_groupnorm_swish_f32_instance.cpp
+    device_groupnorm_swish_f16_f32_f32_f16_instance.cpp
 )
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp
new file mode 100644
index 000000000..9f6bf128f
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "normalization_instance_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using Swish = ck::tensor_operation::element_wise::Swish;
+
+void add_device_normalization_rank_5_3_swish_f16_f32_f32_f16_instances(
+    std::vector<std::unique_ptr<DeviceNormalization<F16, F32, F32, F32, F16, Swish, 5, 3>>>&
+        instances)
+{
+    add_device_operation_instances(instances,
+                                   device_normalization_f16_f32_f32_f16_instances<Swish, 5, 3>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp b/library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp
index a58fb6ca3..9dea41e89 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp
@@ -69,6 +69,32 @@ using device_normalization_f32_instances = std::tuple<
     // clang-format on
     >;
 
+template <typename OutElementwise, index_t Rank, index_t Reduce>
+using device_normalization_f16_f32_f32_f16_instances = std::tuple<
+    // clang-format off
+        // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize>
+        DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce, 128, 1, 128, 1, 1, 1, 1, 1, 1, 1, 1, 1>,   // irregular size
+        DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 1, 1, 1, 1, 1, 1, 1, 1>,   // irregular size
+        DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce, 512, 1, 512, 1, 1, 1, 1, 1, 1, 1, 1, 1>,   // irregular size
+        DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce, 1024, 1, 1024, 1, 1, 1, 1, 1, 1, 1, 1, 1>, // irregular size
+        DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 2, 1, 2, 1, 2, 1, 2, 2>,   // irregular size
+        DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce, 128, 1, 128, 1, 4, 1, 4, 1, 4, 1, 4, 4>,
+        DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce, 128, 1, 128, 1, 8, 1, 4, 1, 4, 1, 4, 4>,
+        DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce, 128, 1, 128, 1, 16, 1, 4, 1, 4, 1, 4, 4>,
+        DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce, 128, 1, 128, 1, 32, 1, 4, 1, 4, 1, 4, 4>,
+        DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 4, 1, 4, 1, 4, 1, 4, 4>,
+        DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 8, 1, 4, 1, 4, 1, 4, 4>,
+        DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 16, 1, 4, 1, 4, 1, 4, 4>,
+        DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 2, 16, 1, 4, 1, 4, 1, 4, 4>,
+        DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 32, 1, 4, 1, 4, 1, 4, 4>,
+        DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce, 512, 1, 512, 1, 4, 1, 4, 1, 4, 1, 4, 4>,
+        DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce, 512, 1, 512, 1, 8, 1, 4, 1, 4, 1, 4, 4>,
+        DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce, 512, 1, 512, 2, 8, 1, 4, 1, 4, 1, 4, 4>,
+        DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce, 1024, 1, 1024, 1, 4, 1, 4, 1, 4, 1, 4, 4>,
+        DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce, 1024, 1, 1024, 1, 8, 1, 4, 1, 4, 1, 4, 4>
+    // clang-format on
+    >;
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
-- 
GitLab


From bb0b772da91632aa26eeda847716801fdc7b4aad Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Tue, 18 Apr 2023 09:22:49 -0700
Subject: [PATCH 22/71] Allow using ROCm release candidate compilers. (#679)

* enable use of rocm5.5 release candidate 4

* upgrade to ROCM5.5 RC5

* try to fix the PUB_KEY error; remove the cmake-data package

* upgrade to latest cmake version

* use private dockerhub repo for rocm5.5 rc5

* add missing bracket
---
 Dockerfile  | 37 ++++++++++++++++++++++++-------------
 Jenkinsfile | 31 +++++++++++++++++++++----------
 2 files changed, 45 insertions(+), 23 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index b03cb836a..cbfd4626c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,6 +1,6 @@
 FROM ubuntu:20.04
 
-ARG ROCMVERSION=5.3
+ARG ROCMVERSION=5.4.3
 ARG compiler_version="release"
 ARG compiler_commit=""
 
@@ -8,23 +8,27 @@ RUN set -xe
 
 ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/
 RUN useradd -rm -d /home/jenkins -s /bin/bash -u 1004 jenkins
-RUN useradd -rm -d /home/manitera -s /bin/bash -u 1002 manitera
 # Add rocm repository
 RUN apt-get update
-RUN apt-get install -y wget gnupg
-RUN wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -
+RUN apt-get install -y wget gnupg curl
+RUN --mount=type=ssh if [ "$ROCMVERSION" != "5.5"]; then \
+	wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - ; \
+    else sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amd-nonfree-radeon_20.04-1_all.deb" && \
+         apt update && apt-get install -y ./amd-nonfree-radeon_20.04-1_all.deb && \
+         sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 5.5 rel-50 > /etc/apt/sources.list.d/rocm-build.list' && \
+         amdgpu-repo --amdgpu-build=1558725 && DEBIAN_FRONTEND=noninteractive amdgpu-install -y --usecase=rocm ; \
+    fi
 RUN sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list"
 RUN wget --no-check-certificate -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add -
 RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
+RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
 
 # Install dependencies
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
     apt-utils \
     build-essential \
     ccache \
-    cmake-data \
     cmake \
-    curl \
     git \
     hip-rocclr \
     jq \
@@ -45,6 +49,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
     rocm-device-libs \
     rocm-cmake \
     vim \
+    nano \
     zlib1g-dev \
     openssh-server \
     clang-format-10 \
@@ -52,6 +57,17 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
+#Install latest version of cmake
+RUN apt purge --auto-remove -y cmake
+RUN apt update
+RUN apt install -y software-properties-common lsb-release
+RUN apt clean all
+RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null
+RUN apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main"
+RUN apt install -y kitware-archive-keyring
+RUN rm /etc/apt/trusted.gpg.d/kitware.gpg
+RUN apt install -y cmake
+
 # Setup ubsan environment to printstacktrace
 RUN ln -s /usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer
 ENV UBSAN_OPTIONS=print_stacktrace=1
@@ -87,12 +103,7 @@ ENV compiler_commit=$compiler_commit
 RUN sh -c "echo compiler version = '$compiler_version'"
 RUN sh -c "echo compiler commit = '$compiler_commit'"
 
-RUN --mount=type=ssh if [ "$compiler_version" = "amd-stg-open" ]; then \
-        sed -i '/$HIP_CLANG_TARGET = chomp($HIP_CLANG_TARGET);/c\    chomp($HIP_CLANG_TARGET);' /opt/rocm/hip/bin/hipcc.pl && \
-        sed -i '/$HIP_CLANG_TARGET = chomp($HIP_CLANG_TARGET);/c\    chomp($HIP_CLANG_TARGET);' /opt/rocm/bin/hipcc.pl; \
-    fi
-
-RUN --mount=type=ssh if [ "$compiler_version" != "release" ] && [ "$compiler_commit" = "" ]; then \
+RUN --mount=type=ssh if [ "$compiler_version" != "release" ] && [ "$compiler_version" !=~ ^"rc" ] && [ "$compiler_commit" = "" ]; then \
         git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \
         cd llvm-project && mkdir build && cd build && \
         cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" ../llvm && \
@@ -100,7 +111,7 @@ RUN --mount=type=ssh if [ "$compiler_version" != "release" ] && [ "$compiler_com
     else echo "using the release compiler"; \
     fi
 
-RUN --mount=type=ssh if [ "$compiler_version" != "release" ] && [ "$compiler_commit" != "" ]; then \
+RUN --mount=type=ssh if [ "$compiler_version" != "release" ] && [ "$compiler_version" !=~ ^"rc" ] && [ "$compiler_commit" != "" ]; then \
         git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \
         cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \
         cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" ../llvm && \
diff --git a/Jenkinsfile b/Jenkinsfile
index bb0b352d7..19ee17cf9 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -19,12 +19,23 @@ def runShell(String command){
 
 def getDockerImageName(){
     def img
-    if (params.COMPILER_COMMIT == ""){
-        img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}"
+    if (params.ROCMVERSION != "5.5"){
+       if (params.COMPILER_COMMIT == ""){
+           img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}"
+       }
+       else{
+           def commit = "${params.COMPILER_COMMIT}"[0..6]
+           img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}_${commit}"
+       }
     }
     else{
-        def commit = "${params.COMPILER_COMMIT}"[0..6]
-        img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}_${commit}"
+       if (params.COMPILER_COMMIT == ""){
+           img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}"
+       }
+       else{
+           def commit = "${params.COMPILER_COMMIT}"[0..6]
+           img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}_${commit}"
+       }
     }
     return img
 }
@@ -49,11 +60,11 @@ def build_compiler(){
         compiler = '/opt/rocm/bin/hipcc'
     }
     else{
-        if (params.COMPILER_VERSION == "release"){
-            compiler = "/opt/rocm/llvm/bin/clang++"
+        if (params.COMPILER_VERSION == "amd-stg-open" || params.COMPILER_COMMIT != ""){
+            compiler = "/llvm-project/build/bin/clang++"
         }
         else{
-            compiler = "/llvm-project/build/bin/clang++"
+            compiler = "/opt/rocm/llvm/bin/clang++"
         }        
     }
     return compiler
@@ -232,7 +243,7 @@ def buildHipClangJob(Map conf=[:]){
             dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
         }
         def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
-        if (params.COMPILER_VERSION != "release"){
+        if (params.COMPILER_VERSION == "amd-stg-open" || params.COMPILER_COMMIT != ""){
             dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
         }
 
@@ -287,7 +298,7 @@ def runCKProfiler(Map conf=[:]){
             dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
         }
         def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
-        if (params.COMPILER_VERSION != "release"){
+        if (params.COMPILER_VERSION == "amd-stg-open" || params.COMPILER_COMMIT != ""){
             dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
         }
 
@@ -420,7 +431,7 @@ def Build_CK(Map conf=[:]){
             dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
         }
         def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
-        if (params.COMPILER_VERSION != "release"){
+        if (params.COMPILER_VERSION == "amd-stg-open" || params.COMPILER_COMMIT != ""){
             dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
         }
 
-- 
GitLab


From 938a5e0e416b645aea0b0833b7362bafe57b1535 Mon Sep 17 00:00:00 2001
From: Sam Wu <sam.wu2@amd.com>
Date: Thu, 20 Apr 2023 21:55:56 -0600
Subject: [PATCH 23/71] Update dependabot config (#682)

Co-authored-by: samjwu <samjwu@users.noreply.github.com>
---
 .github/dependabot.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index ada22f1b5..9cdf2d670 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -6,7 +6,7 @@
 version: 2
 updates:
   - package-ecosystem: "pip" # See documentation for possible values
-    directory: "/" # Location of package manifests
+    directory: "/docs/.sphinx" # Location of package manifests
     open-pull-requests-limit: 10
     schedule:
       interval: "daily"
-- 
GitLab


From 9afa44d40bc01f7f66621e4b7a4cc3dd749b35a4 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Fri, 21 Apr 2023 07:59:26 -0700
Subject: [PATCH 24/71] Switch to the new rocm5.6 compiler. (#681)

* switch to the new rocm5.6 compiler and docker

* fix syntax
---
 Dockerfile  | 18 +++++++++---------
 Jenkinsfile | 42 ++++++++++++++++++++++++++----------------
 2 files changed, 35 insertions(+), 25 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index cbfd4626c..8e6ddb1eb 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,7 +1,7 @@
 FROM ubuntu:20.04
 
-ARG ROCMVERSION=5.4.3
-ARG compiler_version="release"
+ARG ROCMVERSION=5.6
+ARG compiler_version=""
 ARG compiler_commit=""
 
 RUN set -xe
@@ -11,14 +11,14 @@ RUN useradd -rm -d /home/jenkins -s /bin/bash -u 1004 jenkins
 # Add rocm repository
 RUN apt-get update
 RUN apt-get install -y wget gnupg curl
-RUN --mount=type=ssh if [ "$ROCMVERSION" != "5.5"]; then \
-	wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - ; \
+RUN --mount=type=ssh if [ "$ROCMVERSION" != "5.6"]; then \
+	wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
+        sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list"; \
     else sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amd-nonfree-radeon_20.04-1_all.deb" && \
          apt update && apt-get install -y ./amd-nonfree-radeon_20.04-1_all.deb && \
-         sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 5.5 rel-50 > /etc/apt/sources.list.d/rocm-build.list' && \
-         amdgpu-repo --amdgpu-build=1558725 && DEBIAN_FRONTEND=noninteractive amdgpu-install -y --usecase=rocm ; \
+         amdgpu-repo --amdgpu-build=1567752 --rocm-build=compute-rocm-dkms-no-npi-hipclang/11914 && \
+         DEBIAN_FRONTEND=noninteractive amdgpu-install -y --usecase=rocm ; \
     fi
-RUN sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list"
 RUN wget --no-check-certificate -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add -
 RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
 RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
@@ -103,7 +103,7 @@ ENV compiler_commit=$compiler_commit
 RUN sh -c "echo compiler version = '$compiler_version'"
 RUN sh -c "echo compiler commit = '$compiler_commit'"
 
-RUN --mount=type=ssh if [ "$compiler_version" != "release" ] && [ "$compiler_version" !=~ ^"rc" ] && [ "$compiler_commit" = "" ]; then \
+RUN --mount=type=ssh if [ "$compiler_version" = "amd-stg-open" ] && [ "$compiler_commit" = "" ]; then \
         git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \
         cd llvm-project && mkdir build && cd build && \
         cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" ../llvm && \
@@ -111,7 +111,7 @@ RUN --mount=type=ssh if [ "$compiler_version" != "release" ] && [ "$compiler_ver
     else echo "using the release compiler"; \
     fi
 
-RUN --mount=type=ssh if [ "$compiler_version" != "release" ] && [ "$compiler_version" !=~ ^"rc" ] && [ "$compiler_commit" != "" ]; then \
+RUN --mount=type=ssh if [ "$compiler_version" = "amd-stg-open" ] && [ "$compiler_commit" != "" ]; then \
         git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \
         cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \
         cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" ../llvm && \
diff --git a/Jenkinsfile b/Jenkinsfile
index 19ee17cf9..6cb458031 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -19,22 +19,32 @@ def runShell(String command){
 
 def getDockerImageName(){
     def img
-    if (params.ROCMVERSION != "5.5"){
-       if (params.COMPILER_COMMIT == ""){
-           img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}"
+    if (params.ROCMVERSION != "5.5" && params.ROCMVERSION != "5.6"){
+       if (params.COMPILER_VERSION == "") {
+           img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}"
        }
        else{
-           def commit = "${params.COMPILER_COMMIT}"[0..6]
-           img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}_${commit}"
+          if (params.COMPILER_COMMIT == ""){
+             img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}"
+          }
+          else{
+             def commit = "${params.COMPILER_COMMIT}"[0..6]
+             img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}_${commit}"
+          }
        }
     }
     else{
-       if (params.COMPILER_COMMIT == ""){
-           img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}"
+       if (params.COMPILER_VERSION == "") {
+           img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}"
        }
        else{
-           def commit = "${params.COMPILER_COMMIT}"[0..6]
-           img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}_${commit}"
+          if (params.COMPILER_COMMIT == ""){
+             img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}"
+          }
+          else{
+             def commit = "${params.COMPILER_COMMIT}"[0..6]
+             img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}_${commit}"
+          }
        }
     }
     return img
@@ -587,7 +597,7 @@ def process_results(Map conf=[:]){
 
 //launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
 CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true
-                                              0 21 * * * % COMPILER_VERSION=release;COMPILER_COMMIT=
+                                              0 21 * * * % ROCMVERSION=5.4.3;COMPILER_VERSION=release;COMPILER_COMMIT=
                                               0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-stg-open;COMPILER_COMMIT=''' : ""
 
 pipeline {
@@ -605,16 +615,16 @@ pipeline {
             description: "Force building docker image (default: false), set to true if docker image needs to be updated.")
         string(
             name: 'ROCMVERSION', 
-            defaultValue: '5.4.3', 
-            description: 'Specify which ROCM version to use: 5.4.3 (default).')
+            defaultValue: '5.6', 
+            description: 'Specify which ROCM version to use: 5.6 (default).')
         string(
             name: 'COMPILER_VERSION', 
-            defaultValue: 'amd-stg-open', 
-            description: 'Specify which version of compiler to use: ck-9110, release, or amd-stg-open (default).')
+            defaultValue: '', 
+            description: 'Specify which version of compiler to use: release, amd-stg-open, or leave blank (default).')
         string(
             name: 'COMPILER_COMMIT', 
-            defaultValue: '5541927df00eabd6a110180170eca7785d436ee3', 
-            description: 'Specify which commit of compiler branch to use: leave empty to use the latest commit, or use 5541927df00eabd6a110180170eca7785d436ee3 (default) commit of amd-stg-open branch.')
+            defaultValue: '', 
+            description: 'Specify which commit of compiler branch to use: leave blank to use the latest commit, or use 5541927df00eabd6a110180170eca7785d436ee3 (default) commit of amd-stg-open branch.')
         string(
             name: 'BUILD_COMPILER', 
             defaultValue: 'hipcc', 
-- 
GitLab


From 903cd19ce31c27edb7de49d5c77c09c397813de7 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Fri, 21 Apr 2023 17:37:00 -0700
Subject: [PATCH 25/71] Put back the split-k gemm code. (#684)

* simplify karg in device/grid split-k op

* fix mk_kn_mn instances

* add more instances

* use name from tensor layout

---------

Co-authored-by: carlushuang <carlus.huang@amd.com>
---
 .../impl/device_gemm_xdl_splitk_c_shuffle.hpp | 466 ++---------------
 .../gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp  | 484 +++++++++++++-----
 ...l_splitk_f16_f16_f16_mk_kn_mn_instance.cpp |  27 +-
 3 files changed, 438 insertions(+), 539 deletions(-)

diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp
index 0d2aeaeb7..1f08cec67 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp
@@ -73,157 +73,18 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
     static constexpr auto I2 = Number<2>{};
     static constexpr auto I3 = Number<3>{};
 
-    static constexpr auto K1Number = Number<K1>{};
-
-    static auto
-    MakeAGridDescriptor_KBatch_K0_M_K1(index_t M, index_t K, index_t StrideA, int KBatch, int KPad)
-    {
-        assert(KPad % (K1 * KBatch) == 0);
-
-        const index_t K0 = KPad / (K1 * KBatch);
-
-        const auto a_grid_desc_m_k = [&]() {
-            if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1));
-            }
-            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, ALayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA));
-            }
-        }();
-
-        const auto a_grid_desc_m_kpad = transform_tensor_descriptor(
-            a_grid_desc_m_k,
-            make_tuple(make_pass_through_transform(M), make_right_pad_transform(K, KPad - K)),
-            make_tuple(Sequence<0>{}, Sequence<1>{}),
-            make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-        if constexpr(GemmSpec == GemmSpecialization::MNPadding)
-        {
-            const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock;
-            return transform_tensor_descriptor(
-                a_grid_desc_m_kpad,
-                make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1Number)),
-                           make_right_pad_transform(M, PadM)),
-                make_tuple(Sequence<1>{}, Sequence<0>{}),
-                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
-        }
-        else
-        {
-            return transform_tensor_descriptor(
-                a_grid_desc_m_kpad,
-                make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1Number)),
-                           make_pass_through_transform(M)),
-                make_tuple(Sequence<1>{}, Sequence<0>{}),
-                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
-        }
-    }
-
-    static auto
-    MakeBGridDescriptor_KBatch_K0_N_K1(index_t K, index_t N, index_t StrideB, int KBatch, int KPad)
-    {
-        assert(KPad % (K1 * KBatch) == 0);
-
-        const index_t K0 = KPad / (K1 * KBatch);
-
-        const auto b_grid_desc_k_n = [&]() {
-            if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(StrideB, I1));
-            }
-            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(I1, StrideB));
-            }
-        }();
-
-        const auto b_grid_desc_kpad_n = transform_tensor_descriptor(
-            b_grid_desc_k_n,
-            make_tuple(make_right_pad_transform(K, KPad - K), make_pass_through_transform(N)),
-            make_tuple(Sequence<0>{}, Sequence<1>{}),
-            make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-        if constexpr(GemmSpec == GemmSpecialization::MNPadding)
-        {
-            const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock;
-            return transform_tensor_descriptor(
-                b_grid_desc_kpad_n,
-                make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1Number)),
-                           make_right_pad_transform(N, PadN)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
-        }
-        else
-        {
-            return transform_tensor_descriptor(
-                b_grid_desc_kpad_n,
-                make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1Number)),
-                           make_pass_through_transform(N)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
-        }
-    }
-
-    static auto MakeCGridDescriptor_M_N(index_t M, index_t N, index_t StrideC)
-    {
-        const auto c_grid_desc_m_n = [&]() {
-            if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1));
-            }
-            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, CLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideC));
-            }
-        }();
-
-        if constexpr(GemmSpec == GemmSpecialization::MNPadding)
-        {
-            const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock;
-            const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock;
-
-            return transform_tensor_descriptor(
-                c_grid_desc_m_n,
-                make_tuple(make_right_pad_transform(M, PadM), make_right_pad_transform(N, PadN)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else
-        {
-
-            return transform_tensor_descriptor(
-                c_grid_desc_m_n,
-                make_tuple(make_pass_through_transform(M), make_pass_through_transform(N)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-    }
-
-    static auto GetKPad(index_t K, index_t KBatch)
-    {
-        const index_t K0   = math::integer_divide_ceil(K, K1 * K0PerBlock * KBatch) * K0PerBlock;
-        const index_t KPad = KBatch * K0 * K1;
-        return KPad;
-    }
-
-    using AGridDesc_K0_M_K1 = decltype(MakeAGridDescriptor_KBatch_K0_M_K1(1, 1, 1, 1, 1));
-    using BGridDesc_K0_N_K1 = decltype(MakeBGridDescriptor_KBatch_K0_N_K1(1, 1, 1, 1, 1));
-    using CGridDesc_M_N     = decltype(MakeCGridDescriptor_M_N(1, 1, 1));
-
-    // GridwiseGemm
     using GridwiseGemm = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2<
         BlockSize,
         ADataType, // TODO: distinguish A/B datatype
         AccDataType,
         CDataType,
-        InMemoryDataOperationEnum::Set,
-        AGridDesc_K0_M_K1,
-        BGridDesc_K0_N_K1,
-        CGridDesc_M_N,
+        ALayout,
+        BLayout,
+        CLayout,
         AElementwiseOperation,
         BElementwiseOperation,
         CElementwiseOperation,
+        GemmSpec,
         MPerBlock,
         NPerBlock,
         K0PerBlock,
@@ -253,236 +114,64 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
         CBlockTransferScalarPerVector_NWaveNPerXDL,
         CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock>;
 
-    // GridwiseGemm
-    using GridwiseGemmAtomicAdd = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2<
-        BlockSize,
-        ADataType, // TODO: distinguish A/B datatype
-        AccDataType,
-        CDataType,
-        InMemoryDataOperationEnum::AtomicAdd,
-        AGridDesc_K0_M_K1,
-        BGridDesc_K0_N_K1,
-        CGridDesc_M_N,
-        AElementwiseOperation,
-        BElementwiseOperation,
-        CElementwiseOperation,
-        MPerBlock,
-        NPerBlock,
-        K0PerBlock,
-        MPerXDL,
-        NPerXDL,
-        K1,
-        MXdlPerWave,
-        NXdlPerWave,
-        ABlockTransferThreadClusterLengths_K0_M_K1,
-        ABlockTransferThreadClusterArrangeOrder,
-        ABlockTransferSrcAccessOrder,
-        ABlockTransferSrcVectorDim,
-        ABlockTransferSrcScalarPerVector,
-        ABlockTransferDstScalarPerVector_K1,
-        false, // AThreadTransferSrcResetCoordinateAfterRun,
-        ABlockLdsAddExtraM,
-        BBlockTransferThreadClusterLengths_K0_N_K1,
-        BBlockTransferThreadClusterArrangeOrder,
-        BBlockTransferSrcAccessOrder,
-        BBlockTransferSrcVectorDim,
-        BBlockTransferSrcScalarPerVector,
-        BBlockTransferDstScalarPerVector_K1,
-        false, // BThreadTransferSrcResetCoordinateAfterRun,
-        BBlockLdsAddExtraN,
-        CShuffleMRepeatPerShuffle,
-        CShuffleNRepeatPerShuffle,
-        CBlockTransferScalarPerVector_NWaveNPerXDL,
-        CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock>;
-
-    using CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
-        decltype(GridwiseGemm::MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(CGridDesc_M_N{}));
-
-    using Block2CTileMap = typename GridwiseGemm::CBlockClusterAdaptor;
-
-    // Argument
-    struct Argument : public BaseArgument
-    {
-        Argument(const ADataType* p_a_grid,
-                 const BDataType* p_b_grid,
-                 CDataType* p_c_grid,
-                 index_t M,
-                 index_t N,
-                 index_t K,
-                 index_t StrideA,
-                 index_t StrideB,
-                 index_t StrideC,
-                 index_t M01,
-                 index_t N01,
-                 AElementwiseOperation a_element_op,
-                 BElementwiseOperation b_element_op,
-                 CElementwiseOperation c_element_op,
-                 index_t k_batch)
-            : p_a_grid_{p_a_grid},
-              p_b_grid_{p_b_grid},
-              p_c_grid_{p_c_grid},
-              a_grid_desc_kbatch_k0_m_k1_{},
-              b_grid_desc_kbatch_k0_n_k1_{},
-              c_grid_desc_m_n_{},
-              c_grid_desc_mblock_mperblock_nblock_nperblock_{},
-              block_2_ctile_map_{},
-              M01_{M01},
-              N01_{N01},
-              a_element_op_{a_element_op},
-              b_element_op_{b_element_op},
-              c_element_op_{c_element_op},
-              k_batch_{k_batch}
-        {
-            int KPad = DeviceGemmXdlSplitKCShuffle::GetKPad(K, k_batch_);
-
-            a_grid_desc_kbatch_k0_m_k1_ =
-                DeviceGemmXdlSplitKCShuffle::MakeAGridDescriptor_KBatch_K0_M_K1(
-                    M, K, StrideA, k_batch_, KPad);
-            b_grid_desc_kbatch_k0_n_k1_ =
-                DeviceGemmXdlSplitKCShuffle::MakeBGridDescriptor_KBatch_K0_N_K1(
-                    K, N, StrideB, k_batch_, KPad);
-            c_grid_desc_m_n_ = DeviceGemmXdlSplitKCShuffle::MakeCGridDescriptor_M_N(M, N, StrideC);
-
-            block_2_ctile_map_ =
-                GridwiseGemm::MakeCBlockClusterAdaptor(c_grid_desc_m_n_, M01, N01, k_batch_);
-
-            if(GridwiseGemm::CheckValidity(a_grid_desc_kbatch_k0_m_k1_,
-                                           b_grid_desc_kbatch_k0_n_k1_,
-                                           c_grid_desc_m_n_,
-                                           block_2_ctile_map_))
-            {
-                c_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                    GridwiseGemm::MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(c_grid_desc_m_n_);
-            }
-        }
-
-        //  private:
-        const ADataType* p_a_grid_;
-        const BDataType* p_b_grid_;
-        CDataType* p_c_grid_;
-        AGridDesc_K0_M_K1 a_grid_desc_kbatch_k0_m_k1_;
-        BGridDesc_K0_N_K1 b_grid_desc_kbatch_k0_n_k1_;
-        CGridDesc_M_N c_grid_desc_m_n_;
-        CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock_;
-        Block2CTileMap block_2_ctile_map_;
-        index_t M01_;
-        index_t N01_;
-        AElementwiseOperation a_element_op_;
-        BElementwiseOperation b_element_op_;
-        CElementwiseOperation c_element_op_;
-        index_t k_batch_;
-    };
+    using Argument = typename GridwiseGemm::Argument;
 
     // Invoker
     struct Invoker : public BaseInvoker
     {
-        using Argument = DeviceGemmXdlSplitKCShuffle::Argument;
 
-        void Print(const Argument& arg)
-        {
-            std::cout << "arg.a_grid_desc_kbatch_k0_m_k1_{"
-                      << arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I0) << ", "
-                      << arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I1) << ", "
-                      << arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I2) << ", "
-                      << arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I3) << "}" << std::endl;
-
-            std::cout << "arg.b_grid_desc_kbatch_k0_n_k1_{"
-                      << arg.b_grid_desc_kbatch_k0_n_k1_.GetLength(I0) << ", "
-                      << arg.b_grid_desc_kbatch_k0_n_k1_.GetLength(I1) << ", "
-                      << arg.b_grid_desc_kbatch_k0_n_k1_.GetLength(I2) << ", "
-                      << arg.b_grid_desc_kbatch_k0_n_k1_.GetLength(I3) << "}" << std::endl;
-
-            std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", "
-                      << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
-        }
+        void Print(const Argument& karg) { karg.Print(); }
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        float Run(const Argument& karg, const StreamConfig& stream_config = StreamConfig{})
         {
             if(stream_config.log_level_ > 0)
             {
-                Print(arg);
+                Print(karg);
             }
 
-            const auto kbatch = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I0);
+            const auto kbatch = karg.k_batch;
 
-            if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_,
-                                            arg.b_grid_desc_kbatch_k0_n_k1_,
-                                            arg.c_grid_desc_m_n_,
-                                            arg.block_2_ctile_map_))
+            if(!GridwiseGemm::CheckValidity(karg))
             {
                 throw std::runtime_error(
-                    "wrong! GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 has invalid setting");
+                    "wrong! GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 has invalid "
+                    "setting");
             }
 
-            const index_t grid_size =
-                arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_);
-
-            const auto K0 = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I1);
+            index_t gdx, gdy, gdz;
+            std::tie(gdx, gdy, gdz) = GridwiseGemm::CalculateGridSize(karg);
+            const auto K0           = karg.K0;
 
             const bool has_main_k0_block_loop = GridwiseGemm::CalculateHasMainK0BlockLoop(K0);
 
             float ave_time = 0;
 
             const auto Run = [&](const auto& kernel) {
-                hipGetErrorString(hipMemset(
-                    arg.p_c_grid_,
-                    0,
-                    arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() *
-                        sizeof(CDataType)));
-
-                ave_time =
-                    launch_and_time_kernel(stream_config,
-                                           kernel,
-                                           dim3(grid_size),
-                                           dim3(BlockSize),
-                                           0,
-                                           arg.p_a_grid_,
-                                           arg.p_b_grid_,
-                                           arg.p_c_grid_,
-                                           arg.a_grid_desc_kbatch_k0_m_k1_,
-                                           arg.b_grid_desc_kbatch_k0_n_k1_,
-                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                           arg.a_element_op_,
-                                           arg.b_element_op_,
-                                           arg.c_element_op_,
-                                           arg.block_2_ctile_map_);
+                // split-K AtomicAdd needs C zeroed; size_t math avoids M * N index_t overflow
+                const size_t c_bytes = sizeof(CDataType) * karg.M * karg.N;
+                if(kbatch > 1 && hipMemset(karg.p_c_grid, 0, c_bytes) != hipSuccess)
+                    throw std::runtime_error("wrong! hipMemset failed to zero C for split-K");
+                ave_time = launch_and_time_kernel(
+                    stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, karg);
             };
 
             if(has_main_k0_block_loop)
             {
                 if(kbatch == 1)
                 {
-                    const auto kernel = kernel_gemm_xdlops_v2r4r2<
-                        GridwiseGemm,
-                        ADataType, // TODO: distiguish A/B datatype
-                        CDataType,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::AGridDesc_K0_M_K1>,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::BGridDesc_K0_N_K1>,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::
-                                               CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
-                        AElementwiseOperation,
-                        BElementwiseOperation,
-                        CElementwiseOperation,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::Block2CTileMap>,
-                        true>;
+                    const auto kernel =
+                        kernel_gemm_xdlops_v2r4r2_simplified<GridwiseGemm,
+                                                             true,
+                                                             InMemoryDataOperationEnum::Set>;
 
                     Run(kernel);
                 }
                 else
                 {
-                    const auto kernel = kernel_gemm_xdlops_v2r4r2<
-                        GridwiseGemmAtomicAdd,
-                        ADataType, // TODO: distiguish A/B datatype
-                        CDataType,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::AGridDesc_K0_M_K1>,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::BGridDesc_K0_N_K1>,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::
-                                               CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
-                        AElementwiseOperation,
-                        BElementwiseOperation,
-                        CElementwiseOperation,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::Block2CTileMap>,
-                        true>;
+                    const auto kernel =
+                        kernel_gemm_xdlops_v2r4r2_simplified<GridwiseGemm,
+                                                             true,
+                                                             InMemoryDataOperationEnum::AtomicAdd>;
 
                     Run(kernel);
                 }
@@ -491,37 +180,19 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
             {
                 if(kbatch == 1)
                 {
-                    const auto kernel = kernel_gemm_xdlops_v2r4r2<
-                        GridwiseGemm,
-                        ADataType, // TODO: distiguish A/B datatype
-                        CDataType,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::AGridDesc_K0_M_K1>,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::BGridDesc_K0_N_K1>,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::
-                                               CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
-                        AElementwiseOperation,
-                        BElementwiseOperation,
-                        CElementwiseOperation,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::Block2CTileMap>,
-                        false>;
+                    const auto kernel =
+                        kernel_gemm_xdlops_v2r4r2_simplified<GridwiseGemm,
+                                                             false,
+                                                             InMemoryDataOperationEnum::Set>;
 
                     Run(kernel);
                 }
                 else
                 {
-                    const auto kernel = kernel_gemm_xdlops_v2r4r2<
-                        GridwiseGemmAtomicAdd,
-                        ADataType, // TODO: distiguish A/B datatype
-                        CDataType,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::AGridDesc_K0_M_K1>,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::BGridDesc_K0_N_K1>,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::
-                                               CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
-                        AElementwiseOperation,
-                        BElementwiseOperation,
-                        CElementwiseOperation,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::Block2CTileMap>,
-                        false>;
+                    const auto kernel =
+                        kernel_gemm_xdlops_v2r4r2_simplified<GridwiseGemm,
+                                                             false,
+                                                             InMemoryDataOperationEnum::AtomicAdd>;
 
                     Run(kernel);
                 }
@@ -544,12 +215,9 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
         return true;
     }
 
-    static bool IsSupportedArgument(const Argument& arg)
+    static bool IsSupportedArgument(const Argument& karg)
     {
-        return GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_,
-                                           arg.b_grid_desc_kbatch_k0_n_k1_,
-                                           arg.c_grid_desc_m_n_,
-                                           arg.block_2_ctile_map_);
+        return GridwiseGemm::CheckValidity(karg);
     }
 
     // polymorphic
@@ -567,9 +235,9 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                              index_t StrideA,
                              index_t StrideB,
                              index_t StrideC,
-                             AElementwiseOperation a_element_op,
-                             BElementwiseOperation b_element_op,
-                             CElementwiseOperation c_element_op,
+                             AElementwiseOperation,
+                             BElementwiseOperation,
+                             CElementwiseOperation,
                              index_t KBatch)
     {
         return Argument{p_a,
@@ -581,11 +249,10 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                         StrideA,
                         StrideB,
                         StrideC,
-                        1,
-                        1,
-                        a_element_op,
-                        b_element_op,
-                        c_element_op,
+                        GridwiseGemm::CalculateMPadded(M),
+                        GridwiseGemm::CalculateNPadded(N),
+                        GridwiseGemm::CalculateKPadded(K, KBatch), // same KBatch as K0 below
+                        GridwiseGemm::CalculateK0(K, KBatch),
                         KBatch};
     }
 
@@ -601,9 +268,9 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                                                       index_t StrideA,
                                                       index_t StrideB,
                                                       index_t StrideC,
-                                                      AElementwiseOperation a_element_op,
-                                                      BElementwiseOperation b_element_op,
-                                                      CElementwiseOperation c_element_op,
+                                                      AElementwiseOperation,
+                                                      BElementwiseOperation,
+                                                      CElementwiseOperation,
                                                       ck::index_t KBatch = 1) override
     {
         return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
@@ -615,11 +282,10 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                                           StrideA,
                                           StrideB,
                                           StrideC,
-                                          1,
-                                          1,
-                                          a_element_op,
-                                          b_element_op,
-                                          c_element_op,
+                                          GridwiseGemm::CalculateMPadded(M),
+                                          GridwiseGemm::CalculateNPadded(N),
+                                          GridwiseGemm::CalculateKPadded(K, KBatch), // as K0
+                                          GridwiseGemm::CalculateK0(K, KBatch),
                                           KBatch);
     }
 
@@ -630,31 +296,7 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
     }
 
     // polymorphic
-    std::string GetTypeString() const override
-    {
-        auto str = std::stringstream();
-
-        // clang-format off
-        str << "DeviceGemmXdlSplitKCShuffle"
-            << "<"
-            << BlockSize << ", "
-            << MPerBlock << ", "
-            << NPerBlock << ", "
-            << K0PerBlock << ", "
-            << K1 << ", "
-            << MPerXDL << ", "
-            << NPerXDL << ", "
-            << MXdlPerWave << ", "
-            << NXdlPerWave << ", "
-            << ABlockTransferSrcScalarPerVector << ", "
-            << ABlockTransferDstScalarPerVector_K1 << ", "
-            << BBlockTransferSrcScalarPerVector << ", "
-            << BBlockTransferDstScalarPerVector_K1
-            << ">";
-        // clang-format on
-
-        return str.str();
-    }
+    std::string GetTypeString() const override { return GridwiseGemm::GetTypeString(); }
 };
 
 } // namespace device
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
index 190194f1e..727f180e9 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
@@ -18,60 +18,23 @@
 namespace ck {
 
 template <typename GridwiseGemm,
-          typename FloatAB,
-          typename FloatC,
-          typename AGridDesc_B_K0_M_K1,
-          typename BGridDesc_B_K0_N_K1,
-          typename CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
-          typename AElementwiseOperation,
-          typename BElementwiseOperation,
-          typename CElementwiseOperation,
-          typename CBlockClusterAdaptor,
-          bool HasMainKBlockLoop>
+          bool HasMainKBlockLoop,
+          InMemoryDataOperationEnum CGlobalMemoryDataOperation>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
     __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
-        kernel_gemm_xdlops_v2r4r2(const FloatAB* __restrict__ p_a_grid,
-                                  const FloatAB* __restrict__ p_b_grid,
-                                  FloatC* __restrict__ p_c_grid,
-                                  const AGridDesc_B_K0_M_K1 a_b_k0_m_k1_grid_desc,
-                                  const BGridDesc_B_K0_N_K1 b_b_k0_n_k1_grid_desc,
-                                  const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
-                                      c_grid_desc_mblock_mperblock_nblock_nperblock,
-                                  const AElementwiseOperation a_element_op,
-                                  const BElementwiseOperation b_element_op,
-                                  const CElementwiseOperation c_element_op,
-                                  const CBlockClusterAdaptor c_block_cluster_adaptor)
+        kernel_gemm_xdlops_v2r4r2_simplified(typename GridwiseGemm::Argument karg)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
-    constexpr index_t shared_block_size =
-        GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
-
-    __shared__ FloatAB p_shared_block[shared_block_size];
-
-    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
-                                                  p_b_grid,
-                                                  p_c_grid,
-                                                  static_cast<void*>(p_shared_block),
-                                                  a_b_k0_m_k1_grid_desc,
-                                                  b_b_k0_n_k1_grid_desc,
-                                                  c_grid_desc_mblock_mperblock_nblock_nperblock,
-                                                  a_element_op,
-                                                  b_element_op,
-                                                  c_element_op,
-                                                  c_block_cluster_adaptor);
+    constexpr index_t shared_size = GridwiseGemm::GetSharedMemoryNumberOfByte();
+
+    __shared__ uint8_t p_shared[shared_size];
+
+    GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation>(
+        karg, static_cast<void*>(p_shared));
 #else
-    ignore = p_a_grid;
-    ignore = p_b_grid;
-    ignore = p_c_grid;
-    ignore = a_b_k0_m_k1_grid_desc;
-    ignore = b_b_k0_n_k1_grid_desc;
-    ignore = c_grid_desc_mblock_mperblock_nblock_nperblock;
-    ignore = a_element_op;
-    ignore = b_element_op;
-    ignore = c_element_op;
-    ignore = c_block_cluster_adaptor;
+    ignore = karg;
 #endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
 }
 
@@ -79,13 +42,13 @@ template <index_t BlockSize,
           typename FloatAB,
           typename FloatAcc,
           typename FloatC,
-          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
-          typename AGridDesc_B_K0_M_K1,
-          typename BGridDesc_B_K0_N_K1,
-          typename CMNGridDesc,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout,
           typename AElementwiseOperation,
           typename BElementwiseOperation,
           typename CElementwiseOperation,
+          tensor_operation::device::GemmSpecialization GemmSpec,
           index_t MPerBlock,
           index_t NPerBlock,
           index_t K0PerBlock,
@@ -126,10 +89,238 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
     static constexpr auto I7 = Number<7>{};
 
     // K1 should be Number<...>
-    static constexpr auto K1 = Number<K1Value>{};
+    static constexpr auto K1  = Number<K1Value>{};
+    static constexpr auto M01 = 1;
+    static constexpr auto N01 = 1;
 
     using ThisThreadBlock = ThisThreadBlock<BlockSize>;
 
+    struct Argument : public ck::tensor_operation::device::BaseArgument
+    {
+        const FloatAB* p_a_grid;
+        const FloatAB* p_b_grid;
+        FloatC* p_c_grid;
+        index_t M;
+        index_t N;
+        index_t K;
+        index_t StrideA;
+        index_t StrideB;
+        index_t StrideC;
+        index_t MPadded;
+        index_t NPadded;
+        index_t KPadded;
+        index_t K0;
+        index_t k_batch;
+
+        Argument(const FloatAB* p_a_grid_,
+                 const FloatAB* p_b_grid_,
+                 FloatC* p_c_grid_,
+                 index_t M_,
+                 index_t N_,
+                 index_t K_,
+                 index_t StrideA_,
+                 index_t StrideB_,
+                 index_t StrideC_,
+                 index_t MPadded_,
+                 index_t NPadded_,
+                 index_t KPadded_,
+                 index_t K0_,
+                 index_t k_batch_)
+            : p_a_grid(p_a_grid_),
+              p_b_grid(p_b_grid_),
+              p_c_grid(p_c_grid_),
+              M(M_),
+              N(N_),
+              K(K_),
+              StrideA(StrideA_),
+              StrideB(StrideB_),
+              StrideC(StrideC_),
+              MPadded(MPadded_),
+              NPadded(NPadded_),
+              KPadded(KPadded_),
+              K0(K0_),
+              k_batch(k_batch_)
+        {
+        }
+
+        void Print() const
+        {
+            std::cout << "arg {"
+                      << "M:" << M << ", "
+                      << "N:" << N << ", "
+                      << "K:" << K << ", "
+                      << "SA:" << StrideA << ", "
+                      << "SB:" << StrideB << ", "
+                      << "SC:" << StrideC << ", "
+                      << "MP:" << MPadded << ", "
+                      << "NP:" << NPadded << ", "
+                      << "KP:" << KPadded << ", "
+                      << "K0:" << K0 << ", "
+                      << "KB:" << k_batch << "}" << std::endl;
+        }
+    };
+
+    __host__ __device__ static auto CalculateGridSize(const Argument& karg)
+    {
+        return std::make_tuple(math::integer_divide_ceil(karg.N, NPerBlock),
+                               math::integer_divide_ceil(karg.M, MPerBlock),
+                               karg.k_batch);
+    }
+
+    // prefer this to be called on host
+    __host__ __device__ static auto CalculateMPadded(index_t M)
+    {
+        return (M + MPerBlock - 1) / MPerBlock * MPerBlock;
+    }
+
+    __host__ __device__ static auto CalculateNPadded(index_t N)
+    {
+        return (N + NPerBlock - 1) / NPerBlock * NPerBlock;
+    }
+
+    __host__ __device__ static auto CalculateK0(index_t K, index_t K_Batch = 1)
+    {
+        // K is covered by K_Batch * K0 * K1, with K0 rounded up to a multiple of K0PerBlock
+        auto K_t = K_Batch * K0PerBlock * K1;
+        return (K + K_t - 1) / K_t * K0PerBlock;
+    }
+
+    __host__ __device__ static auto CalculateKPadded(index_t K, index_t K_Batch = 1)
+    {
+        auto K0 = CalculateK0(K, K_Batch);
+        return K_Batch * K0 * K1;
+    }
+
+    __host__ __device__ static auto MakeAGridDescriptor_KBatch_K0_M_K1(index_t M,
+                                                                       index_t MPad,
+                                                                       index_t K,
+                                                                       index_t StrideA,
+                                                                       index_t KBatch,
+                                                                       index_t K0,
+                                                                       index_t KPad)
+    {
+        const auto a_grid_desc_m_k = [&]() {
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, ALayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA));
+            }
+        }();
+
+        const auto a_grid_desc_m_kpad = transform_tensor_descriptor(
+            a_grid_desc_m_k,
+            make_tuple(make_pass_through_transform(M), make_right_pad_transform(K, KPad - K)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+        if constexpr(GemmSpec == tensor_operation::device::GemmSpecialization::MPadding ||
+                     GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
+                     GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding ||
+                     GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding)
+        {
+            // right-pad M up to MPad (expected to be M rounded up to a multiple of MPerBlock)
+            return transform_tensor_descriptor(
+                a_grid_desc_m_kpad,
+                make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1)),
+                           make_right_pad_transform(M, MPad - M)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
+        }
+        else
+        {
+            return transform_tensor_descriptor(
+                a_grid_desc_m_kpad,
+                make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1)),
+                           make_pass_through_transform(M)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
+        }
+    }
+
+    __host__ __device__ static auto MakeBGridDescriptor_KBatch_K0_N_K1(index_t K,
+                                                                       index_t NPad,
+                                                                       index_t N,
+                                                                       index_t StrideB,
+                                                                       index_t KBatch,
+                                                                       index_t K0,
+                                                                       index_t KPad)
+    {
+        const auto b_grid_desc_k_n = [&]() {
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(StrideB, I1));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(I1, StrideB));
+            }
+        }();
+
+        const auto b_grid_desc_kpad_n = transform_tensor_descriptor(
+            b_grid_desc_k_n,
+            make_tuple(make_right_pad_transform(K, KPad - K), make_pass_through_transform(N)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+        if constexpr(GemmSpec == tensor_operation::device::GemmSpecialization::NPadding ||
+                     GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
+                     GemmSpec == tensor_operation::device::GemmSpecialization::NKPadding ||
+                     GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding)
+        {
+            // right-pad N up to NPad (expected to be N rounded up to a multiple of NPerBlock)
+            return transform_tensor_descriptor(
+                b_grid_desc_kpad_n,
+                make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1)),
+                           make_right_pad_transform(N, NPad - N)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
+        }
+        else
+        {
+            return transform_tensor_descriptor(
+                b_grid_desc_kpad_n,
+                make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1)),
+                           make_pass_through_transform(N)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
+        }
+    }
+
+    // Describe C as an (M, N) view with CLayout strides, right-padded up to (MPad, NPad) for
+    // every GemmSpec that pads M or N -- A/B already pad for all M*/N* specs, not only
+    // MNPadding. NOTE(review): assumes a dimension the spec does not pad has MPad==M/NPad==N.
+    __host__ __device__ static auto
+    MakeCGridDescriptor_M_N(index_t M, index_t N, index_t MPad, index_t NPad, index_t StrideC)
+    {
+        const auto c_grid_desc_m_n = [&]() {
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
+                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1));
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, CLayout>::value)
+                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideC));
+        }();
+
+        if constexpr(GemmSpec == tensor_operation::device::GemmSpecialization::Default ||
+                     GemmSpec == tensor_operation::device::GemmSpecialization::KPadding)
+        {
+            return transform_tensor_descriptor(
+                c_grid_desc_m_n,
+                make_tuple(make_pass_through_transform(M), make_pass_through_transform(N)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+        else
+        {
+            return transform_tensor_descriptor(c_grid_desc_m_n,
+                                               make_tuple(make_right_pad_transform(M, MPad - M),
+                                                          make_right_pad_transform(N, NPad - N)),
+                                               make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                               make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+    }
+
     __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
     {
         constexpr auto max_lds_align = K1;
@@ -178,45 +369,68 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
                          c_block_size * sizeof(FloatC));
     }
 
-    // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
-    template <typename Block2CTileMap>
-    __host__ __device__ static constexpr bool
-    CheckValidity(const AGridDesc_B_K0_M_K1& a_b_k0_m_k1_grid_desc,
-                  const BGridDesc_B_K0_N_K1& b_b_k0_n_k1_grid_desc,
-                  const CMNGridDesc& c_m_n_grid_desc,
-                  const Block2CTileMap& block_2_ctile_map)
+    __host__ __device__ static constexpr bool CheckValidity(const Argument& karg)
     {
-        static_assert(is_known_at_compile_time<remove_cv_t<decltype(K1)>>::value,
-                      "wrong! K1 need to be known at compile-time");
-
-        static_assert((MPerBlock % (MPerXDL * MRepeat) == 0) &&
-                          (NPerBlock % (NRepeat * NPerXDL)) == 0,
-                      "Invalid tuning param!");
-
-        const auto M      = a_b_k0_m_k1_grid_desc.GetLength(I2);
-        const auto N      = b_b_k0_n_k1_grid_desc.GetLength(I2);
-        const auto K0     = a_b_k0_m_k1_grid_desc.GetLength(I1);
-        const auto KBatch = a_b_k0_m_k1_grid_desc.GetLength(I0);
+        if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::MPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding))
+        {
+            if(!(karg.M % MPerBlock == 0))
+                return false;
+        }
+        if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::NPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::NKPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding))
+        {
+            if(!(karg.N % NPerBlock == 0))
+                return false;
+        }
 
-        if(!(M == c_m_n_grid_desc.GetLength(I0) && N == c_m_n_grid_desc.GetLength(I1) &&
-             K0 == b_b_k0_n_k1_grid_desc.GetLength(I1) &&
-             K1 == a_b_k0_m_k1_grid_desc.GetLength(I3) &&
-             K1 == b_b_k0_n_k1_grid_desc.GetLength(I3) &&
-             KBatch == b_b_k0_n_k1_grid_desc.GetLength(I0)))
-            return false;
+        if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
+        {
+            if(karg.K % ABlockTransferSrcScalarPerVector != 0)
+                return false;
+        }
+        else
+        {
+            if(karg.M % ABlockTransferSrcScalarPerVector != 0)
+                return false;
+        }
 
-        if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K0 % K0PerBlock == 0))
-            return false;
+        if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
+        {
+            if(karg.N % BBlockTransferSrcScalarPerVector != 0)
+                return false;
+        }
+        else
+        {
+            if(karg.K % BBlockTransferSrcScalarPerVector != 0)
+                return false;
+        }
 
-        if(!block_2_ctile_map.CheckValidity(c_m_n_grid_desc))
+        if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
         {
-            return false;
+            if(karg.N % CBlockTransferScalarPerVector_NWaveNPerXDL != 0)
+                return false;
+        }
+        else
+        {
+            if(karg.M % CBlockTransferScalarPerVector_NWaveNPerXDL != 0)
+                return false;
         }
 
-        // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
         return true;
     }
 
+    __host__ __device__ static auto GetKPad(index_t K, index_t KBatch)
+    {
+        const index_t K0   = math::integer_divide_ceil(K, K1 * K0PerBlock * KBatch) * K0PerBlock;
+        const index_t KPad = KBatch * K0 * K1;
+        return KPad;
+    }
+
     __host__ __device__ static constexpr bool CalculateHasMainK0BlockLoop(index_t K0)
     {
         const bool has_main_k0_block_loop = K0 > K0PerBlock;
@@ -224,8 +438,9 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
         return has_main_k0_block_loop;
     }
 
+    template <typename CGridDesc>
     __host__ __device__ static constexpr auto
-    MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(const CMNGridDesc& c_m_n_grid_desc)
+    MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(const CGridDesc& c_m_n_grid_desc)
     {
         const auto M = c_m_n_grid_desc.GetLength(I0);
         const auto N = c_m_n_grid_desc.GetLength(I1);
@@ -242,10 +457,11 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
     }
 
     // return block_id to C matrix tile idx (m0, n0) mapping
+    template <typename CGridDesc>
     __host__ __device__ static constexpr auto MakeCBlockClusterAdaptor(
-        const CMNGridDesc& c_m_n_grid_desc, index_t /* M01 */, index_t /* N01 */, index_t KBatch)
+        const CGridDesc& c_m_n_grid_desc, index_t /* M01 */, index_t /* N01 */, index_t KBatch)
     {
-        return BlockToCTileMap_KSplit_M00_N0_M01Adapt<MPerBlock, NPerBlock, CMNGridDesc>(
+        return BlockToCTileMap_KSplit_M00_N0_M01Adapt<MPerBlock, NPerBlock, CGridDesc>(
             c_m_n_grid_desc, 8, KBatch);
     }
 
@@ -262,24 +478,25 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
                        Number<CShuffleNRepeatPerShuffle * NWave * NPerXDL>{}));
     }
 
-    using CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
-        decltype(MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(CMNGridDesc{}));
-    using CBlockClusterAdaptor = decltype(MakeCBlockClusterAdaptor(CMNGridDesc{}, 1, 1, 1));
-
-    template <bool HasMainKBlockLoop>
-    __device__ static void Run(const FloatAB* __restrict__ p_a_grid,
-                               const FloatAB* __restrict__ p_b_grid,
-                               FloatC* __restrict__ p_c_grid,
-                               void* __restrict__ p_shared_block,
-                               const AGridDesc_B_K0_M_K1& a_b_k0_m_k1_grid_desc,
-                               const BGridDesc_B_K0_N_K1& b_b_k0_n_k1_grid_desc,
-                               const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock&
-                                   c_grid_desc_mblock_mperblock_nblock_nperblock,
-                               const AElementwiseOperation& a_element_op,
-                               const BElementwiseOperation& b_element_op,
-                               const CElementwiseOperation& c_element_op,
-                               const CBlockClusterAdaptor& c_block_cluster_adaptor)
+    template <bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation>
+    __device__ static void Run(const Argument& karg, void* __restrict__ p_shared_block)
     {
+        const FloatAB* p_a_grid          = karg.p_a_grid;
+        const FloatAB* p_b_grid          = karg.p_b_grid;
+        FloatC* p_c_grid                 = karg.p_c_grid;
+        const auto a_b_k0_m_k1_grid_desc = MakeAGridDescriptor_KBatch_K0_M_K1(
+            karg.M, karg.MPadded, karg.K, karg.StrideA, karg.k_batch, karg.K0, karg.KPadded);
+        const auto b_b_k0_n_k1_grid_desc = MakeBGridDescriptor_KBatch_K0_N_K1(
+            karg.K, karg.NPadded, karg.N, karg.StrideB, karg.k_batch, karg.K0, karg.KPadded);
+        const auto c_grid_desc_m_n =
+            MakeCGridDescriptor_M_N(karg.M, karg.N, karg.MPadded, karg.NPadded, karg.StrideC);
+
+        const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
+            MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(c_grid_desc_m_n);
+        const AElementwiseOperation a_element_op = AElementwiseOperation{};
+        const BElementwiseOperation b_element_op = BElementwiseOperation{};
+        const CElementwiseOperation c_element_op = CElementwiseOperation{};
+
         const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_a_grid, a_b_k0_m_k1_grid_desc.GetElementSpaceSize());
         const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
@@ -289,26 +506,16 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
 
         const auto K0 = a_b_k0_m_k1_grid_desc.GetLength(I1);
 
-        // divide block work by [M, N]
-        const auto block_work_idx =
-            c_block_cluster_adaptor.CalculateBottomIndex(make_multi_index(get_block_1d_id()));
-
-        if(!c_block_cluster_adaptor.ValidCTileIndex(
-               make_tuple(block_work_idx[I1], block_work_idx[I2]),
-               make_tuple(c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0),
-                          c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2))))
-        {
-            return;
-        }
-
-        const index_t k_batch_id = block_work_idx[I0];
+        const index_t block_m_id = __builtin_amdgcn_readfirstlane(blockIdx.y);
+        const index_t block_n_id = __builtin_amdgcn_readfirstlane(blockIdx.x);
+        const index_t k_batch_id = __builtin_amdgcn_readfirstlane(blockIdx.z);
 
         // HACK: this force m/n_block_data_idx_on_grid into SGPR
         const index_t m_block_data_idx_on_grid =
-            __builtin_amdgcn_readfirstlane(block_work_idx[I1] * MPerBlock);
+            __builtin_amdgcn_readfirstlane(block_m_id * MPerBlock);
 
         const index_t n_block_data_idx_on_grid =
-            __builtin_amdgcn_readfirstlane(block_work_idx[I2] * NPerBlock);
+            __builtin_amdgcn_readfirstlane(block_n_id * NPerBlock);
 
         // lds max alignment
         constexpr auto max_lds_align = K1;
@@ -444,7 +651,6 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
         //     c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
         //       register
         // sanity check
-
         auto blockwise_gemm =
             BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
                                                                 FloatAB,
@@ -647,7 +853,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
                 {c_block_desc_mblock_mperblock_nblock_nperblock,
                  make_multi_index(0, 0, 0, 0),
                  c_grid_desc_mblock_mperblock_nblock_nperblock,
-                 make_multi_index(block_work_idx[I1], 0, block_work_idx[I2], 0),
+                 make_multi_index(block_m_id, 0, block_n_id, 0),
                  c_element_op};
 
             constexpr auto mxdlperwave_forward_step =
@@ -716,6 +922,48 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
             });
         }
     }
+
+    template <typename Layout>
+    struct LStr
+    {
+        static std::string Get() { return ""; }
+    };
+
+    template <>
+    struct LStr<ck::tensor_layout::gemm::RowMajor>
+    {
+        static std::string Get() { return "R"; }
+    };
+
+    template <>
+    struct LStr<ck::tensor_layout::gemm::ColumnMajor>
+    {
+        static std::string Get() { return "C"; }
+    };
+
+    static std::string GetTypeString()
+    {
+        auto str = std::stringstream();
+
+        // clang-format off
+        str << "GemmXdlSplitKCShuffle_"
+            << getGemmSpecializationString(GemmSpec) << "_"
+            << std::string(ALayout::name)[0]
+            << std::string(BLayout::name)[0]
+            << std::string(CLayout::name)[0]
+            << "_"
+            << "B" << BlockSize << "_"
+            << "Vec" << ABlockTransferSrcScalarPerVector << "x"
+            << BBlockTransferSrcScalarPerVector << "x"
+            << CBlockTransferScalarPerVector_NWaveNPerXDL << "_"
+            << MPerBlock << "x"
+            << NPerBlock << "x"
+            << K0PerBlock << "x"
+            << K1 ;
+        // clang-format on
+
+        return str.str();
+    }
 };
 
 } // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp
index 9b5ff4048..c4680db83 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp
@@ -26,7 +26,8 @@ using S = ck::Sequence<Is...>;
 
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
-static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+// static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
 
 // Compilation parameters for a[m, k] * b[k, n] = c[m, n]
 using device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances = std::tuple<
@@ -35,14 +36,22 @@ using device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances = std::tuple<
         //#########################| Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
         //#########################|     |      |      |        |        |        |        |   Operation|   Operation|   Operation|              |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
         //#########################|     |      |      |        |        |        |        |            |            |            |              |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
-        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,   256,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8>,
-        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,   128,   256,     4,  8,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8>,
-        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   128,   128,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,      true,           1,           1,                   S<1, 16, 1, 8>,               8>,
-        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,   128,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8>,
-        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   128,   128,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 32, 1, 4>,               8>,
-        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   128,    64,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,      true,           1,           1,                   S<1, 16, 1, 8>,               8>,
-        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,   128,    64,     4,  8,   32,   32,    2,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,      true,           1,           1,                   S<1, 16, 1, 4>,               8>,
-        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,    64,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8>
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,   256,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,   128,   256,     4,  8,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,   128,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,      true,           1,           1,                   S<1, 16, 1, 8>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,    64,   192,     4,  8,   32,   32,    1,    3,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 48, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,   192,    64,     4,  8,   32,   32,    3,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,   128,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,   128,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 32, 1, 4>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,    64,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,      true,           1,           1,                   S<1, 16, 1, 8>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,   128,    64,     4,  8,   32,   32,    2,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,      true,           1,           1,                   S<1, 16, 1, 4>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,    64,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,    32,   192,     4,  8,   32,   32,    1,    3,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 24, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              8,              8,      true,           1,           1,                   S<1, 16, 1, 8>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,   192,    32,     4,  8,   32,   32,    3,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,      true,           1,           1,                   S<1, 32, 1, 4>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,    32,    64,     4,  8,   32,   32,    1,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 16, 1, 8>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,    64,    32,     4,  8,   32,   32,    1,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,      true,           1,           1,                   S<1, 32, 1, 4>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,    32,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,      true,           1,           1,                   S<1, 16, 1, 8>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,      true,           1,           1,                   S<1, 32, 1, 4>,               8>
     // clang-format on
     >;
 
-- 
GitLab


From 3eecbfb6ec231cd8012faceb8b6fbc87199db60d Mon Sep 17 00:00:00 2001
From: rocking <ChunYu.Lai@amd.com>
Date: Mon, 24 Apr 2023 12:40:00 +0800
Subject: [PATCH 26/71] Revise layout of group convolution (#675)

* [What] Remove pure conv int8 instance
[Why] We will never use pure int8 conv in AI; use int8 quantization instead

* Change layout

* Share the kernel parameter

* Support more types of NHWGC for group conv

* Revise client example of conv 2d, use NHWGC layout

* Add instance to cmake

* Revise layout of group conv quantization instance

* Revise layout of external api of group conv quantization

* Revise layout of group conv quantization client example

* Fix clang format

* Add comment to describe meaning of each parameter
---
 .../grouped_conv2d_fwd.cpp                    |  74 +++-----
 ..._fwd_bias_relu_perchannel_quantization.cpp |  42 +++--
 ...2d_fwd_bias_relu_perlayer_quantization.cpp |  26 +--
 ..._fwd_bias_tanh_perchannel_quantization.cpp |  28 +--
 ...2d_fwd_bias_tanh_perlayer_quantization.cpp |  26 +--
 .../conv2d_fwd_perchannel_quantization.cpp    |  40 +++--
 .../conv2d_fwd_perlayer_quantization.cpp      |  24 +--
 ...d_bias_perchannel_quantization_example.inc |   6 +-
 ...fwd_bias_perlayer_quantization_example.inc |   6 +-
 ...2d_fwd_perchannel_quantization_example.inc |   6 +-
 ...nv2d_fwd_perlayer_quantization_example.inc |   6 +-
 .../gpu/grouped_convolution_forward.hpp       |  58 +++----
 ...n_bias_forward_perchannel_quantization.hpp |  34 ++--
 ...ion_bias_forward_perlayer_quantization.hpp |  34 ++--
 ...lution_forward_perchannel_quantization.hpp |  22 +--
 ...volution_forward_perlayer_quantization.hpp |  22 +--
 .../gpu/grouped_conv2d_fwd/CMakeLists.txt     |   6 +-
 .../device_grouped_conv2d_fwd_common.hpp      |  53 ++++++
 ..._fwd_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp | 112 ++++--------
 ..._fwd_dl_gnhwc_gkyxc_gnhwk_f32_instance.cpp | 116 ++++---------
 ...fwd_dl_gnhwc_gkyxc_gnhwk_int8_instance.cpp | 104 -----------
 .../device_grouped_conv2d_fwd_dl_instance.hpp |  50 ++++++
 ...wd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp | 162 ++++--------------
 ...fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp | 160 ++++-------------
 ...fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp | 132 ++++----------
 ...wd_xdl_gnhwc_gkyxc_gnhwk_int8_instance.cpp | 125 --------------
 ...device_grouped_conv2d_fwd_xdl_instance.hpp | 105 ++++++++++++
 ...wd_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp |  66 +++++++
 ...fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp | 160 ++++-------------
 ...fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp |  66 +++++++
 .../conv2d_fwd/conv2d_quantization_common.hpp |   4 +-
 ..._perchannel_quantization_int8_instance.cpp |  57 ++++--
 ...as_perlayer_quantization_int8_instance.cpp |  57 ++++--
 .../device_conv2d_dl_int8_instance.hpp        |   7 +-
 ..._perchannel_quantization_int8_instance.cpp |  38 ++--
 ...dl_perlayer_quantization_int8_instance.cpp |  38 ++--
 ..._perchannel_quantization_int8_instance.cpp |  57 ++++--
 ...as_perlayer_quantization_int8_instance.cpp |  57 ++++--
 .../device_conv2d_xdl_int8_instance.hpp       |  39 +++--
 ..._perchannel_quantization_int8_instance.cpp |  38 ++--
 ...dl_perlayer_quantization_int8_instance.cpp |  38 ++--
 41 files changed, 1079 insertions(+), 1222 deletions(-)
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_common.hpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_instance.hpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_int8_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_instance.hpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp

diff --git a/client_example/07_grouped_convnd_fwd/grouped_conv2d_fwd.cpp b/client_example/07_grouped_convnd_fwd/grouped_conv2d_fwd.cpp
index ece6e30c5..0a798be27 100644
--- a/client_example/07_grouped_convnd_fwd/grouped_conv2d_fwd.cpp
+++ b/client_example/07_grouped_convnd_fwd/grouped_conv2d_fwd.cpp
@@ -17,22 +17,22 @@ using InDataType  = ck::half_t;
 using WeiDataType = ck::half_t;
 using OutDataType = ck::half_t;
 
-using InLayout    = ck::tensor_layout::convolution::GNHWC;
+using InLayout    = ck::tensor_layout::convolution::NHWGC;
 using WeiLayout   = ck::tensor_layout::convolution::GKYXC;
-using OutLayout   = ck::tensor_layout::convolution::GNHWK;
+using OutLayout   = ck::tensor_layout::convolution::NHWGK;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
 static constexpr ck::index_t NumDimSpatial = 2;
 static constexpr ck::index_t G             = 32;
-static constexpr ck::index_t N             = 256;
-static constexpr ck::index_t K             = 192;
-static constexpr ck::index_t C             = 192;
-static constexpr ck::index_t Y             = 3;
-static constexpr ck::index_t X             = 3;
-static constexpr ck::index_t Hi            = 28;
-static constexpr ck::index_t Wi            = 28;
-static constexpr ck::index_t Ho            = 28;
-static constexpr ck::index_t Wo            = 28;
+static constexpr ck::index_t N             = 256; // batch size
+static constexpr ck::index_t K             = 64;  // output channel
+static constexpr ck::index_t C             = 32;  // input channel (per group)
+static constexpr ck::index_t Y             = 3;   // filter H
+static constexpr ck::index_t X             = 3;   // filter W
+static constexpr ck::index_t Hi            = 28;  // input H
+static constexpr ck::index_t Wi            = 28;  // input W
+static constexpr ck::index_t Ho            = 28;  // output H
+static constexpr ck::index_t Wo            = 28;  // output W
 
 struct SimpleDeviceMem
 {
@@ -52,50 +52,24 @@ struct SimpleDeviceMem
 
 int main()
 {
-    std::array<ck::index_t, NumDimSpatial + 3> in_lengths{G, N, Hi, Wi, C};
-    std::array<ck::index_t, NumDimSpatial + 3> in_strides{0, 0, 0, 0, 1};
-
-    std::array<ck::index_t, NumDimSpatial + 3> wei_lengths{G, K, Y, X, C};
-    std::array<ck::index_t, NumDimSpatial + 3> wei_strides{0, 0, 0, 0, 1};
-
-    std::array<ck::index_t, NumDimSpatial + 3> out_lengths{G, N, Ho, Wo, K};
-    std::array<ck::index_t, NumDimSpatial + 3> out_strides{0, 0, 0, 0, 1};
-
-    std::partial_sum(rbegin(in_lengths),
-                     std::prev(rend(in_lengths)),
-                     std::next(rbegin(in_strides)),
-                     std::multiplies<>{});
-    std::partial_sum(rbegin(wei_lengths),
-                     std::prev(rend(wei_lengths)),
-                     std::next(rbegin(wei_strides)),
-                     std::multiplies<>{});
-    std::partial_sum(rbegin(out_lengths),
-                     std::prev(rend(out_lengths)),
-                     std::next(rbegin(out_strides)),
-                     std::multiplies<>{});
-
-    // transpose GNHWC/GKYXC/GNHWK to GNCHW/GKCYX/GNCHW
-    std::rotate(
-        rbegin(in_lengths), std::next(rbegin(in_lengths)), std::next(rbegin(in_lengths), 3));
-    std::rotate(
-        rbegin(in_strides), std::next(rbegin(in_strides)), std::next(rbegin(in_strides), 3));
-    std::rotate(
-        rbegin(wei_lengths), std::next(rbegin(wei_lengths)), std::next(rbegin(wei_lengths), 3));
-    std::rotate(
-        rbegin(wei_strides), std::next(rbegin(wei_strides)), std::next(rbegin(wei_strides), 3));
-    std::rotate(
-        rbegin(out_lengths), std::next(rbegin(out_lengths)), std::next(rbegin(out_lengths), 3));
-    std::rotate(
-        rbegin(out_strides), std::next(rbegin(out_strides)), std::next(rbegin(out_strides), 3));
+    // The tensors are laid out as NHWGC/GKYXC/NHWGK (x, weight, y) in memory.
+    // However, CK's API only accepts lengths and strides in GNCHW/GKCYX/GNKHW order.
+    // Hence, we need to reorder the strides accordingly.
+    std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
+    std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
+    std::array<ck::index_t, 5> wei_lengths{G, K, C, Y, X};
+    std::array<ck::index_t, 5> wei_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
+    std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
+    std::array<ck::index_t, 5> out_strides{K, Ho * Wo * G * K, 1, Wo * G * K, G * K};
 
     std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1};
     std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1};
     std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
     std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};
 
-    SimpleDeviceMem in(sizeof(InDataType) * G * N * Hi * Wi * C);
+    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
     SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
-    SimpleDeviceMem out(sizeof(OutDataType) * G * N * Ho * Wo * K);
+    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
 
     using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
                                                                                  InLayout,
@@ -155,9 +129,9 @@ int main()
             float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
 
             std::size_t flop      = std::size_t(2) * G * N * K * C * Ho * Wo * Y * X;
-            std::size_t num_bytes = sizeof(InDataType) * G * N * Hi * Wi * C +
+            std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * G * C +
                                     sizeof(WeiDataType) * G * K * Y * X * C +
-                                    sizeof(OutDataType) * G * N * Ho * Wo * K;
+                                    sizeof(OutDataType) * N * Ho * Wo * G * K;
 
             float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
             float gb_per_sec = num_bytes / 1.E6 / avg_time;
diff --git a/client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp b/client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp
index a10dd3e00..43a4779f5 100644
--- a/client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp
@@ -17,26 +17,26 @@ using BiasDataType         = int32_t;
 using RequantScaleDataType = float;
 using OutDataType          = int8_t;
 
-using InLayout           = ck::tensor_layout::convolution::GNHWC;
+using InLayout           = ck::tensor_layout::convolution::NHWGC;
 using WeiLayout          = ck::tensor_layout::convolution::GKYXC;
 using BiasLayout         = ck::tensor_layout::convolution::G_K;
 using RequantScaleLayout = ck::tensor_layout::convolution::G_K;
-using OutLayout          = ck::tensor_layout::convolution::GNHWK;
+using OutLayout          = ck::tensor_layout::convolution::NHWGK;
 using PassThrough        = ck::tensor_operation::element_wise::PassThrough;
 using ActivationOp       = ck::tensor_operation::element_wise::Relu;
 using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul2_Clamp<ActivationOp>;
 
 static constexpr ck::index_t NumDimSpatial = 2;
-static constexpr ck::index_t G             = 1;
-static constexpr ck::index_t N             = 4;   // batch size
-static constexpr ck::index_t K             = 64;  // output channel
-static constexpr ck::index_t C             = 192; // input channel
-static constexpr ck::index_t Y             = 3;   // filter H
-static constexpr ck::index_t X             = 3;   // filter W
-static constexpr ck::index_t Hi            = 71;  // input H
-static constexpr ck::index_t Wi            = 71;  // input W
-static constexpr ck::index_t Ho            = 36;  // output H
-static constexpr ck::index_t Wo            = 36;  // output W
+static constexpr ck::index_t G             = 4;
+static constexpr ck::index_t N             = 4;  // batch size
+static constexpr ck::index_t K             = 32; // output channel
+static constexpr ck::index_t C             = 64; // input channel (per group)
+static constexpr ck::index_t Y             = 3;  // filter H
+static constexpr ck::index_t X             = 3;  // filter W
+static constexpr ck::index_t Hi            = 71; // input H
+static constexpr ck::index_t Wi            = 71; // input W
+static constexpr ck::index_t Ho            = 36; // output H
+static constexpr ck::index_t Wo            = 36; // output W
 struct SimpleDeviceMem
 {
     SimpleDeviceMem() = delete;
@@ -55,8 +55,11 @@ struct SimpleDeviceMem
 
 int main(int argc, char* argv[])
 {
+    // The tensors are laid out as NHWGC/GKYXC/NHWGK (x, weight, y) in memory.
+    // However, CK's API only accepts lengths and strides in GNCHW/GKCYX/GNKHW order.
+    // Hence, we need to reorder the strides accordingly.
     std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
-    std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C};
+    std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
     std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
     std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
     std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo};
@@ -64,17 +67,18 @@ int main(int argc, char* argv[])
     std::array<ck::index_t, 5> requant_scale_lengths{G, N, K, Ho, Wo};
     std::array<ck::index_t, 5> requant_scale_strides{K, 0, 1, 0, 0};
     std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
-    std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K};
+    std::array<ck::index_t, 5> out_strides{K, Ho * Wo * G * K, 1, Wo * G * K, G * K};
+
     std::array<ck::index_t, 2> in_left_pad{1, 1};
     std::array<ck::index_t, 2> in_right_pad{1, 1};
     std::array<ck::index_t, 2> conv_strides{2, 2};
     std::array<ck::index_t, 2> conv_dilations{1, 1};
 
-    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
-    SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
-    SimpleDeviceMem bias(sizeof(BiasDataType) * K * Y * X * C);
-    SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * K);
-    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
+    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
+    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
+    SimpleDeviceMem bias(sizeof(BiasDataType) * G * K);
+    SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * G * K);
+    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
 
     using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
         NumDimSpatial,
diff --git a/client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp b/client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp
index b8e6a493e..2ff91fe96 100644
--- a/client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp
@@ -16,19 +16,19 @@ using WeiDataType  = int8_t;
 using BiasDataType = int32_t;
 using OutDataType  = int8_t;
 
-using InLayout     = ck::tensor_layout::convolution::GNHWC;
+using InLayout     = ck::tensor_layout::convolution::NHWGC;
 using WeiLayout    = ck::tensor_layout::convolution::GKYXC;
 using BiasLayout   = ck::tensor_layout::convolution::G_K;
-using OutLayout    = ck::tensor_layout::convolution::GNHWK;
+using OutLayout    = ck::tensor_layout::convolution::NHWGK;
 using PassThrough  = ck::tensor_operation::element_wise::PassThrough;
 using ActivationOp = ck::tensor_operation::element_wise::Relu;
 using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp<ActivationOp>;
 
 static constexpr ck::index_t NumDimSpatial = 2;
-static constexpr ck::index_t G             = 1;
+static constexpr ck::index_t G             = 4;
 static constexpr ck::index_t N             = 4;    // batch size
-static constexpr ck::index_t K             = 64;   // output channel
-static constexpr ck::index_t C             = 192;  // input channel
+static constexpr ck::index_t K             = 32;   // output channel
+static constexpr ck::index_t C             = 64;   // input channel (per group)
 static constexpr ck::index_t Y             = 3;    // filter H
 static constexpr ck::index_t X             = 3;    // filter W
 static constexpr ck::index_t Hi            = 71;   // input H
@@ -55,23 +55,27 @@ struct SimpleDeviceMem
 
 int main(int argc, char* argv[])
 {
+    // The tensors are laid out as NHWGC/GKYXC/NHWGK (x, weight, y) in memory.
+    // However, CK's API only accepts lengths and strides in GNCHW/GKCYX/GNKHW order.
+    // Hence, we need to reorder the strides accordingly.
     std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
-    std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C};
+    std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
     std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
     std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
     std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo};
     std::array<ck::index_t, 5> bias_strides{K, 0, 1, 0, 0};
     std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
-    std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K};
+    std::array<ck::index_t, 5> out_strides{K, Ho * Wo * G * K, 1, Wo * G * K, G * K};
+
     std::array<ck::index_t, 2> in_left_pad{1, 1};
     std::array<ck::index_t, 2> in_right_pad{1, 1};
     std::array<ck::index_t, 2> conv_strides{2, 2};
     std::array<ck::index_t, 2> conv_dilations{1, 1};
 
-    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
-    SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
-    SimpleDeviceMem bias(sizeof(BiasDataType) * K * Y * X * C);
-    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
+    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
+    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
+    SimpleDeviceMem bias(sizeof(BiasDataType) * G * K);
+    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
 
     using DeviceOp =
         ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
diff --git a/client_example/09_quantization/conv2d_fwd_bias_tanh_perchannel_quantization.cpp b/client_example/09_quantization/conv2d_fwd_bias_tanh_perchannel_quantization.cpp
index a0e1865d3..6ea5dd223 100644
--- a/client_example/09_quantization/conv2d_fwd_bias_tanh_perchannel_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_bias_tanh_perchannel_quantization.cpp
@@ -17,21 +17,21 @@ using BiasDataType         = int32_t;
 using RequantScaleDataType = float;
 using OutDataType          = int8_t;
 
-using InLayout           = ck::tensor_layout::convolution::GNHWC;
+using InLayout           = ck::tensor_layout::convolution::NHWGC;
 using WeiLayout          = ck::tensor_layout::convolution::GKYXC;
 using BiasLayout         = ck::tensor_layout::convolution::G_K;
 using RequantScaleLayout = ck::tensor_layout::convolution::G_K;
-using OutLayout          = ck::tensor_layout::convolution::GNHWK;
+using OutLayout          = ck::tensor_layout::convolution::NHWGK;
 using PassThrough        = ck::tensor_operation::element_wise::PassThrough;
 using ActivationOp       = ck::tensor_operation::element_wise::TanH;
 using OutElementOp =
     ck::tensor_operation::element_wise::Add_Mul2_Activation_Mul_Clamp<ActivationOp>;
 
 static constexpr ck::index_t NumDimSpatial = 2;
-static constexpr ck::index_t G             = 1;
+static constexpr ck::index_t G             = 4;
 static constexpr ck::index_t N             = 4;    // batch size
-static constexpr ck::index_t K             = 64;   // output channel
-static constexpr ck::index_t C             = 192;  // input channel
+static constexpr ck::index_t K             = 32;   // output channel
+static constexpr ck::index_t C             = 64;   // input channel (per group)
 static constexpr ck::index_t Y             = 3;    // filter H
 static constexpr ck::index_t X             = 3;    // filter W
 static constexpr ck::index_t Hi            = 71;   // input H
@@ -58,8 +58,11 @@ struct SimpleDeviceMem
 
 int main(int argc, char* argv[])
 {
+    // The tensors are laid out as NHWGC/GKYXC/NHWGK (x, weight, y) in memory.
+    // However, CK's API only accepts lengths and strides in GNCHW/GKCYX/GNKHW order.
+    // Hence, we need to reorder the strides accordingly.
     std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
-    std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C};
+    std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
     std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
     std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
     std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo};
@@ -67,17 +70,18 @@ int main(int argc, char* argv[])
     std::array<ck::index_t, 5> requant_scale_lengths{G, N, K, Ho, Wo};
     std::array<ck::index_t, 5> requant_scale_strides{K, 0, 1, 0, 0};
     std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
-    std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K};
+    std::array<ck::index_t, 5> out_strides{K, Ho * Wo * G * K, 1, Wo * G * K, G * K};
+
     std::array<ck::index_t, 2> in_left_pad{1, 1};
     std::array<ck::index_t, 2> in_right_pad{1, 1};
     std::array<ck::index_t, 2> conv_strides{2, 2};
     std::array<ck::index_t, 2> conv_dilations{1, 1};
 
-    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
-    SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
-    SimpleDeviceMem bias(sizeof(BiasDataType) * K * Y * X * C);
-    SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * K);
-    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
+    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
+    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
+    SimpleDeviceMem bias(sizeof(BiasDataType) * G * K);
+    SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * G * K);
+    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
 
     using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
         NumDimSpatial,
diff --git a/client_example/09_quantization/conv2d_fwd_bias_tanh_perlayer_quantization.cpp b/client_example/09_quantization/conv2d_fwd_bias_tanh_perlayer_quantization.cpp
index 7637f5c78..33407c9a1 100644
--- a/client_example/09_quantization/conv2d_fwd_bias_tanh_perlayer_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_bias_tanh_perlayer_quantization.cpp
@@ -16,19 +16,19 @@ using WeiDataType  = int8_t;
 using BiasDataType = int32_t;
 using OutDataType  = int8_t;
 
-using InLayout     = ck::tensor_layout::convolution::GNHWC;
+using InLayout     = ck::tensor_layout::convolution::NHWGC;
 using WeiLayout    = ck::tensor_layout::convolution::GKYXC;
 using BiasLayout   = ck::tensor_layout::convolution::G_K;
-using OutLayout    = ck::tensor_layout::convolution::GNHWK;
+using OutLayout    = ck::tensor_layout::convolution::NHWGK;
 using PassThrough  = ck::tensor_operation::element_wise::PassThrough;
 using ActivationOp = ck::tensor_operation::element_wise::TanH;
 using OutElementOp = ck::tensor_operation::element_wise::Add_Mul_Activation_Mul_Clamp<ActivationOp>;
 
 static constexpr ck::index_t NumDimSpatial = 2;
-static constexpr ck::index_t G             = 1;
+static constexpr ck::index_t G             = 4;
 static constexpr ck::index_t N             = 4;    // batch size
-static constexpr ck::index_t K             = 64;   // output channel
-static constexpr ck::index_t C             = 192;  // input channel
+static constexpr ck::index_t K             = 32;   // output channel
+static constexpr ck::index_t C             = 64;   // input channel (per group)
 static constexpr ck::index_t Y             = 3;    // filter H
 static constexpr ck::index_t X             = 3;    // filter W
 static constexpr ck::index_t Hi            = 71;   // input H
@@ -56,23 +56,27 @@ struct SimpleDeviceMem
 
 int main(int argc, char* argv[])
 {
+    // The tensors are laid out as NHWGC/GKYXC/NHWGK (x, weight, y) in memory.
+    // However, CK's API only accepts lengths and strides in GNCHW/GKCYX/GNKHW order.
+    // Hence, we need to reorder the strides accordingly.
     std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
-    std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C};
+    std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
     std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
     std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
     std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo};
     std::array<ck::index_t, 5> bias_strides{K, 0, 1, 0, 0};
     std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
-    std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K};
+    std::array<ck::index_t, 5> out_strides{K, Ho * Wo * G * K, 1, Wo * G * K, G * K};
+
     std::array<ck::index_t, 2> in_left_pad{1, 1};
     std::array<ck::index_t, 2> in_right_pad{1, 1};
     std::array<ck::index_t, 2> conv_strides{2, 2};
     std::array<ck::index_t, 2> conv_dilations{1, 1};
 
-    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
-    SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
-    SimpleDeviceMem bias(sizeof(BiasDataType) * K * Y * X * C);
-    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
+    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
+    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
+    SimpleDeviceMem bias(sizeof(BiasDataType) * G * K);
+    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
 
     using DeviceOp =
         ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
diff --git a/client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp b/client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp
index 6439c22e7..6f5112470 100644
--- a/client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp
@@ -16,25 +16,25 @@ using WeiDataType          = int8_t;
 using RequantScaleDataType = float;
 using OutDataType          = int8_t;
 
-using InLayout           = ck::tensor_layout::convolution::GNHWC;
+using InLayout           = ck::tensor_layout::convolution::NHWGC;
 using WeiLayout          = ck::tensor_layout::convolution::GKYXC;
 using RequantScaleLayout = ck::tensor_layout::convolution::G_K;
-using OutLayout          = ck::tensor_layout::convolution::GNHWK;
+using OutLayout          = ck::tensor_layout::convolution::NHWGK;
 using PassThrough        = ck::tensor_operation::element_wise::PassThrough;
 using ActivationOp       = PassThrough;
 using OutElementOp       = ck::tensor_operation::element_wise::Activation_Mul2_Clamp<ActivationOp>;
 
 static constexpr ck::index_t NumDimSpatial = 2;
-static constexpr ck::index_t G             = 1;
-static constexpr ck::index_t N             = 4;   // batch size
-static constexpr ck::index_t K             = 64;  // output channel
-static constexpr ck::index_t C             = 192; // input channel
-static constexpr ck::index_t Y             = 3;   // filter H
-static constexpr ck::index_t X             = 3;   // filter W
-static constexpr ck::index_t Hi            = 71;  // input H
-static constexpr ck::index_t Wi            = 71;  // input W
-static constexpr ck::index_t Ho            = 36;  // output H
-static constexpr ck::index_t Wo            = 36;  // output W
+static constexpr ck::index_t G             = 4;
+static constexpr ck::index_t N             = 4;  // batch size
+static constexpr ck::index_t K             = 32; // output channel
+static constexpr ck::index_t C             = 64; // input channel (per group)
+static constexpr ck::index_t Y             = 3;  // filter H
+static constexpr ck::index_t X             = 3;  // filter W
+static constexpr ck::index_t Hi            = 71; // input H
+static constexpr ck::index_t Wi            = 71; // input W
+static constexpr ck::index_t Ho            = 36; // output H
+static constexpr ck::index_t Wo            = 36; // output W
 
 struct SimpleDeviceMem
 {
@@ -54,23 +54,27 @@ struct SimpleDeviceMem
 
 int main(int argc, char* argv[])
 {
+    // The tensors are laid out as NHWGC/GKYXC/NHWGK (x, weight, y) in memory.
+    // However, CK's API only accepts lengths and strides in GNCHW/GKCYX/GNKHW order.
+    // Hence, we need to reorder the strides accordingly.
     std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
-    std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C};
+    std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
     std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
     std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
     std::array<ck::index_t, 5> requant_scale_lengths{G, N, K, Ho, Wo};
     std::array<ck::index_t, 5> requant_scale_strides{K, 0, 1, 0, 0};
     std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
-    std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K};
+    std::array<ck::index_t, 5> out_strides{K, Ho * Wo * G * K, 1, Wo * G * K, G * K};
+
     std::array<ck::index_t, 2> in_left_pad{1, 1};
     std::array<ck::index_t, 2> in_right_pad{1, 1};
     std::array<ck::index_t, 2> conv_strides{2, 2};
     std::array<ck::index_t, 2> conv_dilations{1, 1};
 
-    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
-    SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
-    SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * K);
-    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
+    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
+    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
+    SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * G * K);
+    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
 
     using DeviceOp =
         ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
diff --git a/client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp b/client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp
index f7c46a95f..6a11f9fc2 100644
--- a/client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp
@@ -15,18 +15,18 @@ using InDataType  = int8_t;
 using WeiDataType = int8_t;
 using OutDataType = int8_t;
 
-using InLayout     = ck::tensor_layout::convolution::GNHWC;
+using InLayout     = ck::tensor_layout::convolution::NHWGC;
 using WeiLayout    = ck::tensor_layout::convolution::GKYXC;
-using OutLayout    = ck::tensor_layout::convolution::GNHWK;
+using OutLayout    = ck::tensor_layout::convolution::NHWGK;
 using PassThrough  = ck::tensor_operation::element_wise::PassThrough;
 using ActivationOp = PassThrough;
 using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<ActivationOp>;
 
 static constexpr ck::index_t NumDimSpatial = 2;
-static constexpr ck::index_t G             = 1;
+static constexpr ck::index_t G             = 4;
 static constexpr ck::index_t N             = 4;    // batch size
-static constexpr ck::index_t K             = 64;   // output channel
-static constexpr ck::index_t C             = 192;  // input channel
+static constexpr ck::index_t K             = 32;   // output channel
+static constexpr ck::index_t C             = 64;   // input channel (per group)
 static constexpr ck::index_t Y             = 3;    // filter H
 static constexpr ck::index_t X             = 3;    // filter W
 static constexpr ck::index_t Hi            = 71;   // input H
@@ -53,20 +53,24 @@ struct SimpleDeviceMem
 
 int main(int argc, char* argv[])
 {
+    // The tensors are laid out as NHWGC/GKYXC/NHWGK (x, weight, y) in memory.
+    // However, CK's API only accepts lengths and strides in GNCHW/GKCYX/GNKHW order.
+    // Hence, we need to reorder the strides accordingly.
     std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
-    std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C};
+    std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
     std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
     std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
     std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
-    std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K};
+    std::array<ck::index_t, 5> out_strides{K, Ho * Wo * G * K, 1, Wo * G * K, G * K};
+
     std::array<ck::index_t, 2> in_left_pad{1, 1};
     std::array<ck::index_t, 2> in_right_pad{1, 1};
     std::array<ck::index_t, 2> conv_strides{2, 2};
     std::array<ck::index_t, 2> conv_dilations{1, 1};
 
-    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
-    SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
-    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
+    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
+    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
+    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
 
     using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
                                                                                  InLayout,
diff --git a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perchannel_quantization_example.inc b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perchannel_quantization_example.inc
index 1587c614d..5675db77f 100644
--- a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perchannel_quantization_example.inc
+++ b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perchannel_quantization_example.inc
@@ -190,11 +190,11 @@ int run_conv2d_fwd_bias_perchannel_quantization_example(const OutElementOp& out_
     const auto in_element_op  = InElementOp{};
     const auto wei_element_op = WeiElementOp{};
 
-    using InLayout           = ck::tensor_layout::convolution::GNHWC;
-    using WeiLayout          = ck::tensor_layout::convolution::GKYXC;
+    using InLayout           = ck::tensor_layout::convolution::NHWGC;
+    using WeiLayout          = ck::tensor_layout::convolution::KYXGC;
     using BiasLayout         = ck::tensor_layout::convolution::G_K;
     using RequantScaleLayout = ck::tensor_layout::convolution::G_K;
-    using OutLayout          = ck::tensor_layout::convolution::GNHWK;
+    using OutLayout          = ck::tensor_layout::convolution::NHWGK;
 
     const auto in_g_n_c_wis_desc =
         ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
diff --git a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perlayer_quantization_example.inc b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perlayer_quantization_example.inc
index 455e0804d..9fd19c1c4 100644
--- a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perlayer_quantization_example.inc
+++ b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perlayer_quantization_example.inc
@@ -178,10 +178,10 @@ int run_conv2d_fwd_bias_perlayer_quantization_example(const OutElementOp& out_el
     const auto in_element_op  = InElementOp{};
     const auto wei_element_op = WeiElementOp{};
 
-    using InLayout   = ck::tensor_layout::convolution::GNHWC;
-    using WeiLayout  = ck::tensor_layout::convolution::GKYXC;
+    using InLayout   = ck::tensor_layout::convolution::NHWGC;
+    using WeiLayout  = ck::tensor_layout::convolution::KYXGC;
     using BiasLayout = ck::tensor_layout::convolution::G_K;
-    using OutLayout  = ck::tensor_layout::convolution::GNHWK;
+    using OutLayout  = ck::tensor_layout::convolution::NHWGK;
 
     const auto in_g_n_c_wis_desc =
         ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
diff --git a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc
index 8e75c2774..cacedfdad 100644
--- a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc
+++ b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc
@@ -180,10 +180,10 @@ int run_conv2d_fwd_perchannel_quantization_example(const OutElementOp& out_eleme
     const auto in_element_op  = InElementOp{};
     const auto wei_element_op = WeiElementOp{};
 
-    using InLayout           = ck::tensor_layout::convolution::GNHWC;
-    using WeiLayout          = ck::tensor_layout::convolution::GKYXC;
+    using InLayout           = ck::tensor_layout::convolution::NHWGC;
+    using WeiLayout          = ck::tensor_layout::convolution::KYXGC;
     using RequantScaleLayout = ck::tensor_layout::convolution::G_K;
-    using OutLayout          = ck::tensor_layout::convolution::GNHWK;
+    using OutLayout          = ck::tensor_layout::convolution::NHWGK;
 
     const auto in_g_n_c_wis_desc =
         ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
diff --git a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc
index 926c033c5..77332cb6d 100644
--- a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc
+++ b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc
@@ -162,9 +162,9 @@ int run_conv2d_fwd_perlayer_quantization_example(const OutElementOp& out_element
     const auto in_element_op  = InElementOp{};
     const auto wei_element_op = WeiElementOp{};
 
-    using InLayout  = ck::tensor_layout::convolution::GNHWC;
-    using WeiLayout = ck::tensor_layout::convolution::GKYXC;
-    using OutLayout = ck::tensor_layout::convolution::GNHWK;
+    using InLayout  = ck::tensor_layout::convolution::NHWGC;
+    using WeiLayout = ck::tensor_layout::convolution::KYXGC;
+    using OutLayout = ck::tensor_layout::convolution::NHWGK;
 
     const auto in_g_n_c_wis_desc =
         ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
index a8df7f0d5..175932e63 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
@@ -117,20 +117,6 @@ void add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instances(
                                                               PassThrough,
                                                               PassThrough>>>& instances);
 
-void add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
-                                                              GKYXC,
-                                                              Empty_Tuple,
-                                                              GNHWK,
-                                                              int8_t,
-                                                              int8_t,
-                                                              Empty_Tuple,
-                                                              int8_t,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
-
 void add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
                                                               GNHWC,
@@ -159,20 +145,21 @@ void add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instances(
                                                               PassThrough,
                                                               PassThrough>>>& instances);
 
-void add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_instances(
+// grouped conv2d forward, NHWGC/GKYXC/NHWGK
+void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
+                                                              NHWGC,
                                                               GKYXC,
                                                               Empty_Tuple,
-                                                              GNHWK,
-                                                              int8_t,
-                                                              int8_t,
+                                                              NHWGK,
+                                                              BF16,
+                                                              BF16,
                                                               Empty_Tuple,
-                                                              int8_t,
+                                                              BF16,
                                                               PassThrough,
                                                               PassThrough,
                                                               PassThrough>>>& instances);
-// grouped conv2d forward, NHWGC/GKYXC/NHWGK
+
 void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
                                                               NHWGC,
@@ -187,6 +174,20 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instances(
                                                               PassThrough,
                                                               PassThrough>>>& instances);
 
+void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
+                                                              NHWGC,
+                                                              GKYXC,
+                                                              Empty_Tuple,
+                                                              NHWGK,
+                                                              F32,
+                                                              F32,
+                                                              Empty_Tuple,
+                                                              F32,
+                                                              PassThrough,
+                                                              PassThrough,
+                                                              PassThrough>>>& instances);
+
 // grouped conv3d forward, GNDHWC/GKZYXC/GNDHWK
 void add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
@@ -385,12 +386,6 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
             {
                 add_device_grouped_conv1d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instances(op_ptrs);
             }
-            else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
-                              is_same_v<OutDataType, int8_t>)
-            {
-                add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_int8_instances(op_ptrs);
-                add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_instances(op_ptrs);
-            }
         }
         else if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, NHWGC> &&
                           is_same_v<WeiLayout, GKYXC> && is_same_v<OutLayout, NHWGK>)
@@ -398,7 +393,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
             if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
                          is_same_v<OutDataType, float>)
             {
-                // no instance
+                add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instances(op_ptrs);
             }
             else if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
                               is_same_v<OutDataType, half_t>)
@@ -409,12 +404,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                               is_same_v<WeiDataType, ck::bhalf_t> &&
                               is_same_v<OutDataType, ck::bhalf_t>)
             {
-                // no instance
-            }
-            else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
-                              is_same_v<OutDataType, int8_t>)
-            {
-                // no instance
+                add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instances(op_ptrs);
             }
         }
         else if constexpr(NumDimSpatial == 3 && is_same_v<InLayout, GNDHWC> &&
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp
index 793dc8d04..daec48050 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp
@@ -17,14 +17,14 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-// grouped conv2d forward, GNHWC/GKYXC/GNHWK
+// grouped conv2d forward, NHWGC/GKYXC/NHWGK
 void add_device_conv2d_dl_bias_perchannel_quantization_int8_instances(
     std::vector<
         std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                      GNHWC,
+                                                      NHWGC,
                                                       GKYXC,
                                                       GK_GK_Tuple,
-                                                      GNHWK,
+                                                      NHWGK,
                                                       int8_t,
                                                       int8_t,
                                                       I32_F32_Tuple,
@@ -36,10 +36,10 @@ void add_device_conv2d_dl_bias_perchannel_quantization_int8_instances(
 
 void add_device_conv2d_dl_bias_relu_perchannel_quantization_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
+                                                              NHWGC,
                                                               GKYXC,
                                                               GK_GK_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                               int8_t,
                                                               int8_t,
                                                               I32_F32_Tuple,
@@ -52,10 +52,10 @@ void add_device_conv2d_dl_bias_relu_perchannel_quantization_int8_instances(
 void add_device_conv2d_dl_bias_tanh_perchannel_quantization_int8_instances(
     std::vector<
         std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                      GNHWC,
+                                                      NHWGC,
                                                       GKYXC,
                                                       GK_GK_Tuple,
-                                                      GNHWK,
+                                                      NHWGK,
                                                       int8_t,
                                                       int8_t,
                                                       I32_F32_Tuple,
@@ -68,10 +68,10 @@ void add_device_conv2d_dl_bias_tanh_perchannel_quantization_int8_instances(
 void add_device_conv2d_xdl_bias_perchannel_quantization_int8_instances(
     std::vector<
         std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                      GNHWC,
+                                                      NHWGC,
                                                       GKYXC,
                                                       GK_GK_Tuple,
-                                                      GNHWK,
+                                                      NHWGK,
                                                       int8_t,
                                                       int8_t,
                                                       I32_F32_Tuple,
@@ -83,10 +83,10 @@ void add_device_conv2d_xdl_bias_perchannel_quantization_int8_instances(
 
 void add_device_conv2d_xdl_bias_relu_perchannel_quantization_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
+                                                              NHWGC,
                                                               GKYXC,
                                                               GK_GK_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                               int8_t,
                                                               int8_t,
                                                               I32_F32_Tuple,
@@ -99,10 +99,10 @@ void add_device_conv2d_xdl_bias_relu_perchannel_quantization_int8_instances(
 void add_device_conv2d_xdl_bias_tanh_perchannel_quantization_int8_instances(
     std::vector<
         std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                      GNHWC,
+                                                      NHWGC,
                                                       GKYXC,
                                                       GK_GK_Tuple,
-                                                      GNHWK,
+                                                      NHWGK,
                                                       int8_t,
                                                       int8_t,
                                                       I32_F32_Tuple,
@@ -154,9 +154,9 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
     {
         std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
 
-        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, GNHWC> &&
+        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, NHWGC> &&
                      is_same_v<WeiLayout, GKYXC> && is_same_v<DsLayout, GK_GK_Tuple> &&
-                     is_same_v<OutLayout, GNHWK>)
+                     is_same_v<OutLayout, NHWGK>)
         {
             if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
                          is_same_v<DsDataType, I32_F32_Tuple> && is_same_v<OutDataType, int8_t>)
@@ -220,9 +220,9 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
     {
         std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
 
-        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, GNHWC> &&
+        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, NHWGC> &&
                      is_same_v<WeiLayout, GKYXC> && is_same_v<DsLayout, GK_GK_Tuple> &&
-                     is_same_v<OutLayout, GNHWK>)
+                     is_same_v<OutLayout, NHWGK>)
         {
             if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
                          is_same_v<DsDataType, I32_F32_Tuple> && is_same_v<OutDataType, int8_t>)
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp
index c570f7675..b7d81021e 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp
@@ -17,14 +17,14 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-// grouped conv2d forward, GNHWC/GKYXC/GNHWK
+// grouped conv2d forward, NHWGC/GKYXC/NHWGK
 void add_device_conv2d_dl_bias_perlayer_quantization_int8_instances(
     std::vector<
         std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                      GNHWC,
+                                                      NHWGC,
                                                       GKYXC,
                                                       GK_Tuple,
-                                                      GNHWK,
+                                                      NHWGK,
                                                       int8_t,
                                                       int8_t,
                                                       I32_Tuple,
@@ -36,10 +36,10 @@ void add_device_conv2d_dl_bias_perlayer_quantization_int8_instances(
 
 void add_device_conv2d_dl_bias_relu_perlayer_quantization_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
+                                                              NHWGC,
                                                               GKYXC,
                                                               GK_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                               int8_t,
                                                               int8_t,
                                                               I32_Tuple,
@@ -51,10 +51,10 @@ void add_device_conv2d_dl_bias_relu_perlayer_quantization_int8_instances(
 
 void add_device_conv2d_dl_bias_tanh_perlayer_quantization_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
+                                                              NHWGC,
                                                               GKYXC,
                                                               GK_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                               int8_t,
                                                               int8_t,
                                                               I32_Tuple,
@@ -67,10 +67,10 @@ void add_device_conv2d_dl_bias_tanh_perlayer_quantization_int8_instances(
 void add_device_conv2d_xdl_bias_perlayer_quantization_int8_instances(
     std::vector<
         std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                      GNHWC,
+                                                      NHWGC,
                                                       GKYXC,
                                                       GK_Tuple,
-                                                      GNHWK,
+                                                      NHWGK,
                                                       int8_t,
                                                       int8_t,
                                                       I32_Tuple,
@@ -82,10 +82,10 @@ void add_device_conv2d_xdl_bias_perlayer_quantization_int8_instances(
 
 void add_device_conv2d_xdl_bias_relu_perlayer_quantization_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
+                                                              NHWGC,
                                                               GKYXC,
                                                               GK_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                               int8_t,
                                                               int8_t,
                                                               I32_Tuple,
@@ -97,10 +97,10 @@ void add_device_conv2d_xdl_bias_relu_perlayer_quantization_int8_instances(
 
 void add_device_conv2d_xdl_bias_tanh_perlayer_quantization_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
+                                                              NHWGC,
                                                               GKYXC,
                                                               GK_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                               int8_t,
                                                               int8_t,
                                                               I32_Tuple,
@@ -152,9 +152,9 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
     {
         std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
 
-        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, GNHWC> &&
+        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, NHWGC> &&
                      is_same_v<WeiLayout, GKYXC> && is_same_v<DsLayout, GK_Tuple> &&
-                     is_same_v<OutLayout, GNHWK>)
+                     is_same_v<OutLayout, NHWGK>)
         {
             if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
                          is_same_v<DsDataType, I32_Tuple> && is_same_v<OutDataType, int8_t>)
@@ -218,9 +218,9 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
     {
         std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
 
-        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, GNHWC> &&
+        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, NHWGC> &&
                      is_same_v<WeiLayout, GKYXC> && is_same_v<DsLayout, GK_Tuple> &&
-                     is_same_v<OutLayout, GNHWK>)
+                     is_same_v<OutLayout, NHWGK>)
         {
             if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
                          is_same_v<DsDataType, I32_Tuple> && is_same_v<OutDataType, int8_t>)
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perchannel_quantization.hpp b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perchannel_quantization.hpp
index 089343fe6..2d54879ea 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perchannel_quantization.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perchannel_quantization.hpp
@@ -17,13 +17,13 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-// grouped conv2d forward, GNHWC/GKYXC/GNHWK
+// grouped conv2d forward, NHWGC/GKYXC/NHWGK
 void add_device_conv2d_dl_perchannel_quantization_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
+                                                              NHWGC,
                                                               GKYXC,
                                                               GK_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                               int8_t,
                                                               int8_t,
                                                               F32_Tuple,
@@ -35,10 +35,10 @@ void add_device_conv2d_dl_perchannel_quantization_int8_instances(
 
 void add_device_conv2d_dl_relu_perchannel_quantization_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
+                                                              NHWGC,
                                                               GKYXC,
                                                               GK_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                               int8_t,
                                                               int8_t,
                                                               F32_Tuple,
@@ -50,10 +50,10 @@ void add_device_conv2d_dl_relu_perchannel_quantization_int8_instances(
 
 void add_device_conv2d_xdl_perchannel_quantization_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
+                                                              NHWGC,
                                                               GKYXC,
                                                               GK_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                               int8_t,
                                                               int8_t,
                                                               F32_Tuple,
@@ -65,10 +65,10 @@ void add_device_conv2d_xdl_perchannel_quantization_int8_instances(
 
 void add_device_conv2d_xdl_relu_perchannel_quantization_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
+                                                              NHWGC,
                                                               GKYXC,
                                                               GK_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                               int8_t,
                                                               int8_t,
                                                               F32_Tuple,
@@ -119,9 +119,9 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
     {
         std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
 
-        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, GNHWC> &&
+        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, NHWGC> &&
                      is_same_v<WeiLayout, GKYXC> && is_same_v<DsLayout, GK_Tuple> &&
-                     is_same_v<OutLayout, GNHWK>)
+                     is_same_v<OutLayout, NHWGK>)
         {
             if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
                          is_same_v<OutDataType, int8_t>)
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perlayer_quantization.hpp b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perlayer_quantization.hpp
index e570027eb..f278cfa22 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perlayer_quantization.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perlayer_quantization.hpp
@@ -17,13 +17,13 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-// grouped conv2d forward, GNHWC/GKYXC/GNHWK
+// grouped conv2d forward, NHWGC/GKYXC/NHWGK
 void add_device_conv2d_dl_perlayer_quantization_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
+                                                              NHWGC,
                                                               GKYXC,
                                                               Empty_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                               int8_t,
                                                               int8_t,
                                                               Empty_Tuple,
@@ -35,10 +35,10 @@ void add_device_conv2d_dl_perlayer_quantization_int8_instances(
 
 void add_device_conv2d_dl_relu_perlayer_quantization_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
+                                                              NHWGC,
                                                               GKYXC,
                                                               Empty_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                               int8_t,
                                                               int8_t,
                                                               Empty_Tuple,
@@ -50,10 +50,10 @@ void add_device_conv2d_dl_relu_perlayer_quantization_int8_instances(
 
 void add_device_conv2d_xdl_perlayer_quantization_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
+                                                              NHWGC,
                                                               GKYXC,
                                                               Empty_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                               int8_t,
                                                               int8_t,
                                                               Empty_Tuple,
@@ -65,10 +65,10 @@ void add_device_conv2d_xdl_perlayer_quantization_int8_instances(
 
 void add_device_conv2d_xdl_relu_perlayer_quantization_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
+                                                              NHWGC,
                                                               GKYXC,
                                                               Empty_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                               int8_t,
                                                               int8_t,
                                                               Empty_Tuple,
@@ -117,8 +117,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
     {
         std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
 
-        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, GNHWC> &&
-                     is_same_v<WeiLayout, GKYXC> && is_same_v<OutLayout, GNHWK>)
+        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, NHWGC> &&
+                     is_same_v<WeiLayout, GKYXC> && is_same_v<OutLayout, NHWGK>)
         {
             if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
                          is_same_v<OutDataType, int8_t>)
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt
index 5ef1b6866..a36e1b47c 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt
@@ -3,11 +3,11 @@ add_instance_library(device_grouped_conv2d_fwd_instance
    device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp
    device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
    device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
-   device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_int8_instance.cpp
    # NHWGC, GKYXC, NHWGK
+   device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
    device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
-   #dl 
+   device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
+   #dl
    device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
    device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
-   device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_instance.cpp
 )
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_common.hpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_common.hpp
new file mode 100644
index 000000000..b4de825fb
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_common.hpp
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using BF16 = ck::bhalf_t;
+using F16  = ck::half_t;
+using F32  = float;
+
+using Empty_Tuple = ck::Tuple<>;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using NHWGC = ck::tensor_layout::convolution::NHWGC;
+using GNHWC = ck::tensor_layout::convolution::GNHWC;
+
+using GKYXC = ck::tensor_layout::convolution::GKYXC;
+
+using NHWGK = ck::tensor_layout::convolution::NHWGK;
+using GNHWK = ck::tensor_layout::convolution::GNHWK;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto ConvFwdDefault =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+
+static constexpr auto ConvFwd1x1P0 =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0;
+
+static constexpr auto ConvFwd1x1S1P0 =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0;
+
+static constexpr auto ConvFwdOddC =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::OddC;
+
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
index fc18b3c73..f7e575df2 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
@@ -1,100 +1,54 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 
-#include <cstdlib>
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "device_grouped_conv2d_fwd_dl_instance.hpp"
 
 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
 
-using InDataType  = ck::half_t;
-using WeiDataType = ck::half_t;
-using AccDataType = float;
-using OutDataType = ck::half_t;
-
-using Empty_Tuple = ck::Tuple<>;
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
-using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
-using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
-
-using InLayout  = ck::tensor_layout::convolution::GNHWC;
-using WeiLayout = ck::tensor_layout::convolution::GKYXC;
-using OutLayout = ck::tensor_layout::convolution::GNHWK;
-
-static constexpr auto ConvSpec =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
-static constexpr auto Filter1x1Pad0 =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0;
-static constexpr auto Filter1x1Stride1Pad0 =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0;
-
-static constexpr auto GemmPadingSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-
-using device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instances = std::tuple<
-    // clang-format off
-           // ########################################|        NDim|     InData|     WeiData|    MultpleD|     OutData|     AccData| InLayout| WeiLayout|   MultipleD| OutLayout|           In|           Wei|           Out|    Convolution|              GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|      ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|      BBlockTransfer|     CThreadTransfer| CThreadTransfer|    CThreadTransfer|
-           // ########################################|     Spatial|       Type|        Type|        Type|        Type|        Type|         |          |      Layout|          |  Elementwise|   Elementwise|   Elementwise|        Forward|    Spacialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
-           // ########################################|            |           |            |            |            |            |         |          |            |          |    Operation|     Operation|     Operation| Specialization|                  |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
-           // ########################################|            |           |            |            |            |            |         |          |            |          |             |              |              |               |                  |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                    |                   |                     |               |               |                    |                   |                    |                    |                |                   |
-        DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK<           2, InDataType, WeiDataType, Empty_Tuple, OutDataType, AccDataType, InLayout, WeiLayout, Empty_Tuple, OutLayout,  InElementOp,  WeiElementOp,  OutElementOp,       ConvSpec,    GemmPadingSpec,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,               5,                 4>
-    // clang-format on
-    >;
-using device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_Filter1x1Pad0_instances = std::tuple<
-    // clang-format off
-           // ########################################|        NDim|     InData|     WeiData|    MultpleD|     OutData|     AccData| InLayout| WeiLayout|   MultipleD| OutLayout|           In|           Wei|           Out|    Convolution|              GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|      ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|      BBlockTransfer|     CThreadTransfer| CThreadTransfer|    CThreadTransfer|
-           // ########################################|     Spatial|       Type|        Type|        Type|        Type|        Type|         |          |      Layout|          |  Elementwise|   Elementwise|   Elementwise|        Forward|    Spacialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
-           // ########################################|            |           |            |            |            |            |         |          |            |          |    Operation|     Operation|     Operation| Specialization|                  |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
-           // ########################################|            |           |            |            |            |            |         |          |            |          |             |              |              |               |                  |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                    |                   |                     |               |               |                    |                   |                    |                    |                |                   |
-        DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK<           2, InDataType, WeiDataType, Empty_Tuple, OutDataType, AccDataType, InLayout, WeiLayout, Empty_Tuple, OutLayout,  InElementOp,  WeiElementOp,  OutElementOp,  Filter1x1Pad0,    GemmPadingSpec,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,               5,                 4>
-    // clang-format on
-    >;
-
-using device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_Filter1x1Stride1Pad0_instances =
-    std::tuple<
-        // clang-format off
-           // ########################################|        NDim|     InData|     WeiData|    MultpleD|     OutData|     AccData| InLayout| WeiLayout|   MultipleD| OutLayout|           In|           Wei|           Out|          Convolution|              GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|      ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|      BBlockTransfer|     CThreadTransfer| CThreadTransfer|    CThreadTransfer|
-           // ########################################|     Spatial|       Type|        Type|        Type|        Type|        Type|         |          |      Layout|          |  Elementwise|   Elementwise|   Elementwise|              Forward|    Spacialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
-           // ########################################|            |           |            |            |            |            |         |          |            |          |    Operation|     Operation|     Operation|       Specialization|                  |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
-           // ########################################|            |           |            |            |            |            |         |          |            |          |             |              |              |                     |                  |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                    |                   |                     |               |               |                    |                   |                    |                    |                |                   |
-        DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK<           2, InDataType, WeiDataType, Empty_Tuple, OutDataType, AccDataType, InLayout, WeiLayout, Empty_Tuple, OutLayout,  InElementOp,  WeiElementOp,  OutElementOp, Filter1x1Stride1Pad0,    GemmPadingSpec,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,               5,                 4>
-        // clang-format on
-        >;
-
 void add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              InLayout,
-                                                              WeiLayout,
+                                                              GNHWC,
+                                                              GKYXC,
                                                               Empty_Tuple,
-                                                              OutLayout,
-                                                              InDataType,
-                                                              WeiDataType,
+                                                              GNHWK,
+                                                              F16,
+                                                              F16,
                                                               Empty_Tuple,
-                                                              OutDataType,
-                                                              InElementOp,
-                                                              WeiElementOp,
-                                                              OutElementOp>>>& instances)
+                                                              F16,
+                                                              PassThrough,
+                                                              PassThrough,
+                                                              PassThrough>>>& instances)
 {
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instances{});
+                                   device_grouped_conv2d_fwd_dl_f16_instances<GNHWC,
+                                                                              GKYXC,
+                                                                              Empty_Tuple,
+                                                                              GNHWK,
+                                                                              Empty_Tuple,
+                                                                              PassThrough,
+                                                                              ConvFwdDefault>{});
 
-    add_device_operation_instances(
-        instances, device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_Filter1x1Pad0_instances{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_dl_f16_instances<GNHWC,
+                                                                              GKYXC,
+                                                                              Empty_Tuple,
+                                                                              GNHWK,
+                                                                              Empty_Tuple,
+                                                                              PassThrough,
+                                                                              ConvFwd1x1P0>{});
 
-    add_device_operation_instances(
-        instances,
-        device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_Filter1x1Stride1Pad0_instances{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_dl_f16_instances<GNHWC,
+                                                                              GKYXC,
+                                                                              Empty_Tuple,
+                                                                              GNHWK,
+                                                                              Empty_Tuple,
+                                                                              PassThrough,
+                                                                              ConvFwd1x1S1P0>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
index 648b39637..85300b4e4 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
@@ -1,104 +1,54 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 
-#include <cstdlib>
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "device_grouped_conv2d_fwd_dl_instance.hpp"
 
 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
 
-using InDataType  = float;
-using WeiDataType = float;
-using AccDataType = float;
-using OutDataType = float;
-
-using Empty_Tuple = ck::Tuple<>;
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
-using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
-using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
-
-using InLayout  = ck::tensor_layout::convolution::GNHWC;
-using WeiLayout = ck::tensor_layout::convolution::GKYXC;
-using OutLayout = ck::tensor_layout::convolution::GNHWK;
-
-static constexpr auto ConvSpec =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
-static constexpr auto Filter1x1Pad0 =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0;
-static constexpr auto Filter1x1Stride1Pad0 =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0;
-
-static constexpr auto GemmPadingSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-
-using device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instances = std::tuple<
-    // clang-format off
-           // clang-format off
-           // ########################################|        NDim|     InData|     WeiData|    MultpleD|     OutData|     AccData| InLayout| WeiLayout|   MultipleD| OutLayout|           In|           Wei|           Out|    Convolution|              GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|      ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|      BBlockTransfer|     CThreadTransfer| CThreadTransfer|    CThreadTransfer|
-           // ########################################|     Spatial|       Type|        Type|        Type|        Type|        Type|         |          |      Layout|          |  Elementwise|   Elementwise|   Elementwise|        Forward|    Spacialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
-           // ########################################|            |           |            |            |            |            |         |          |            |          |    Operation|     Operation|     Operation| Specialization|                  |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
-           // ########################################|            |           |            |            |            |            |         |          |            |          |             |              |              |               |                  |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                    |                   |                     |               |               |                    |                   |                    |                    |                |                   |
-        DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK<           2, InDataType, WeiDataType, Empty_Tuple, OutDataType, AccDataType, InLayout, WeiLayout, Empty_Tuple, OutLayout,  InElementOp,  WeiElementOp,  OutElementOp,       ConvSpec,    GemmPadingSpec,   256,   128,   128,    16,  1,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 1>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 1>,      S<1, 2, 0, 3>,       S<1, 1, 1, 1>,      S<8, 1, 1, 1>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 1>,      S<1, 2, 0, 3>,       S<1, 1, 1, 1>, S<0, 1, 2, 3, 4, 5>,               5,                 4>
-    // clang-format on
-    >;
-
-using device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_Filter1x1Pad0_instances = std::tuple<
-    // clang-format off
-           // clang-format off
-           // ########################################|        NDim|     InData|     WeiData|    MultpleD|     OutData|     AccData| InLayout| WeiLayout|   MultipleD| OutLayout|           In|           Wei|           Out|    Convolution|              GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|      ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|      BBlockTransfer|     CThreadTransfer| CThreadTransfer|    CThreadTransfer|
-           // ########################################|     Spatial|       Type|        Type|        Type|        Type|        Type|         |          |      Layout|          |  Elementwise|   Elementwise|   Elementwise|        Forward|    Spacialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
-           // ########################################|            |           |            |            |            |            |         |          |            |          |    Operation|     Operation|     Operation| Specialization|                  |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
-           // ########################################|            |           |            |            |            |            |         |          |            |          |             |              |              |               |                  |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                    |                   |                     |               |               |                    |                   |                    |                    |                |                   |
-        DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK<           2, InDataType, WeiDataType, Empty_Tuple, OutDataType, AccDataType, InLayout, WeiLayout, Empty_Tuple, OutLayout,  InElementOp,  WeiElementOp,  OutElementOp,  Filter1x1Pad0,    GemmPadingSpec,   256,   128,   128,    16,  1,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 1>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 1>,      S<1, 2, 0, 3>,       S<1, 1, 1, 1>,      S<8, 1, 1, 1>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 1>,      S<1, 2, 0, 3>,       S<1, 1, 1, 1>, S<0, 1, 2, 3, 4, 5>,               5,                 4>
-    // clang-format on
-    >;
-
-using device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_Filter1x1Stride1Pad0_instances =
-    std::tuple<
-        // clang-format off
-           // clang-format off
-           // ########################################|        NDim|     InData|     WeiData|    MultpleD|     OutData|     AccData| InLayout| WeiLayout|   MultipleD| OutLayout|           In|           Wei|           Out|          Convolution|              GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|      ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|      BBlockTransfer|     CThreadTransfer| CThreadTransfer|    CThreadTransfer|
-           // ########################################|     Spatial|       Type|        Type|        Type|        Type|        Type|         |          |      Layout|          |  Elementwise|   Elementwise|   Elementwise|              Forward|    Spacialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
-           // ########################################|            |           |            |            |            |            |         |          |            |          |    Operation|     Operation|     Operation|       Specialization|                  |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
-           // ########################################|            |           |            |            |            |            |         |          |            |          |             |              |              |                     |                  |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                    |                   |                     |               |               |                    |                   |                    |                    |                |                   |
-        DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK<           2, InDataType, WeiDataType, Empty_Tuple, OutDataType, AccDataType, InLayout, WeiLayout, Empty_Tuple, OutLayout,  InElementOp,  WeiElementOp,  OutElementOp, Filter1x1Stride1Pad0,    GemmPadingSpec,   256,   128,   128,    16,  1,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 1>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 1>,      S<1, 2, 0, 3>,       S<1, 1, 1, 1>,      S<8, 1, 1, 1>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 1>,      S<1, 2, 0, 3>,       S<1, 1, 1, 1>, S<0, 1, 2, 3, 4, 5>,               5,                 4>
-        // clang-format on
-        >;
-
 void add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              InLayout,
-                                                              WeiLayout,
+                                                              GNHWC,
+                                                              GKYXC,
                                                               Empty_Tuple,
-                                                              OutLayout,
-                                                              InDataType,
-                                                              WeiDataType,
+                                                              GNHWK,
+                                                              F32,
+                                                              F32,
                                                               Empty_Tuple,
-                                                              OutDataType,
-                                                              InElementOp,
-                                                              WeiElementOp,
-                                                              OutElementOp>>>& instances)
+                                                              F32,
+                                                              PassThrough,
+                                                              PassThrough,
+                                                              PassThrough>>>& instances)
 {
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instances{});
+                                   device_grouped_conv2d_fwd_dl_f32_instances<GNHWC,
+                                                                              GKYXC,
+                                                                              Empty_Tuple,
+                                                                              GNHWK,
+                                                                              Empty_Tuple,
+                                                                              PassThrough,
+                                                                              ConvFwdDefault>{});
 
-    add_device_operation_instances(
-        instances, device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_Filter1x1Pad0_instances{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_dl_f32_instances<GNHWC,
+                                                                              GKYXC,
+                                                                              Empty_Tuple,
+                                                                              GNHWK,
+                                                                              Empty_Tuple,
+                                                                              PassThrough,
+                                                                              ConvFwd1x1P0>{});
 
-    add_device_operation_instances(
-        instances,
-        device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_Filter1x1Stride1Pad0_instances{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_dl_f32_instances<GNHWC,
+                                                                              GKYXC,
+                                                                              Empty_Tuple,
+                                                                              GNHWK,
+                                                                              Empty_Tuple,
+                                                                              PassThrough,
+                                                                              ConvFwd1x1S1P0>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_instance.cpp
deleted file mode 100644
index 1cb5d0699..000000000
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_instance.cpp
+++ /dev/null
@@ -1,104 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <cstdlib>
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-using InDataType  = int8_t;
-using WeiDataType = int8_t;
-using AccDataType = int32_t;
-using OutDataType = int8_t;
-
-using Empty_Tuple = ck::Tuple<>;
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
-using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
-using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
-
-using InLayout  = ck::tensor_layout::convolution::GNHWC;
-using WeiLayout = ck::tensor_layout::convolution::GKYXC;
-using OutLayout = ck::tensor_layout::convolution::GNHWK;
-
-static constexpr auto ConvSpec =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
-static constexpr auto Filter1x1Pad0 =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0;
-static constexpr auto Filter1x1Stride1Pad0 =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0;
-
-static constexpr auto GemmPadingSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-
-using device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_instances = std::tuple<
-    // clang-format off
-           // ########################################|        NDim|     InData|     WeiData|    MultpleD|     OutData|     AccData| InLayout| WeiLayout|   MultipleD| OutLayout|           In|           Wei|           Out|    Convolution|              GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|      ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|      BBlockTransfer|     CThreadTransfer| CThreadTransfer|    CThreadTransfer|
-           // ########################################|     Spatial|       Type|        Type|        Type|        Type|        Type|         |          |      Layout|          |  Elementwise|   Elementwise|   Elementwise|        Forward|    Spacialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
-           // ########################################|            |           |            |            |            |            |         |          |            |          |    Operation|     Operation|     Operation| Specialization|                  |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
-           // ########################################|            |           |            |            |            |            |         |          |            |          |             |              |              |               |                  |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                    |                   |                     |               |               |                    |                   |                    |                    |                |                   |
-        DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK<           2, InDataType, WeiDataType, Empty_Tuple, OutDataType, AccDataType, InLayout, WeiLayout, Empty_Tuple, OutLayout,  InElementOp,  WeiElementOp,  OutElementOp,       ConvSpec,    GemmPadingSpec,   256,   128,   128,    16,  4,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,       S<1, 1, 1, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,       S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,               5,                 4>
-    // clang-format on
-    >;
-
-using device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_Filter1x1Pad0_instances = std::tuple<
-    // clang-format off
-           // ########################################|        NDim|     InData|     WeiData|    MultpleD|     OutData|     AccData| InLayout| WeiLayout|   MultipleD| OutLayout|           In|           Wei|           Out|    Convolution|              GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|      ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|      BBlockTransfer|     CThreadTransfer| CThreadTransfer|    CThreadTransfer|
-           // ########################################|     Spatial|       Type|        Type|        Type|        Type|        Type|         |          |      Layout|          |  Elementwise|   Elementwise|   Elementwise|        Forward|    Spacialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
-           // ########################################|            |           |            |            |            |            |         |          |            |          |    Operation|     Operation|     Operation| Specialization|                  |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
-           // ########################################|            |           |            |            |            |            |         |          |            |          |             |              |              |               |                  |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                    |                   |                     |               |               |                    |                   |                    |                    |                |                   |
-        DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK<           2, InDataType, WeiDataType, Empty_Tuple, OutDataType, AccDataType, InLayout, WeiLayout, Empty_Tuple, OutLayout,  InElementOp,  WeiElementOp,  OutElementOp,  Filter1x1Pad0,    GemmPadingSpec,   256,   128,   128,    16,  4,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,       S<1, 1, 1, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,       S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,               5,                 4>
-    // clang-format on
-    >;
-
-using device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_Filter1x1Stride1Pad0_instances =
-    std::tuple<
-        // clang-format off
-           // ########################################|        NDim|     InData|     WeiData|    MultpleD|     OutData|     AccData| InLayout| WeiLayout|   MultipleD| OutLayout|           In|           Wei|           Out|          Convolution|              GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|      ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|      BBlockTransfer|     CThreadTransfer| CThreadTransfer|    CThreadTransfer|
-           // ########################################|     Spatial|       Type|        Type|        Type|        Type|        Type|         |          |      Layout|          |  Elementwise|   Elementwise|   Elementwise|              Forward|    Spacialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
-           // ########################################|            |           |            |            |            |            |         |          |            |          |    Operation|     Operation|     Operation|       Specialization|                  |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
-           // ########################################|            |           |            |            |            |            |         |          |            |          |             |              |              |                     |                  |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                    |                   |                     |               |               |                    |                   |                    |                    |                |                   |
-        DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK<           2, InDataType, WeiDataType, Empty_Tuple, OutDataType, AccDataType, InLayout, WeiLayout, Empty_Tuple, OutLayout,  InElementOp,  WeiElementOp,  OutElementOp, Filter1x1Stride1Pad0,    GemmPadingSpec,   256,   128,   128,    16,  4,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,       S<1, 1, 1, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,       S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,               5,                 4>
-        // clang-format on
-        >;
-
-void add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              InLayout,
-                                                              WeiLayout,
-                                                              Empty_Tuple,
-                                                              OutLayout,
-                                                              InDataType,
-                                                              WeiDataType,
-                                                              Empty_Tuple,
-                                                              OutDataType,
-                                                              InElementOp,
-                                                              WeiElementOp,
-                                                              OutElementOp>>>& instances)
-{
-    add_device_operation_instances(instances,
-                                   device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_instances{});
-
-    add_device_operation_instances(
-        instances, device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_Filter1x1Pad0_instances{});
-
-    add_device_operation_instances(
-        instances,
-        device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_Filter1x1Stride1Pad0_instances{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_instance.hpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_instance.hpp
new file mode 100644
index 000000000..bcda22006
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_instance.hpp
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
+#include "device_grouped_conv2d_fwd_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <typename InLayout,
+          typename WeiLayout,
+          typename DsLayout,
+          typename OutLayout,
+          typename DsDatatype,
+          typename CDEElementOp,
+          ConvolutionForwardSpecialization ConvSpec>
+using device_grouped_conv2d_fwd_dl_f16_instances = std::tuple<
+    // clang-format off
+           // ########################################|        NDim| InData| WeiData|    MultpleD| OutData| AccData| InLayout| WeiLayout| MultipleD| OutLayout|          In|          Wei|           Out|    Convolution|              GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|      ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|      BBlockTransfer|     CThreadTransfer| CThreadTransfer|    CThreadTransfer|
+           // ########################################|     Spatial|   Type|    Type|        Type|    Type|    Type|         |          |    Layout|          | Elementwise|  Elementwise|   Elementwise|        Forward|    Spacialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
+           // ########################################|            |       |        |            |        |        |         |          |          |          |   Operation|    Operation|     Operation| Specialization|                  |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
+           // ########################################|            |       |        |            |        |        |         |          |          |          |            |             |              |               |                  |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                    |                   |                     |               |               |                    |                   |                    |                    |                |                   |
+        DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK<           2,    F16,     F16,  DsDatatype,     F16,     F32, InLayout, WeiLayout,  DsLayout, OutLayout, PassThrough,  PassThrough,  CDEElementOp,       ConvSpec,    GemmMNKPadding,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,               5,                 4>
+    // clang-format on
+    >;
+
+template <typename InLayout,
+          typename WeiLayout,
+          typename DsLayout,
+          typename OutLayout,
+          typename DsDatatype,
+          typename CDEElementOp,
+          ConvolutionForwardSpecialization ConvSpec>
+using device_grouped_conv2d_fwd_dl_f32_instances = std::tuple<
+    // clang-format off
+        // clang-format off
+           // ########################################|        NDim| InData| WeiData|    MultpleD| OutData| AccData| InLayout| WeiLayout| MultipleD| OutLayout|          In|          Wei|           Out|    Convolution|              GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|      ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|      BBlockTransfer|     CThreadTransfer| CThreadTransfer|    CThreadTransfer|
+           // ########################################|     Spatial|   Type|    Type|        Type|    Type|    Type|         |          |    Layout|          | Elementwise|  Elementwise|   Elementwise|        Forward|    Spacialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
+           // ########################################|            |       |        |            |        |        |         |          |          |          |   Operation|    Operation|     Operation| Specialization|                  |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
+           // ########################################|            |       |        |            |        |        |         |          |          |          |            |             |              |               |                  |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                    |                   |                     |               |               |                    |                   |                    |                    |                |                   |
+        DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK<           2,    F32,     F32,  DsDatatype,     F32,     F32, InLayout, WeiLayout,  DsLayout, OutLayout, PassThrough, PassThrough,  CDEElementOp,       ConvSpec,    GemmMNKPadding,   256,   128,   128,    16,  1,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 1>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 1>,      S<1, 2, 0, 3>,       S<1, 1, 1, 1>,      S<8, 1, 1, 1>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 1>,      S<1, 2, 0, 3>,       S<1, 1, 1, 1>, S<0, 1, 2, 3, 4, 5>,               5,                 4>
+    // clang-format on
+    >;
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp
index 29f331031..40593a0ef 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp
@@ -1,137 +1,14 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 
-#include <cstdlib>
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
-#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "device_grouped_conv2d_fwd_xdl_instance.hpp"
 
 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
-
-using BF16 = ck::bhalf_t;
-using F32  = float;
-
-using Empty_Tuple = ck::Tuple<>;
-
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using GNHWC = ck::tensor_layout::convolution::GNHWC;
-using GKYXC = ck::tensor_layout::convolution::GKYXC;
-using GNHWK = ck::tensor_layout::convolution::GNHWK;
-
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-static constexpr auto ConvFwdDefault =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
-
-static constexpr auto ConvFwd1x1P0 =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0;
-
-static constexpr auto ConvFwd1x1S1P0 =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0;
-
-static constexpr auto ConvFwdOddC =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::OddC;
-
-static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-
 // Compilation parameters for in[g, n, hi, wi, c] * wei[g, k, y, x, c] = out[g, n, ho, wo, k]
-using device_grouped_conv1d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instances =
-    std::tuple<
-        // clang-format off
-        // Default
-        //########################################|  NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //########################################| Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-
-        // Filter1x1Pad0
-        //########################################|  NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //########################################| Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-
-        // Filter1x1Stride1Pad0
-        //########################################|  NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //########################################| Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-
-        // OddC
-        //########################################|  NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //########################################| Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4,  2, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  2, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4,  2, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  2, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4,  2, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  2, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<2, 32, 4>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<2, 32, 4>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<2, 32, 4>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<2, 32, 4>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   256,   256,    64,    32,   8,   8,   32,   32,    4,    1,     S<2, 32, 4>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<2, 32, 4>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<2, 16, 4>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<2, 16, 4>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   128,    64,    64,    32,   8,   8,   32,   32,    1,    2,     S<2, 16, 4>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<2, 16, 4>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 4>,               8>
-        // clang-format on
-        >;
-
 void add_device_grouped_conv1d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
                                                               GNHWC,
@@ -146,8 +23,41 @@ void add_device_grouped_conv1d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instances(
                                                               PassThrough,
                                                               PassThrough>>>& instances)
 {
-    add_device_operation_instances(
-        instances, device_grouped_conv1d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instances{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_xdl_bf16_instances<GNHWC,
+                                                                                GKYXC,
+                                                                                Empty_Tuple,
+                                                                                GNHWK,
+                                                                                Empty_Tuple,
+                                                                                PassThrough,
+                                                                                ConvFwdDefault>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_xdl_bf16_instances<GNHWC,
+                                                                                GKYXC,
+                                                                                Empty_Tuple,
+                                                                                GNHWK,
+                                                                                Empty_Tuple,
+                                                                                PassThrough,
+                                                                                ConvFwd1x1P0>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_xdl_bf16_instances<GNHWC,
+                                                                                GKYXC,
+                                                                                Empty_Tuple,
+                                                                                GNHWK,
+                                                                                Empty_Tuple,
+                                                                                PassThrough,
+                                                                                ConvFwd1x1S1P0>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_xdl_bf16_instances<GNHWC,
+                                                                                GKYXC,
+                                                                                Empty_Tuple,
+                                                                                GNHWK,
+                                                                                Empty_Tuple,
+                                                                                PassThrough,
+                                                                                ConvFwdOddC>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
index 6a4a3d2a4..7088028bf 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
@@ -1,137 +1,14 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 
-#include <cstdlib>
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
-#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "device_grouped_conv2d_fwd_xdl_instance.hpp"
 
 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
-
-using F16 = ck::half_t;
-using F32 = float;
-
-using Empty_Tuple = ck::Tuple<>;
-
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using GNHWC = ck::tensor_layout::convolution::GNHWC;
-using GKYXC = ck::tensor_layout::convolution::GKYXC;
-using GNHWK = ck::tensor_layout::convolution::GNHWK;
-
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-static constexpr auto ConvFwdDefault =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
-
-static constexpr auto ConvFwd1x1P0 =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0;
-
-static constexpr auto ConvFwd1x1S1P0 =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0;
-
-static constexpr auto ConvFwdOddC =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::OddC;
-
-static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-
 // Compilation parameters for in[g, n, hi ,wi, c] * wei[g, k, y, x, c] = out[g, n, ho, wo, k]
-using device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instances =
-    std::tuple<
-        // clang-format off
-        // Default
-        //########################################|  NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //########################################| Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-
-        // Filter1x1Pad0
-        //########################################|  NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //########################################| Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-
-        // Filter1x1Stride1Pad0
-        //########################################|  NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //########################################| Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-
-        // OddC
-        //########################################|  NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //########################################| Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4,  2, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  2, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4,  2, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  2, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4,  2, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  2, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<2, 32, 4>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<2, 32, 4>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<2, 32, 4>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<2, 32, 4>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   256,   256,    64,    32,   8,   8,   32,   32,    4,    1,     S<2, 32, 4>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<2, 32, 4>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<2, 16, 4>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<2, 16, 4>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   128,    64,    64,    32,   8,   8,   32,   32,    1,    2,     S<2, 16, 4>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<2, 16, 4>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 4>,               8>
-        // clang-format on
-        >;
-
 void add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
                                                               GNHWC,
@@ -147,7 +24,40 @@ void add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instances(
                                                               PassThrough>>>& instances)
 {
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instances{});
+                                   device_grouped_conv2d_fwd_xdl_f16_instances<GNHWC,
+                                                                               GKYXC,
+                                                                               Empty_Tuple,
+                                                                               GNHWK,
+                                                                               Empty_Tuple,
+                                                                               PassThrough,
+                                                                               ConvFwdDefault>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_xdl_f16_instances<GNHWC,
+                                                                               GKYXC,
+                                                                               Empty_Tuple,
+                                                                               GNHWK,
+                                                                               Empty_Tuple,
+                                                                               PassThrough,
+                                                                               ConvFwd1x1P0>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_xdl_f16_instances<GNHWC,
+                                                                               GKYXC,
+                                                                               Empty_Tuple,
+                                                                               GNHWK,
+                                                                               Empty_Tuple,
+                                                                               PassThrough,
+                                                                               ConvFwd1x1S1P0>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_xdl_f16_instances<GNHWC,
+                                                                               GKYXC,
+                                                                               Empty_Tuple,
+                                                                               GNHWK,
+                                                                               Empty_Tuple,
+                                                                               PassThrough,
+                                                                               ConvFwdOddC>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
index 1fec35fd9..919274c50 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
@@ -1,109 +1,14 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 
-#include <cstdlib>
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
-#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "device_grouped_conv2d_fwd_xdl_instance.hpp"
 
 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
-
-using F32 = float;
-
-using Empty_Tuple = ck::Tuple<>;
-
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using GNHWC = ck::tensor_layout::convolution::GNHWC;
-using GKYXC = ck::tensor_layout::convolution::GKYXC;
-using GNHWK = ck::tensor_layout::convolution::GNHWK;
-
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-static constexpr auto ConvFwdDefault =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
-
-static constexpr auto ConvFwd1x1P0 =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0;
-
-static constexpr auto ConvFwd1x1S1P0 =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0;
-
-static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-
 // Compilation parameters for in[g, n, hi, wi, c] * wei[g, k, y, x, c] = out[g, n, ho, wo, k]
-using device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instances =
-    std::tuple<
-        // clang-format off
-        // Default
-        //########################################|  NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //########################################| Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,   256,    16,   4,   4,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,    64,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    64,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,    64,    16,   4,   4,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,    32,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    64,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    32,    64,    16,   4,   4,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
-
-        // Filter1x1Pad0
-        //########################################|  NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //########################################| Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,   256,    16,   4,   4,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,    64,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    64,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,    64,    16,   4,   4,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,    32,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    64,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    32,    64,    16,   4,   4,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
-
-        // Filter1x1Stride1Pad0
-        //########################################|  NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //########################################| Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,   256,    16,   4,   4,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,    64,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    64,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,    64,    16,   4,   4,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,    32,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    64,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    32,    64,    16,   4,   4,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>
-        // clang-format on
-        >;
-
 void add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
                                                               GNHWC,
@@ -119,7 +24,40 @@ void add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instances(
                                                               PassThrough>>>& instances)
 {
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instances{});
+                                   device_grouped_conv2d_fwd_xdl_f32_instances<GNHWC,
+                                                                               GKYXC,
+                                                                               Empty_Tuple,
+                                                                               GNHWK,
+                                                                               Empty_Tuple,
+                                                                               PassThrough,
+                                                                               ConvFwdDefault>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_xdl_f32_instances<GNHWC,
+                                                                               GKYXC,
+                                                                               Empty_Tuple,
+                                                                               GNHWK,
+                                                                               Empty_Tuple,
+                                                                               PassThrough,
+                                                                               ConvFwd1x1P0>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_xdl_f32_instances<GNHWC,
+                                                                               GKYXC,
+                                                                               Empty_Tuple,
+                                                                               GNHWK,
+                                                                               Empty_Tuple,
+                                                                               PassThrough,
+                                                                               ConvFwd1x1S1P0>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_xdl_f32_instances<GNHWC,
+                                                                               GKYXC,
+                                                                               Empty_Tuple,
+                                                                               GNHWK,
+                                                                               Empty_Tuple,
+                                                                               PassThrough,
+                                                                               ConvFwdOddC>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_int8_instance.cpp
deleted file mode 100644
index 59b012134..000000000
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_int8_instance.cpp
+++ /dev/null
@@ -1,125 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <cstdlib>
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
-#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-using Empty_Tuple = ck::Tuple<>;
-
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using GNHWC = ck::tensor_layout::convolution::GNHWC;
-using GKYXC = ck::tensor_layout::convolution::GKYXC;
-using GNHWK = ck::tensor_layout::convolution::GNHWK;
-
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-static constexpr auto ConvFwdDefault =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
-
-static constexpr auto ConvFwd1x1P0 =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0;
-
-static constexpr auto ConvFwd1x1S1P0 =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0;
-
-static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-
-// Compilation parameters for in[g, n, hi, wi, c] * wei[g, k, y, x, c] = out[g, n, ho, wo, k]
-using device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_int8_instances = std::tuple<
-    // clang-format off
-        // Default
-        //########################################|  NumDim|      A|      B|          Ds|      E|  AData|  BData| AccData| CShuffle|          Ds|  EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //########################################| Spatial| Layout| Layout|      Layout| Layout|   Type|   Type|    Type| DataType|    DataType|   Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //########################################|        |       |       |            |       |       |       |        |         |            |       |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //########################################|        |       |       |            |       |       |       |        |         |            |       |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-
-        // Filter1x1Pad0
-        //########################################|  NumDim|      A|      B|          Ds|      E|  AData|  BData| AccData| CShuffle|          Ds|  EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //########################################| Spatial| Layout| Layout|      Layout| Layout|   Type|   Type|    Type| DataType|    DataType|   Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //########################################|        |       |       |            |       |       |       |        |         |            |       |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //########################################|        |       |       |            |       |       |       |        |         |            |       |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-                                                                                                                                                                                                                
-        // Filter1x1Stride1Pad0                                                                                                                                                                                 
-        //########################################|  NumDim|      A|      B|          Ds|      E|  AData|  BData| AccData| CShuffle|          Ds|  EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //########################################| Spatial| Layout| Layout|      Layout| Layout|   Type|   Type|    Type| DataType|    DataType|   Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //########################################|        |       |       |            |       |       |       |        |         |            |       |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //########################################|        |       |       |            |       |       |       |        |         |            |       |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>
-    // clang-format on
-    >;
-
-void add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
-                                                              GKYXC,
-                                                              Empty_Tuple,
-                                                              GNHWK,
-                                                              int8_t,
-                                                              int8_t,
-                                                              Empty_Tuple,
-                                                              int8_t,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances)
-{
-    add_device_operation_instances(
-        instances, device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_int8_instances{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_instance.hpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_instance.hpp
new file mode 100644
index 000000000..2858671ee
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_instance.hpp
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
+#include "device_grouped_conv2d_fwd_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          typename DsDatatype,
+          typename CDEElementOp,
+          ConvolutionForwardSpecialization ConvSpec>
+using device_grouped_conv2d_fwd_xdl_f16_instances =
+    std::tuple<
+        // clang-format off
+        //########################################|  NumDim|       A|       B|       Ds|       E| AData| BData| AccData| CShuffle|         Ds| EData|           A|           B|          CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //########################################| Spatial|  Layout|  Layout|   Layout|  Layout|  Type|  Type|    Type| DataType|   DataType|  Type| Elementwise| Elementwise|  Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //########################################|        |        |        |         |        |      |      |        |         |           |      |   Operation|   Operation|    Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        // Rows differ only in Block Size, MPer/NPer Block, MXdl/NXdl Per Wave and the A/B/C thread-cluster lengths; every other argument is identical in all rows.
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F16,   F16,     F32,      F16, DsDatatype,   F16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F16,   F16,     F32,      F16, DsDatatype,   F16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F16,   F16,     F32,      F16, DsDatatype,   F16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F16,   F16,     F32,      F16, DsDatatype,   F16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F16,   F16,     F32,      F16, DsDatatype,   F16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F16,   F16,     F32,      F16, DsDatatype,   F16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F16,   F16,     F32,      F16, DsDatatype,   F16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F16,   F16,     F32,      F16, DsDatatype,   F16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F16,   F16,     F32,      F16, DsDatatype,   F16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F16,   F16,     F32,      F16, DsDatatype,   F16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F16,   F16,     F32,      F16, DsDatatype,   F16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F16,   F16,     F32,      F16, DsDatatype,   F16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F16,   F16,     F32,      F16, DsDatatype,   F16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>
+        // clang-format on
+        >;
+
+template <typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          typename DsDatatype,
+          typename CDEElementOp,
+          ConvolutionForwardSpecialization ConvSpec>
+using device_grouped_conv2d_fwd_xdl_bf16_instances =
+    std::tuple<
+        // clang-format off
+        //########################################|  NumDim|       A|       B|       Ds|       E| AData| BData| AccData| CShuffle|         Ds| EData|           A|           B|          CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //########################################| Spatial|  Layout|  Layout|   Layout|  Layout|  Type|  Type|    Type| DataType|   DataType|  Type| Elementwise| Elementwise|  Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //########################################|        |        |        |         |        |      |      |        |         |           |      |   Operation|   Operation|    Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        // Rows differ only in Block Size, MPer/NPer Block, MXdl/NXdl Per Wave and the A/B/C thread-cluster lengths; every other argument is identical in all rows.
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, DsDatatype,  BF16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, DsDatatype,  BF16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, DsDatatype,  BF16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, DsDatatype,  BF16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, DsDatatype,  BF16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, DsDatatype,  BF16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, DsDatatype,  BF16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, DsDatatype,  BF16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, DsDatatype,  BF16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, DsDatatype,  BF16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, DsDatatype,  BF16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, DsDatatype,  BF16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, DsDatatype,  BF16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>
+        // clang-format on
+        >;
+
+template <typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          typename DsDatatype,
+          typename CDEElementOp,
+          ConvolutionForwardSpecialization ConvSpec>
+using device_grouped_conv2d_fwd_xdl_f32_instances =
+    std::tuple<
+        // clang-format off
+        //########################################|  NumDim|       A|       B|       Ds|       E| AData| BData| AccData| CShuffle|         Ds| EData|           A|           B|          CDE|    ConvForward|            GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //########################################| Spatial|  Layout|  Layout|   Layout|  Layout|  Type|  Type|    Type| DataType|   DataType|  Type| Elementwise| Elementwise|  Elementwise| Specialization|  Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //########################################|        |        |        |         |        |      |      |        |         |           |      |   Operation|   Operation|    Operation|               |                |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //########################################|        |        |        |         |        |      |      |        |         |           |      |            |            |             |               |                |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F32,   F32,     F32,      F32, DsDatatype,   F32, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F32,   F32,     F32,      F32, DsDatatype,   F32, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   256,    16,   4,   4,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F32,   F32,     F32,      F32, DsDatatype,   F32, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   128,   128,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F32,   F32,     F32,      F32, DsDatatype,   F32, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F32,   F32,     F32,      F32, DsDatatype,   F32, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   128,   128,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F32,   F32,     F32,      F32, DsDatatype,   F32, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   128,    64,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F32,   F32,     F32,      F32, DsDatatype,   F32, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F32,   F32,     F32,      F32, DsDatatype,   F32, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,    64,    16,   4,   4,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F32,   F32,     F32,      F32, DsDatatype,   F32, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F32,   F32,     F32,      F32, DsDatatype,   F32, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   128,   128,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F32,   F32,     F32,      F32, DsDatatype,   F32, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   128,    32,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F32,   F32,     F32,      F32, DsDatatype,   F32, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F32,   F32,     F32,      F32, DsDatatype,   F32, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,    64,    32,    64,    16,   4,   4,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>
+        // clang-format on
+        >;
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
new file mode 100644
index 000000000..25caf61df
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "device_grouped_conv2d_fwd_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Adds bf16 XDL instances for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
+                                                              NHWGC,
+                                                              GKYXC,
+                                                              Empty_Tuple,
+                                                              NHWGK,
+                                                              BF16,
+                                                              BF16,
+                                                              Empty_Tuple,
+                                                              BF16,
+                                                              PassThrough,
+                                                              PassThrough,
+                                                              PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_xdl_bf16_instances<NHWGC,
+                                                                                GKYXC,
+                                                                                Empty_Tuple,
+                                                                                NHWGK,
+                                                                                Empty_Tuple,
+                                                                                PassThrough,
+                                                                                ConvFwdDefault>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_xdl_bf16_instances<NHWGC,
+                                                                                GKYXC,
+                                                                                Empty_Tuple,
+                                                                                NHWGK,
+                                                                                Empty_Tuple,
+                                                                                PassThrough,
+                                                                                ConvFwd1x1P0>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_xdl_bf16_instances<NHWGC,
+                                                                                GKYXC,
+                                                                                Empty_Tuple,
+                                                                                NHWGK,
+                                                                                Empty_Tuple,
+                                                                                PassThrough,
+                                                                                ConvFwd1x1S1P0>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_xdl_bf16_instances<NHWGC,
+                                                                                GKYXC,
+                                                                                Empty_Tuple,
+                                                                                NHWGK,
+                                                                                Empty_Tuple,
+                                                                                PassThrough,
+                                                                                ConvFwdOddC>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
index 8aca73043..b997cfb67 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
@@ -1,137 +1,14 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 
-#include <cstdlib>
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
-#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "device_grouped_conv2d_fwd_xdl_instance.hpp"
 
 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
-
-using F16 = ck::half_t;
-using F32 = float;
-
-using Empty_Tuple = ck::Tuple<>;
-
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using NHWGC = ck::tensor_layout::convolution::NHWGC;
-using GKYXC = ck::tensor_layout::convolution::GKYXC;
-using NHWGK = ck::tensor_layout::convolution::NHWGK;
-
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-static constexpr auto ConvFwdDefault =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
-
-static constexpr auto ConvFwd1x1P0 =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0;
-
-static constexpr auto ConvFwd1x1S1P0 =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0;
-
-static constexpr auto ConvFwdOddC =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::OddC;
-
-static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-
 // Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-using device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instances =
-    std::tuple<
-        // clang-format off
-        // Default
-        //########################################|  NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //########################################| Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-
-        // Filter1x1Pad0
-        //########################################|  NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //########################################| Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-
-        // Filter1x1Stride1Pad0
-        //########################################|  NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //########################################| Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-
-        // OddC
-        //########################################|  NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //########################################| Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4,  2, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  2, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4,  2, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  2, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4,  2, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  2, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<2, 32, 4>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<2, 32, 4>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<2, 32, 4>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<2, 32, 4>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   256,   256,    64,    32,   8,   8,   32,   32,    4,    1,     S<2, 32, 4>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<2, 32, 4>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<2, 16, 4>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<2, 16, 4>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  NHWGC,  GKYXC, Empty_Tuple,  NHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   128,    64,    64,    32,   8,   8,   32,   32,    1,    2,     S<2, 16, 4>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<2, 16, 4>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 4>,               8>
-        // clang-format on
-        >;
-
 void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
                                                               NHWGC,
@@ -147,7 +24,40 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instances(
                                                               PassThrough>>>& instances)
 {
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instances{});
+                                   device_grouped_conv2d_fwd_xdl_f16_instances<NHWGC,
+                                                                               GKYXC,
+                                                                               Empty_Tuple,
+                                                                               NHWGK,
+                                                                               Empty_Tuple,
+                                                                               PassThrough,
+                                                                               ConvFwdDefault>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_xdl_f16_instances<NHWGC,
+                                                                               GKYXC,
+                                                                               Empty_Tuple,
+                                                                               NHWGK,
+                                                                               Empty_Tuple,
+                                                                               PassThrough,
+                                                                               ConvFwd1x1P0>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_xdl_f16_instances<NHWGC,
+                                                                               GKYXC,
+                                                                               Empty_Tuple,
+                                                                               NHWGK,
+                                                                               Empty_Tuple,
+                                                                               PassThrough,
+                                                                               ConvFwd1x1S1P0>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_xdl_f16_instances<NHWGC,
+                                                                               GKYXC,
+                                                                               Empty_Tuple,
+                                                                               NHWGK,
+                                                                               Empty_Tuple,
+                                                                               PassThrough,
+                                                                               ConvFwdOddC>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
new file mode 100644
index 000000000..3256a2a82
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "device_grouped_conv2d_fwd_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Adds the F32 XDL instances for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
+                                                              NHWGC,
+                                                              GKYXC,
+                                                              Empty_Tuple,
+                                                              NHWGK,
+                                                              F32,
+                                                              F32,
+                                                              Empty_Tuple,
+                                                              F32,
+                                                              PassThrough,
+                                                              PassThrough,
+                                                              PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_xdl_f32_instances<NHWGC,
+                                                                               GKYXC,
+                                                                               Empty_Tuple,
+                                                                               NHWGK,
+                                                                               Empty_Tuple,
+                                                                               PassThrough,
+                                                                               ConvFwdDefault>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_xdl_f32_instances<NHWGC,
+                                                                               GKYXC,
+                                                                               Empty_Tuple,
+                                                                               NHWGK,
+                                                                               Empty_Tuple,
+                                                                               PassThrough,
+                                                                               ConvFwd1x1P0>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_xdl_f32_instances<NHWGC,
+                                                                               GKYXC,
+                                                                               Empty_Tuple,
+                                                                               NHWGK,
+                                                                               Empty_Tuple,
+                                                                               PassThrough,
+                                                                               ConvFwd1x1S1P0>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_xdl_f32_instances<NHWGC,
+                                                                               GKYXC,
+                                                                               Empty_Tuple,
+                                                                               NHWGK,
+                                                                               Empty_Tuple,
+                                                                               PassThrough,
+                                                                               ConvFwdOddC>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/conv2d_quantization_common.hpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/conv2d_quantization_common.hpp
index b231f8c95..672cdba65 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/conv2d_quantization_common.hpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/conv2d_quantization_common.hpp
@@ -19,9 +19,9 @@ using Empty_Tuple = ck::Tuple<>;
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
 
-using GNHWC       = ck::tensor_layout::convolution::GNHWC;
+using NHWGC       = ck::tensor_layout::convolution::NHWGC;
 using GKYXC       = ck::tensor_layout::convolution::GKYXC;
-using GNHWK       = ck::tensor_layout::convolution::GNHWK;
+using NHWGK       = ck::tensor_layout::convolution::NHWGK;
 using GK          = ck::tensor_layout::convolution::G_K;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 using Relu        = ck::tensor_operation::element_wise::Relu;
diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perchannel_quantization_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perchannel_quantization_int8_instance.cpp
index ae5c1d7c3..d4b5484d8 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perchannel_quantization_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perchannel_quantization_int8_instance.cpp
@@ -9,10 +9,10 @@ namespace device {
 namespace instance {
 void add_device_conv2d_dl_bias_perchannel_quantization_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
-                                                              GNHWC,
+                                                              NHWGC,
                                                               GKYXC,
                                                               GK_GK_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                               int8_t,
                                                               int8_t,
                                                               I32_F32_Tuple,
@@ -23,19 +23,28 @@ void add_device_conv2d_dl_bias_perchannel_quantization_int8_instances(
 {
     // dl
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_dl_int8_instances<GK_GK_Tuple,
+                                   device_grouped_conv2d_dl_int8_instances<NHWGC,
+                                                                           GKYXC,
+                                                                           GK_GK_Tuple,
+                                                                           NHWGK,
                                                                            I32_F32_Tuple,
                                                                            Add_Mul2_Clamp,
                                                                            ConvFwdDefault,
                                                                            4>{});
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_dl_int8_instances<GK_GK_Tuple,
+                                   device_grouped_conv2d_dl_int8_instances<NHWGC,
+                                                                           GKYXC,
+                                                                           GK_GK_Tuple,
+                                                                           NHWGK,
                                                                            I32_F32_Tuple,
                                                                            Add_Mul2_Clamp,
                                                                            ConvFwd1x1P0,
                                                                            4>{});
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_dl_int8_instances<GK_GK_Tuple,
+                                   device_grouped_conv2d_dl_int8_instances<NHWGC,
+                                                                           GKYXC,
+                                                                           GK_GK_Tuple,
+                                                                           NHWGK,
                                                                            I32_F32_Tuple,
                                                                            Add_Mul2_Clamp,
                                                                            ConvFwd1x1S1P0,
@@ -44,10 +53,10 @@ void add_device_conv2d_dl_bias_perchannel_quantization_int8_instances(
 
 void add_device_conv2d_dl_bias_relu_perchannel_quantization_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
-                                                              GNHWC,
+                                                              NHWGC,
                                                               GKYXC,
                                                               GK_GK_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                               int8_t,
                                                               int8_t,
                                                               I32_F32_Tuple,
@@ -58,19 +67,28 @@ void add_device_conv2d_dl_bias_relu_perchannel_quantization_int8_instances(
 {
     // dl
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_dl_int8_instances<GK_GK_Tuple,
+                                   device_grouped_conv2d_dl_int8_instances<NHWGC,
+                                                                           GKYXC,
+                                                                           GK_GK_Tuple,
+                                                                           NHWGK,
                                                                            I32_F32_Tuple,
                                                                            Add_Relu_Mul2_Clamp,
                                                                            ConvFwdDefault,
                                                                            4>{});
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_dl_int8_instances<GK_GK_Tuple,
+                                   device_grouped_conv2d_dl_int8_instances<NHWGC,
+                                                                           GKYXC,
+                                                                           GK_GK_Tuple,
+                                                                           NHWGK,
                                                                            I32_F32_Tuple,
                                                                            Add_Relu_Mul2_Clamp,
                                                                            ConvFwd1x1P0,
                                                                            4>{});
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_dl_int8_instances<GK_GK_Tuple,
+                                   device_grouped_conv2d_dl_int8_instances<NHWGC,
+                                                                           GKYXC,
+                                                                           GK_GK_Tuple,
+                                                                           NHWGK,
                                                                            I32_F32_Tuple,
                                                                            Add_Relu_Mul2_Clamp,
                                                                            ConvFwd1x1S1P0,
@@ -79,10 +97,10 @@ void add_device_conv2d_dl_bias_relu_perchannel_quantization_int8_instances(
 
 void add_device_conv2d_dl_bias_tanh_perchannel_quantization_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
-                                                              GNHWC,
+                                                              NHWGC,
                                                               GKYXC,
                                                               GK_GK_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                               int8_t,
                                                               int8_t,
                                                               I32_F32_Tuple,
@@ -93,19 +111,28 @@ void add_device_conv2d_dl_bias_tanh_perchannel_quantization_int8_instances(
 {
     // dl
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_dl_int8_instances<GK_GK_Tuple,
+                                   device_grouped_conv2d_dl_int8_instances<NHWGC,
+                                                                           GKYXC,
+                                                                           GK_GK_Tuple,
+                                                                           NHWGK,
                                                                            I32_F32_Tuple,
                                                                            Add_Mul2_TanH_Mul_Clamp,
                                                                            ConvFwdDefault,
                                                                            4>{});
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_dl_int8_instances<GK_GK_Tuple,
+                                   device_grouped_conv2d_dl_int8_instances<NHWGC,
+                                                                           GKYXC,
+                                                                           GK_GK_Tuple,
+                                                                           NHWGK,
                                                                            I32_F32_Tuple,
                                                                            Add_Mul2_TanH_Mul_Clamp,
                                                                            ConvFwd1x1P0,
                                                                            4>{});
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_dl_int8_instances<GK_GK_Tuple,
+                                   device_grouped_conv2d_dl_int8_instances<NHWGC,
+                                                                           GKYXC,
+                                                                           GK_GK_Tuple,
+                                                                           NHWGK,
                                                                            I32_F32_Tuple,
                                                                            Add_Mul2_TanH_Mul_Clamp,
                                                                            ConvFwd1x1S1P0,
diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perlayer_quantization_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perlayer_quantization_int8_instance.cpp
index 192d5c9a5..7db4b8d86 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perlayer_quantization_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perlayer_quantization_int8_instance.cpp
@@ -9,10 +9,10 @@ namespace device {
 namespace instance {
 void add_device_conv2d_dl_bias_perlayer_quantization_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
-                                                              GNHWC,
+                                                              NHWGC,
                                                               GKYXC,
                                                               GK_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                               int8_t,
                                                               int8_t,
                                                               I32_Tuple,
@@ -22,19 +22,28 @@ void add_device_conv2d_dl_bias_perlayer_quantization_int8_instances(
                                                               Add_Mul_Clamp>>>& instances)
 {
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_dl_int8_instances<GK_Tuple,
+                                   device_grouped_conv2d_dl_int8_instances<NHWGC,
+                                                                           GKYXC,
+                                                                           GK_Tuple,
+                                                                           NHWGK,
                                                                            I32_Tuple,
                                                                            Add_Mul_Clamp,
                                                                            ConvFwdDefault,
                                                                            4>{});
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_dl_int8_instances<GK_Tuple,
+                                   device_grouped_conv2d_dl_int8_instances<NHWGC,
+                                                                           GKYXC,
+                                                                           GK_Tuple,
+                                                                           NHWGK,
                                                                            I32_Tuple,
                                                                            Add_Mul_Clamp,
                                                                            ConvFwd1x1P0,
                                                                            4>{});
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_dl_int8_instances<GK_Tuple,
+                                   device_grouped_conv2d_dl_int8_instances<NHWGC,
+                                                                           GKYXC,
+                                                                           GK_Tuple,
+                                                                           NHWGK,
                                                                            I32_Tuple,
                                                                            Add_Mul_Clamp,
                                                                            ConvFwd1x1S1P0,
@@ -43,10 +52,10 @@ void add_device_conv2d_dl_bias_perlayer_quantization_int8_instances(
 
 void add_device_conv2d_dl_bias_relu_perlayer_quantization_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
-                                                              GNHWC,
+                                                              NHWGC,
                                                               GKYXC,
                                                               GK_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                               int8_t,
                                                               int8_t,
                                                               I32_Tuple,
@@ -56,21 +65,30 @@ void add_device_conv2d_dl_bias_relu_perlayer_quantization_int8_instances(
                                                               Add_Relu_Mul_Clamp>>>& instances)
 {
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_dl_int8_instances<GK_Tuple,
+                                   device_grouped_conv2d_dl_int8_instances<NHWGC,
+                                                                           GKYXC,
+                                                                           GK_Tuple,
+                                                                           NHWGK,
                                                                            I32_Tuple,
                                                                            Add_Relu_Mul_Clamp,
                                                                            ConvFwdDefault,
                                                                            4>{});
 
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_dl_int8_instances<GK_Tuple,
+                                   device_grouped_conv2d_dl_int8_instances<NHWGC,
+                                                                           GKYXC,
+                                                                           GK_Tuple,
+                                                                           NHWGK,
                                                                            I32_Tuple,
                                                                            Add_Relu_Mul_Clamp,
                                                                            ConvFwd1x1P0,
                                                                            4>{});
 
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_dl_int8_instances<GK_Tuple,
+                                   device_grouped_conv2d_dl_int8_instances<NHWGC,
+                                                                           GKYXC,
+                                                                           GK_Tuple,
+                                                                           NHWGK,
                                                                            I32_Tuple,
                                                                            Add_Relu_Mul_Clamp,
                                                                            ConvFwd1x1S1P0,
@@ -79,10 +97,10 @@ void add_device_conv2d_dl_bias_relu_perlayer_quantization_int8_instances(
 
 void add_device_conv2d_dl_bias_tanh_perlayer_quantization_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
-                                                              GNHWC,
+                                                              NHWGC,
                                                               GKYXC,
                                                               GK_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                               int8_t,
                                                               int8_t,
                                                               I32_Tuple,
@@ -92,21 +110,30 @@ void add_device_conv2d_dl_bias_tanh_perlayer_quantization_int8_instances(
                                                               Add_Mul_TanH_Mul_Clamp>>>& instances)
 {
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_dl_int8_instances<GK_Tuple,
+                                   device_grouped_conv2d_dl_int8_instances<NHWGC,
+                                                                           GKYXC,
+                                                                           GK_Tuple,
+                                                                           NHWGK,
                                                                            I32_Tuple,
                                                                            Add_Mul_TanH_Mul_Clamp,
                                                                            ConvFwdDefault,
                                                                            4>{});
 
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_dl_int8_instances<GK_Tuple,
+                                   device_grouped_conv2d_dl_int8_instances<NHWGC,
+                                                                           GKYXC,
+                                                                           GK_Tuple,
+                                                                           NHWGK,
                                                                            I32_Tuple,
                                                                            Add_Mul_TanH_Mul_Clamp,
                                                                            ConvFwd1x1P0,
                                                                            4>{});
 
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_dl_int8_instances<GK_Tuple,
+                                   device_grouped_conv2d_dl_int8_instances<NHWGC,
+                                                                           GKYXC,
+                                                                           GK_Tuple,
+                                                                           NHWGK,
                                                                            I32_Tuple,
                                                                            Add_Mul_TanH_Mul_Clamp,
                                                                            ConvFwd1x1S1P0,
diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_int8_instance.hpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_int8_instance.hpp
index 3c4987f15..7eefbe038 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_int8_instance.hpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_int8_instance.hpp
@@ -12,7 +12,10 @@ namespace device {
 namespace instance {
 
 // clang-format off
-template <typename DsLayout,
+template <typename InLayout,
+          typename WeiLayout,
+          typename DsLayout,
+          typename OutLayout,
           typename DsDatatype,
           typename OutElementOp,
           ConvolutionForwardSpecialization ConvSpec,
@@ -23,7 +26,7 @@ using device_grouped_conv2d_dl_int8_instances =
         // ###########################################|     Spatial|   Type|    Type|       Type|        Type|        Type|         |          |      Layout|          |  Elementwise|   Elementwise|   Elementwise|        Forward|    Spacialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
         // ###########################################|            |       |        |           |            |            |         |          |            |          |    Operation|     Operation|     Operation| Specialization|                  |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
         // ###########################################|            |       |        |           |            |            |         |          |            |          |             |              |              |               |                  |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                    |                   |                     |               |               |                    |                   |                    |                    |                |                   |
-        DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK< NDimSpatial, int8_t,  int8_t, DsDatatype,      int8_t,     int32_t,    GNHWC,     GKYXC,    DsLayout,     GNHWK,  PassThrough,   PassThrough,  OutElementOp,       ConvSpec,          GemmSpec,   256,   128,   128,    16,  4,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,       S<1, 1, 1, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,       S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,               5, DstScalarPerVector>
+        DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK< NDimSpatial, int8_t,  int8_t, DsDatatype,      int8_t,     int32_t, InLayout, WeiLayout,    DsLayout, OutLayout,  PassThrough,   PassThrough,  OutElementOp,       ConvSpec,          GemmSpec,   256,   128,   128,    16,  4,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,       S<1, 1, 1, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,       S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,               5, DstScalarPerVector>
     >;
 // clang-format on
 
diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_perchannel_quantization_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_perchannel_quantization_int8_instance.cpp
index d45c1c1ee..c8f5f7042 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_perchannel_quantization_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_perchannel_quantization_int8_instance.cpp
@@ -9,10 +9,10 @@ namespace device {
 namespace instance {
 void add_device_conv2d_dl_perchannel_quantization_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
-                                                              GNHWC,
+                                                              NHWGC,
                                                               GKYXC,
                                                               GK_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                               int8_t,
                                                               int8_t,
                                                               F32_Tuple,
@@ -22,19 +22,28 @@ void add_device_conv2d_dl_perchannel_quantization_int8_instances(
                                                               Mul2_Clamp>>>& instances)
 {
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_dl_int8_instances<GK_Tuple,
+                                   device_grouped_conv2d_dl_int8_instances<NHWGC,
+                                                                           GKYXC,
+                                                                           GK_Tuple,
+                                                                           NHWGK,
                                                                            F32_Tuple,
                                                                            Mul2_Clamp,
                                                                            ConvFwdDefault,
                                                                            4>{});
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_dl_int8_instances<GK_Tuple,
+                                   device_grouped_conv2d_dl_int8_instances<NHWGC,
+                                                                           GKYXC,
+                                                                           GK_Tuple,
+                                                                           NHWGK,
                                                                            F32_Tuple,
                                                                            Mul2_Clamp,
                                                                            ConvFwd1x1P0,
                                                                            4>{});
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_dl_int8_instances<GK_Tuple,
+                                   device_grouped_conv2d_dl_int8_instances<NHWGC,
+                                                                           GKYXC,
+                                                                           GK_Tuple,
+                                                                           NHWGK,
                                                                            F32_Tuple,
                                                                            Mul2_Clamp,
                                                                            ConvFwd1x1S1P0,
@@ -43,10 +52,10 @@ void add_device_conv2d_dl_perchannel_quantization_int8_instances(
 
 void add_device_conv2d_dl_relu_perchannel_quantization_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
-                                                              GNHWC,
+                                                              NHWGC,
                                                               GKYXC,
                                                               GK_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                               int8_t,
                                                               int8_t,
                                                               F32_Tuple,
@@ -56,19 +65,28 @@ void add_device_conv2d_dl_relu_perchannel_quantization_int8_instances(
                                                               Relu_Mul2_Clamp>>>& instances)
 {
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_dl_int8_instances<GK_Tuple,
+                                   device_grouped_conv2d_dl_int8_instances<NHWGC,
+                                                                           GKYXC,
+                                                                           GK_Tuple,
+                                                                           NHWGK,
                                                                            F32_Tuple,
                                                                            Relu_Mul2_Clamp,
                                                                            ConvFwdDefault,
                                                                            4>{});
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_dl_int8_instances<GK_Tuple,
+                                   device_grouped_conv2d_dl_int8_instances<NHWGC,
+                                                                           GKYXC,
+                                                                           GK_Tuple,
+                                                                           NHWGK,
                                                                            F32_Tuple,
                                                                            Relu_Mul2_Clamp,
                                                                            ConvFwd1x1P0,
                                                                            4>{});
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_dl_int8_instances<GK_Tuple,
+                                   device_grouped_conv2d_dl_int8_instances<NHWGC,
+                                                                           GKYXC,
+                                                                           GK_Tuple,
+                                                                           NHWGK,
                                                                            F32_Tuple,
                                                                            Relu_Mul2_Clamp,
                                                                            ConvFwd1x1S1P0,
diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_perlayer_quantization_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_perlayer_quantization_int8_instance.cpp
index f4b947950..d7f7384ff 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_perlayer_quantization_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_perlayer_quantization_int8_instance.cpp
@@ -9,10 +9,10 @@ namespace device {
 namespace instance {
 void add_device_conv2d_dl_perlayer_quantization_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
-                                                              GNHWC,
+                                                              NHWGC,
                                                               GKYXC,
                                                               Empty_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                               int8_t,
                                                               int8_t,
                                                               Empty_Tuple,
@@ -22,19 +22,28 @@ void add_device_conv2d_dl_perlayer_quantization_int8_instances(
                                                               Mul_Clamp>>>& instances)
 {
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_dl_int8_instances<Empty_Tuple,
+                                   device_grouped_conv2d_dl_int8_instances<NHWGC,
+                                                                           GKYXC,
+                                                                           Empty_Tuple,
+                                                                           NHWGK,
                                                                            Empty_Tuple,
                                                                            Mul_Clamp,
                                                                            ConvFwdDefault,
                                                                            4>{});
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_dl_int8_instances<Empty_Tuple,
+                                   device_grouped_conv2d_dl_int8_instances<NHWGC,
+                                                                           GKYXC,
+                                                                           Empty_Tuple,
+                                                                           NHWGK,
                                                                            Empty_Tuple,
                                                                            Mul_Clamp,
                                                                            ConvFwd1x1P0,
                                                                            4>{});
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_dl_int8_instances<Empty_Tuple,
+                                   device_grouped_conv2d_dl_int8_instances<NHWGC,
+                                                                           GKYXC,
+                                                                           Empty_Tuple,
+                                                                           NHWGK,
                                                                            Empty_Tuple,
                                                                            Mul_Clamp,
                                                                            ConvFwd1x1S1P0,
@@ -43,10 +52,10 @@ void add_device_conv2d_dl_perlayer_quantization_int8_instances(
 
 void add_device_conv2d_dl_relu_perlayer_quantization_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
-                                                              GNHWC,
+                                                              NHWGC,
                                                               GKYXC,
                                                               Empty_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                               int8_t,
                                                               int8_t,
                                                               Empty_Tuple,
@@ -56,19 +65,28 @@ void add_device_conv2d_dl_relu_perlayer_quantization_int8_instances(
                                                               Relu_Mul_Clamp>>>& instances)
 {
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_dl_int8_instances<Empty_Tuple,
+                                   device_grouped_conv2d_dl_int8_instances<NHWGC,
+                                                                           GKYXC,
+                                                                           Empty_Tuple,
+                                                                           NHWGK,
                                                                            Empty_Tuple,
                                                                            Relu_Mul_Clamp,
                                                                            ConvFwdDefault,
                                                                            4>{});
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_dl_int8_instances<Empty_Tuple,
+                                   device_grouped_conv2d_dl_int8_instances<NHWGC,
+                                                                           GKYXC,
+                                                                           Empty_Tuple,
+                                                                           NHWGK,
                                                                            Empty_Tuple,
                                                                            Relu_Mul_Clamp,
                                                                            ConvFwd1x1P0,
                                                                            4>{});
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_dl_int8_instances<Empty_Tuple,
+                                   device_grouped_conv2d_dl_int8_instances<NHWGC,
+                                                                           GKYXC,
+                                                                           Empty_Tuple,
+                                                                           NHWGK,
                                                                            Empty_Tuple,
                                                                            Relu_Mul_Clamp,
                                                                            ConvFwd1x1S1P0,
diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp
index b6e8ee159..658aa8370 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp
@@ -9,10 +9,10 @@ namespace device {
 namespace instance {
 void add_device_conv2d_xdl_bias_perchannel_quantization_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
-                                                              GNHWC,
+                                                              NHWGC,
                                                               GKYXC,
                                                               GK_GK_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                               int8_t,
                                                               int8_t,
                                                               I32_F32_Tuple,
@@ -22,19 +22,28 @@ void add_device_conv2d_xdl_bias_perchannel_quantization_int8_instances(
                                                               Add_Mul2_Clamp>>>& instances)
 {
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_xdl_int8_instances<GK_GK_Tuple,
+                                   device_grouped_conv2d_xdl_int8_instances<NHWGC,
+                                                                            GKYXC,
+                                                                            GK_GK_Tuple,
+                                                                            NHWGK,
                                                                             I32_F32_Tuple,
                                                                             Add_Mul2_Clamp,
                                                                             ConvFwdDefault,
                                                                             8>{});
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_xdl_int8_instances<GK_GK_Tuple,
+                                   device_grouped_conv2d_xdl_int8_instances<NHWGC,
+                                                                            GKYXC,
+                                                                            GK_GK_Tuple,
+                                                                            NHWGK,
                                                                             I32_F32_Tuple,
                                                                             Add_Mul2_Clamp,
                                                                             ConvFwd1x1P0,
                                                                             8>{});
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_xdl_int8_instances<GK_GK_Tuple,
+                                   device_grouped_conv2d_xdl_int8_instances<NHWGC,
+                                                                            GKYXC,
+                                                                            GK_GK_Tuple,
+                                                                            NHWGK,
                                                                             I32_F32_Tuple,
                                                                             Add_Mul2_Clamp,
                                                                             ConvFwd1x1S1P0,
@@ -43,10 +52,10 @@ void add_device_conv2d_xdl_bias_perchannel_quantization_int8_instances(
 
 void add_device_conv2d_xdl_bias_relu_perchannel_quantization_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
-                                                              GNHWC,
+                                                              NHWGC,
                                                               GKYXC,
                                                               GK_GK_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                               int8_t,
                                                               int8_t,
                                                               I32_F32_Tuple,
@@ -56,19 +65,28 @@ void add_device_conv2d_xdl_bias_relu_perchannel_quantization_int8_instances(
                                                               Add_Relu_Mul2_Clamp>>>& instances)
 {
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_xdl_int8_instances<GK_GK_Tuple,
+                                   device_grouped_conv2d_xdl_int8_instances<NHWGC,
+                                                                            GKYXC,
+                                                                            GK_GK_Tuple,
+                                                                            NHWGK,
                                                                             I32_F32_Tuple,
                                                                             Add_Relu_Mul2_Clamp,
                                                                             ConvFwdDefault,
                                                                             8>{});
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_xdl_int8_instances<GK_GK_Tuple,
+                                   device_grouped_conv2d_xdl_int8_instances<NHWGC,
+                                                                            GKYXC,
+                                                                            GK_GK_Tuple,
+                                                                            NHWGK,
                                                                             I32_F32_Tuple,
                                                                             Add_Relu_Mul2_Clamp,
                                                                             ConvFwd1x1P0,
                                                                             8>{});
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_xdl_int8_instances<GK_GK_Tuple,
+                                   device_grouped_conv2d_xdl_int8_instances<NHWGC,
+                                                                            GKYXC,
+                                                                            GK_GK_Tuple,
+                                                                            NHWGK,
                                                                             I32_F32_Tuple,
                                                                             Add_Relu_Mul2_Clamp,
                                                                             ConvFwd1x1S1P0,
@@ -77,10 +95,10 @@ void add_device_conv2d_xdl_bias_relu_perchannel_quantization_int8_instances(
 
 void add_device_conv2d_xdl_bias_tanh_perchannel_quantization_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
-                                                              GNHWC,
+                                                              NHWGC,
                                                               GKYXC,
                                                               GK_GK_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                               int8_t,
                                                               int8_t,
                                                               I32_F32_Tuple,
@@ -90,19 +108,28 @@ void add_device_conv2d_xdl_bias_tanh_perchannel_quantization_int8_instances(
                                                               Add_Mul2_TanH_Mul_Clamp>>>& instances)
 {
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_xdl_int8_instances<GK_GK_Tuple,
+                                   device_grouped_conv2d_xdl_int8_instances<NHWGC,
+                                                                            GKYXC,
+                                                                            GK_GK_Tuple,
+                                                                            NHWGK,
                                                                             I32_F32_Tuple,
                                                                             Add_Mul2_TanH_Mul_Clamp,
                                                                             ConvFwdDefault,
                                                                             8>{});
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_xdl_int8_instances<GK_GK_Tuple,
+                                   device_grouped_conv2d_xdl_int8_instances<NHWGC,
+                                                                            GKYXC,
+                                                                            GK_GK_Tuple,
+                                                                            NHWGK,
                                                                             I32_F32_Tuple,
                                                                             Add_Mul2_TanH_Mul_Clamp,
                                                                             ConvFwd1x1P0,
                                                                             8>{});
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_xdl_int8_instances<GK_GK_Tuple,
+                                   device_grouped_conv2d_xdl_int8_instances<NHWGC,
+                                                                            GKYXC,
+                                                                            GK_GK_Tuple,
+                                                                            NHWGK,
                                                                             I32_F32_Tuple,
                                                                             Add_Mul2_TanH_Mul_Clamp,
                                                                             ConvFwd1x1S1P0,
diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp
index 70f92cec3..7102e9b25 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp
@@ -9,10 +9,10 @@ namespace device {
 namespace instance {
 void add_device_conv2d_xdl_bias_perlayer_quantization_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
-                                                              GNHWC,
+                                                              NHWGC,
                                                               GKYXC,
                                                               GK_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                               int8_t,
                                                               int8_t,
                                                               I32_Tuple,
@@ -22,19 +22,28 @@ void add_device_conv2d_xdl_bias_perlayer_quantization_int8_instances(
                                                               Add_Mul_Clamp>>>& instances)
 {
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
+                                   device_grouped_conv2d_xdl_int8_instances<NHWGC,
+                                                                            GKYXC,
+                                                                            GK_Tuple,
+                                                                            NHWGK,
                                                                             I32_Tuple,
                                                                             Add_Mul_Clamp,
                                                                             ConvFwdDefault,
                                                                             8>{});
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
+                                   device_grouped_conv2d_xdl_int8_instances<NHWGC,
+                                                                            GKYXC,
+                                                                            GK_Tuple,
+                                                                            NHWGK,
                                                                             I32_Tuple,
                                                                             Add_Mul_Clamp,
                                                                             ConvFwd1x1P0,
                                                                             8>{});
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
+                                   device_grouped_conv2d_xdl_int8_instances<NHWGC,
+                                                                            GKYXC,
+                                                                            GK_Tuple,
+                                                                            NHWGK,
                                                                             I32_Tuple,
                                                                             Add_Mul_Clamp,
                                                                             ConvFwd1x1S1P0,
@@ -43,10 +52,10 @@ void add_device_conv2d_xdl_bias_perlayer_quantization_int8_instances(
 
 void add_device_conv2d_xdl_bias_relu_perlayer_quantization_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
-                                                              GNHWC,
+                                                              NHWGC,
                                                               GKYXC,
                                                               GK_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                               int8_t,
                                                               int8_t,
                                                               I32_Tuple,
@@ -56,21 +65,30 @@ void add_device_conv2d_xdl_bias_relu_perlayer_quantization_int8_instances(
                                                               Add_Relu_Mul_Clamp>>>& instances)
 {
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
+                                   device_grouped_conv2d_xdl_int8_instances<NHWGC,
+                                                                            GKYXC,
+                                                                            GK_Tuple,
+                                                                            NHWGK,
                                                                             I32_Tuple,
                                                                             Add_Relu_Mul_Clamp,
                                                                             ConvFwdDefault,
                                                                             8>{});
 
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
+                                   device_grouped_conv2d_xdl_int8_instances<NHWGC,
+                                                                            GKYXC,
+                                                                            GK_Tuple,
+                                                                            NHWGK,
                                                                             I32_Tuple,
                                                                             Add_Relu_Mul_Clamp,
                                                                             ConvFwd1x1P0,
                                                                             8>{});
 
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
+                                   device_grouped_conv2d_xdl_int8_instances<NHWGC,
+                                                                            GKYXC,
+                                                                            GK_Tuple,
+                                                                            NHWGK,
                                                                             I32_Tuple,
                                                                             Add_Relu_Mul_Clamp,
                                                                             ConvFwd1x1S1P0,
@@ -79,10 +97,10 @@ void add_device_conv2d_xdl_bias_relu_perlayer_quantization_int8_instances(
 
 void add_device_conv2d_xdl_bias_tanh_perlayer_quantization_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
-                                                              GNHWC,
+                                                              NHWGC,
                                                               GKYXC,
                                                               GK_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                               int8_t,
                                                               int8_t,
                                                               I32_Tuple,
@@ -92,21 +110,30 @@ void add_device_conv2d_xdl_bias_tanh_perlayer_quantization_int8_instances(
                                                               Add_Mul_TanH_Mul_Clamp>>>& instances)
 {
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
+                                   device_grouped_conv2d_xdl_int8_instances<NHWGC,
+                                                                            GKYXC,
+                                                                            GK_Tuple,
+                                                                            NHWGK,
                                                                             I32_Tuple,
                                                                             Add_Mul_TanH_Mul_Clamp,
                                                                             ConvFwdDefault,
                                                                             8>{});
 
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
+                                   device_grouped_conv2d_xdl_int8_instances<NHWGC,
+                                                                            GKYXC,
+                                                                            GK_Tuple,
+                                                                            NHWGK,
                                                                             I32_Tuple,
                                                                             Add_Mul_TanH_Mul_Clamp,
                                                                             ConvFwd1x1P0,
                                                                             8>{});
 
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
+                                   device_grouped_conv2d_xdl_int8_instances<NHWGC,
+                                                                            GKYXC,
+                                                                            GK_Tuple,
+                                                                            NHWGK,
                                                                             I32_Tuple,
                                                                             Add_Mul_TanH_Mul_Clamp,
                                                                             ConvFwd1x1S1P0,
diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_int8_instance.hpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_int8_instance.hpp
index 262ec06b7..90f8791aa 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_int8_instance.hpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_int8_instance.hpp
@@ -12,30 +12,33 @@ namespace device {
 namespace instance {
 
 // clang-format off
-template <typename DsLayout,
+template <typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
           typename DsDatatype,
           typename OutElementOp,
           ConvolutionForwardSpecialization ConvSpec,
           index_t DstScalarPerVector>
 using device_grouped_conv2d_xdl_int8_instances =
     std::tuple <
-        //########################################|     NumDim|      A|      B|       Ds|      E|  AData|  BData| AccData| CShuffle|         Ds|  EData|           A|           B|          CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|     CBlockTransfer|
-        //########################################|    Spatial| Layout| Layout|   Layout| Layout|   Type|   Type|    Type| DataType|   DataType|   Type| Elementwise| Elementwise|  Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl|    ScalarPerVector|
-        //########################################|           |       |       |         |       |       |       |        |         |           |       |   Operation|   Operation|    Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|      _NWaveNPerXdl|
-        //########################################|           |       |       |         |       |       |       |        |         |           |       |            |            |             |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                   |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   256,   256,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>, DstScalarPerVector>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   256,   128,   256,    64,  16,  16,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>, DstScalarPerVector>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   128,   128,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 4>, DstScalarPerVector>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>, DstScalarPerVector>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   128,   128,    64,    64,  16,  16,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 2>, DstScalarPerVector>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   128,    64,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 4>, DstScalarPerVector>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,    64,    64,    64,    64,  16,  16,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 2>, DstScalarPerVector>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   256,   128,    64,    64,  16,  16,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>, DstScalarPerVector>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   256,    64,   128,    64,  16,  16,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>, DstScalarPerVector>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   128,   128,    32,    64,  16,  16,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 2>, DstScalarPerVector>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   128,    32,   128,    64,  16,  16,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 4>, DstScalarPerVector>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,    64,    64,    32,    64,  16,  16,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 2>, DstScalarPerVector>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,    64,    32,    64,    64,  16,  16,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 2>, DstScalarPerVector>
+        //########################################|     NumDim|       A|       B|       Ds|       E|  AData|  BData| AccData| CShuffle|         Ds|  EData|           A|           B|          CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|     CBlockTransfer|
+        //########################################|    Spatial|  Layout|  Layout|   Layout|  Layout|   Type|   Type|    Type| DataType|   DataType|   Type| Elementwise| Elementwise|  Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl|    ScalarPerVector|
+        //########################################|           |        |        |         |        |       |       |        |         |           |       |   Operation|   Operation|    Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|      _NWaveNPerXdl|
+        //########################################|           |        |        |         |        |       |       |        |         |           |       |            |            |             |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                   |
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   256,   256,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>, DstScalarPerVector>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   256,   128,   256,    64,  16,  16,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>, DstScalarPerVector>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   128,   128,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 4>, DstScalarPerVector>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>, DstScalarPerVector>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   128,   128,    64,    64,  16,  16,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 2>, DstScalarPerVector>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   128,    64,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 4>, DstScalarPerVector>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,    64,    64,    64,    64,  16,  16,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 2>, DstScalarPerVector>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   256,   128,    64,    64,  16,  16,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>, DstScalarPerVector>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   256,    64,   128,    64,  16,  16,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>, DstScalarPerVector>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   128,   128,    32,    64,  16,  16,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 2>, DstScalarPerVector>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   128,    32,   128,    64,  16,  16,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 4>, DstScalarPerVector>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,    64,    64,    32,    64,  16,  16,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 2>, DstScalarPerVector>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,    64,    32,    64,    64,  16,  16,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 2>, DstScalarPerVector>
     >;
 // clang-format on
 
diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_perchannel_quantization_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_perchannel_quantization_int8_instance.cpp
index b0f86b5c2..9d6937708 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_perchannel_quantization_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_perchannel_quantization_int8_instance.cpp
@@ -9,10 +9,10 @@ namespace device {
 namespace instance {
 void add_device_conv2d_xdl_perchannel_quantization_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
-                                                              GNHWC,
+                                                              NHWGC,
                                                               GKYXC,
                                                               GK_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                               int8_t,
                                                               int8_t,
                                                               F32_Tuple,
@@ -22,19 +22,28 @@ void add_device_conv2d_xdl_perchannel_quantization_int8_instances(
                                                               Mul2_Clamp>>>& instances)
 {
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
+                                   device_grouped_conv2d_xdl_int8_instances<NHWGC,
+                                                                            GKYXC,
+                                                                            GK_Tuple,
+                                                                            NHWGK,
                                                                             F32_Tuple,
                                                                             Mul2_Clamp,
                                                                             ConvFwdDefault,
                                                                             8>{});
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
+                                   device_grouped_conv2d_xdl_int8_instances<NHWGC,
+                                                                            GKYXC,
+                                                                            GK_Tuple,
+                                                                            NHWGK,
                                                                             F32_Tuple,
                                                                             Mul2_Clamp,
                                                                             ConvFwd1x1P0,
                                                                             8>{});
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
+                                   device_grouped_conv2d_xdl_int8_instances<NHWGC,
+                                                                            GKYXC,
+                                                                            GK_Tuple,
+                                                                            NHWGK,
                                                                             F32_Tuple,
                                                                             Mul2_Clamp,
                                                                             ConvFwd1x1S1P0,
@@ -43,10 +52,10 @@ void add_device_conv2d_xdl_perchannel_quantization_int8_instances(
 
 void add_device_conv2d_xdl_relu_perchannel_quantization_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
-                                                              GNHWC,
+                                                              NHWGC,
                                                               GKYXC,
                                                               GK_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                               int8_t,
                                                               int8_t,
                                                               F32_Tuple,
@@ -56,19 +65,28 @@ void add_device_conv2d_xdl_relu_perchannel_quantization_int8_instances(
                                                               Relu_Mul2_Clamp>>>& instances)
 {
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
+                                   device_grouped_conv2d_xdl_int8_instances<NHWGC,
+                                                                            GKYXC,
+                                                                            GK_Tuple,
+                                                                            NHWGK,
                                                                             F32_Tuple,
                                                                             Relu_Mul2_Clamp,
                                                                             ConvFwdDefault,
                                                                             8>{});
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
+                                   device_grouped_conv2d_xdl_int8_instances<NHWGC,
+                                                                            GKYXC,
+                                                                            GK_Tuple,
+                                                                            NHWGK,
                                                                             F32_Tuple,
                                                                             Relu_Mul2_Clamp,
                                                                             ConvFwd1x1P0,
                                                                             8>{});
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
+                                   device_grouped_conv2d_xdl_int8_instances<NHWGC,
+                                                                            GKYXC,
+                                                                            GK_Tuple,
+                                                                            NHWGK,
                                                                             F32_Tuple,
                                                                             Relu_Mul2_Clamp,
                                                                             ConvFwd1x1S1P0,
diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_perlayer_quantization_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_perlayer_quantization_int8_instance.cpp
index 4812ceecf..d6f87335b 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_perlayer_quantization_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_perlayer_quantization_int8_instance.cpp
@@ -9,10 +9,10 @@ namespace device {
 namespace instance {
 void add_device_conv2d_xdl_perlayer_quantization_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
-                                                              GNHWC,
+                                                              NHWGC,
                                                               GKYXC,
                                                               Empty_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                               int8_t,
                                                               int8_t,
                                                               Empty_Tuple,
@@ -22,19 +22,28 @@ void add_device_conv2d_xdl_perlayer_quantization_int8_instances(
                                                               Mul_Clamp>>>& instances)
 {
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_xdl_int8_instances<Empty_Tuple,
+                                   device_grouped_conv2d_xdl_int8_instances<NHWGC,
+                                                                            GKYXC,
+                                                                            Empty_Tuple,
+                                                                            NHWGK,
                                                                             Empty_Tuple,
                                                                             Mul_Clamp,
                                                                             ConvFwdDefault,
                                                                             16>{});
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_xdl_int8_instances<Empty_Tuple,
+                                   device_grouped_conv2d_xdl_int8_instances<NHWGC,
+                                                                            GKYXC,
+                                                                            Empty_Tuple,
+                                                                            NHWGK,
                                                                             Empty_Tuple,
                                                                             Mul_Clamp,
                                                                             ConvFwd1x1P0,
                                                                             16>{});
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_xdl_int8_instances<Empty_Tuple,
+                                   device_grouped_conv2d_xdl_int8_instances<NHWGC,
+                                                                            GKYXC,
+                                                                            Empty_Tuple,
+                                                                            NHWGK,
                                                                             Empty_Tuple,
                                                                             Mul_Clamp,
                                                                             ConvFwd1x1S1P0,
@@ -43,10 +52,10 @@ void add_device_conv2d_xdl_perlayer_quantization_int8_instances(
 
 void add_device_conv2d_xdl_relu_perlayer_quantization_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
-                                                              GNHWC,
+                                                              NHWGC,
                                                               GKYXC,
                                                               Empty_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                               int8_t,
                                                               int8_t,
                                                               Empty_Tuple,
@@ -56,19 +65,28 @@ void add_device_conv2d_xdl_relu_perlayer_quantization_int8_instances(
                                                               Relu_Mul_Clamp>>>& instances)
 {
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_xdl_int8_instances<Empty_Tuple,
+                                   device_grouped_conv2d_xdl_int8_instances<NHWGC,
+                                                                            GKYXC,
+                                                                            Empty_Tuple,
+                                                                            NHWGK,
                                                                             Empty_Tuple,
                                                                             Relu_Mul_Clamp,
                                                                             ConvFwdDefault,
                                                                             16>{});
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_xdl_int8_instances<Empty_Tuple,
+                                   device_grouped_conv2d_xdl_int8_instances<NHWGC,
+                                                                            GKYXC,
+                                                                            Empty_Tuple,
+                                                                            NHWGK,
                                                                             Empty_Tuple,
                                                                             Relu_Mul_Clamp,
                                                                             ConvFwd1x1P0,
                                                                             16>{});
     add_device_operation_instances(instances,
-                                   device_grouped_conv2d_xdl_int8_instances<Empty_Tuple,
+                                   device_grouped_conv2d_xdl_int8_instances<NHWGC,
+                                                                            GKYXC,
+                                                                            Empty_Tuple,
+                                                                            NHWGK,
                                                                             Empty_Tuple,
                                                                             Relu_Mul_Clamp,
                                                                             ConvFwd1x1S1P0,
-- 
GitLab


From 8b9cbba823024e9c2085ab54c5ac6f849e0c80e6 Mon Sep 17 00:00:00 2001
From: zjing14 <zhangjing14@gmail.com>
Date: Mon, 24 Apr 2023 08:07:39 -0500
Subject: [PATCH 27/71] reduce initial number for half_t splitk (#685)

---
 profiler/include/profiler/profile_gemm_splitk_impl.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/profiler/include/profiler/profile_gemm_splitk_impl.hpp b/profiler/include/profiler/profile_gemm_splitk_impl.hpp
index e5d5f8765..233fb15c0 100644
--- a/profiler/include/profiler/profile_gemm_splitk_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_splitk_impl.hpp
@@ -72,8 +72,8 @@ bool profile_gemm_splitk_impl(int do_verification,
     {
     case 0: break;
     case 1:
-        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
-        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{0, 1});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-1, 1});
         break;
     default:
         a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
-- 
GitLab


From 8bb2bb4a051527bb11d0839f73ee0e82d5988708 Mon Sep 17 00:00:00 2001
From: Adam Osewski <19374865+aosewski@users.noreply.github.com>
Date: Mon, 24 Apr 2023 22:43:36 +0200
Subject: [PATCH 28/71] Grouped Gemm + SplitK + simplified Kernel Args (#669)

* simplify karg in device/grid split-k op

* fix mk_kn_mn instances

* add more instances

* B2C with 3D grid for KSplit

* Remove unused code.

* Use default B2C (3D grid) in grid gemm v2r4r2.

* Device gemm splitk use B2C map.

* Device GroupedGemmXdlSplitKCShuffle

* Example for GroupedGemm Xdl SplitK

* Introduce Device GroupedGemmSplitK

* Fix updating kbatch size.

* Add instance mk-nk-mn

* Enable set kbatch in profiler.

* Add GGemmSplitK mk-kn-mn instances

* Add more instances & split into multiple files.

* minor fix

* tuning

* clean

* disabled failed instances

* use pipe v2

* Ignore arg on not supported arch.

* fix warning

---------

Co-authored-by: carlushuang <carlus.huang@amd.com>
Co-authored-by: Adam Osewski <aosewski@amd.com>
Co-authored-by: zjing14 <zhangjing14@gmail.com>
Co-authored-by: Jing Zhang <jizhan@amd.com>
Co-authored-by: root <root@ctr-ubbsmc15.amd.com>
---
 example/15_grouped_gemm/CMakeLists.txt        |   4 +-
 .../grouped_gemm_xdl_splitk_fp16.cpp          |  97 +++
 .../run_grouped_gemm_example.inc              |   1 +
 .../gpu/device/device_grouped_gemm.hpp        |   2 +-
 .../gpu/device/device_grouped_gemm_splitk.hpp |  39 ++
 .../impl/device_gemm_xdl_splitk_c_shuffle.hpp |  20 +-
 ...evice_grouped_gemm_xdl_splitk_cshuffle.hpp | 612 ++++++++++++++++++
 .../gpu/grid/block_to_ctile_map.hpp           |  48 ++
 .../gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp  |  87 ++-
 .../gpu/grouped_gemm.hpp                      |  58 ++
 .../gpu/grouped_gemm/CMakeLists.txt           |   4 +
 ...l_splitk_f16_f16_f16_mk_kn_mn_instance.cpp |  80 +++
 ...16_f16_f16_mk_kn_mn_irregular_instance.cpp |  87 +++
 ...l_splitk_f16_f16_f16_mk_nk_mn_instance.cpp |  75 +++
 ...16_f16_f16_mk_nk_mn_irregular_instance.cpp |  81 +++
 .../profiler/profile_grouped_gemm_impl.hpp    |  31 +-
 profiler/src/profile_grouped_gemm.cpp         |  43 +-
 17 files changed, 1329 insertions(+), 40 deletions(-)
 create mode 100644 example/15_grouped_gemm/grouped_gemm_xdl_splitk_fp16.cpp
 create mode 100644 include/ck/tensor_operation/gpu/device/device_grouped_gemm_splitk.hpp
 create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp

diff --git a/example/15_grouped_gemm/CMakeLists.txt b/example/15_grouped_gemm/CMakeLists.txt
index e3080c4a9..9df256c38 100644
--- a/example/15_grouped_gemm/CMakeLists.txt
+++ b/example/15_grouped_gemm/CMakeLists.txt
@@ -5,6 +5,7 @@ add_example_executable(example_grouped_gemm_xdl_fp16 grouped_gemm_xdl_fp16.cpp)
 add_example_executable(example_grouped_gemm_xdl_bfp16 grouped_gemm_xdl_bfp16.cpp)
 add_example_executable(example_grouped_gemm_xdl_int8 grouped_gemm_xdl_int8.cpp)
 add_example_executable(example_grouped_gemm_multiple_d_dl_fp16 grouped_gemm_multiple_d_dl_fp16.cpp)
+add_example_executable(example_grouped_gemm_xdl_splitk_fp16 grouped_gemm_xdl_splitk_fp16.cpp)
 
 
 add_dependencies(example_grouped_gemm_xdl
@@ -12,7 +13,8 @@ add_dependencies(example_grouped_gemm_xdl
                  example_grouped_gemm_xdl_fp16
                  example_grouped_gemm_xdl_bfp16
                  example_grouped_gemm_xdl_int8
-                 example_grouped_gemm_multiple_d_dl_fp16)
+                 example_grouped_gemm_multiple_d_dl_fp16
+                 example_grouped_gemm_xdl_splitk_fp16)
 
 if(USE_BITINT_EXTENSION_INT4)
   add_example_executable(example_grouped_gemm_xdl_int4 grouped_gemm_xdl_int4.cpp)
diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_splitk_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_splitk_fp16.cpp
new file mode 100644
index 000000000..a89937b2e
--- /dev/null
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_splitk_fp16.cpp
@@ -0,0 +1,97 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using ADataType        = F16;
+using BDataType        = F16;
+using AccDataType      = F32;
+using CShuffleDataType = F16;
+using DsDataType       = ck::Tuple<>;
+using EDataType        = F16;
+
+using ALayout  = Row;
+using BLayout  = Col;
+using DsLayout = ck::Tuple<>;
+using ELayout  = Row;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemmXdlSplitKCShuffle
+    // clang-format off
+//######| ALayout| BLayout| DsLayout| ELayout|     AData|     BData|     AccData|         CShuffle|     DsData|     EData|           A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+//######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+//######|        |        |         |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+//######|        |        |         |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
+// clang-format on
+
+#include "run_grouped_gemm_example.inc"
+
+int main(int argc, char* argv[]) // usage: <verification> <init_method> <time_kernel>
+{
+    ProblemSize problem_size;
+    ExecutionConfig config;
+
+    problem_size.group_count = 16; // must equal the number of Ms entries listed below
+
+    problem_size.Ms = {
+        167, 183, 177, 181, 153, 139, 156, 173, 163, 150, 204, 184, 168, 156, 168, 148};
+
+    for(int i = 0; i < problem_size.group_count; i++)
+    {
+        problem_size.Ns.push_back(768); // every group shares N and K; only M differs
+        problem_size.Ks.push_back(4608);
+
+        problem_size.stride_As.push_back(problem_size.Ks[i]); // A is declared Row: stride = K
+        problem_size.stride_Bs.push_back(problem_size.Ks[i]); // B is declared Col: stride = K
+        problem_size.stride_Cs.push_back(problem_size.Ns[i]); // E is declared Row: stride = N
+    }
+
+    if(argc == 4)
+    {
+        config.do_verification = std::stoi(argv[1]);
+        config.init_method     = std::stoi(argv[2]);
+        config.time_kernel     = std::stoi(argv[3]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
+        exit(0); // usage-only path still exits 0, so a bare invocation counts as success
+    }
+
+    return !run_grouped_gemm(problem_size, config); // bool result mapped to exit code 0 on true
+}
diff --git a/example/15_grouped_gemm/run_grouped_gemm_example.inc b/example/15_grouped_gemm/run_grouped_gemm_example.inc
index 324e17728..bceff29b6 100644
--- a/example/15_grouped_gemm/run_grouped_gemm_example.inc
+++ b/example/15_grouped_gemm/run_grouped_gemm_example.inc
@@ -147,6 +147,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
 #else
         a_tensors_device[i]->ToDevice(a_tensors[i].mData.data());
         b_tensors_device[i]->ToDevice(b_tensors[i].mData.data());
+        c_tensors_device[i]->SetZero(); // start from a zeroed C buffer
 #endif
 
         p_a.push_back(a_tensors_device[i]->GetDeviceBuffer());
diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp
index 181ee4b42..4b1106c12 100644
--- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp
@@ -31,7 +31,7 @@ struct DeviceGroupedGemm : public BaseOperator
 {
     static constexpr index_t NumDTensor = DsDataType::Size();
 
-    static_assert(DsLayout::Size() == DsDataType::Size(), "wrong! inconsisiten NumDTensor");
+    static_assert(DsLayout::Size() == DsDataType::Size(), "wrong! inconsistent NumDTensor");
 
     virtual std::unique_ptr<BaseArgument>
     MakeArgumentPointer(std::vector<const void*>& p_a,
diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_splitk.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_splitk.hpp
new file mode 100644
index 000000000..06d180d30
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_splitk.hpp
@@ -0,0 +1,39 @@
+#pragma once
+#include <iostream>
+#include <vector>
+
+#include "device_grouped_gemm.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+template <typename ALayout, // all eleven parameters are forwarded unchanged to DeviceGroupedGemm
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation>
+struct DeviceGroupedGemmSplitK : public DeviceGroupedGemm<ALayout,
+                                                          BLayout,
+                                                          DsLayout,
+                                                          ELayout,
+                                                          ADataType,
+                                                          BDataType,
+                                                          DsDataType,
+                                                          EDataType,
+                                                          AElementwiseOperation,
+                                                          BElementwiseOperation,
+                                                          CElementwiseOperation>
+{ // adds a single k-batch setter on top of the DeviceGroupedGemm interface
+    virtual void SetKBatchSize(BaseArgument* p_arg, index_t kbatch) const = 0; // pure virtual; presumably stores kbatch in p_arg - confirm in implementations
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp
index 1f08cec67..776f96e8e 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp
@@ -114,7 +114,8 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
         CBlockTransferScalarPerVector_NWaveNPerXDL,
         CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock>;
 
-    using Argument = typename GridwiseGemm::Argument;
+    using Argument              = typename GridwiseGemm::Argument;
+    using DefaultBlock2CTileMap = typename GridwiseGemm::DefaultBlock2CTileMap;
 
     // Invoker
     struct Invoker : public BaseInvoker
@@ -138,8 +139,9 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                     "setting");
             }
 
+            const auto b2c_map = DefaultBlock2CTileMap{};
             index_t gdx, gdy, gdz;
-            std::tie(gdx, gdy, gdz) = GridwiseGemm::CalculateGridSize(karg);
+            std::tie(gdx, gdy, gdz) = b2c_map.CalculateGridSize(karg.M, karg.N, karg.k_batch);
             const auto K0           = karg.K0;
 
             const bool has_main_k0_block_loop = GridwiseGemm::CalculateHasMainK0BlockLoop(K0);
@@ -152,7 +154,7 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                         hipMemset(karg.p_c_grid, 0, karg.M * karg.N * sizeof(CDataType)));
 
                 ave_time = launch_and_time_kernel(
-                    stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, karg);
+                    stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, karg, b2c_map);
             };
 
             if(has_main_k0_block_loop)
@@ -162,7 +164,8 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                     const auto kernel =
                         kernel_gemm_xdlops_v2r4r2_simplified<GridwiseGemm,
                                                              true,
-                                                             InMemoryDataOperationEnum::Set>;
+                                                             InMemoryDataOperationEnum::Set,
+                                                             DefaultBlock2CTileMap>;
 
                     Run(kernel);
                 }
@@ -171,7 +174,8 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                     const auto kernel =
                         kernel_gemm_xdlops_v2r4r2_simplified<GridwiseGemm,
                                                              true,
-                                                             InMemoryDataOperationEnum::AtomicAdd>;
+                                                             InMemoryDataOperationEnum::AtomicAdd,
+                                                             DefaultBlock2CTileMap>;
 
                     Run(kernel);
                 }
@@ -183,7 +187,8 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                     const auto kernel =
                         kernel_gemm_xdlops_v2r4r2_simplified<GridwiseGemm,
                                                              false,
-                                                             InMemoryDataOperationEnum::Set>;
+                                                             InMemoryDataOperationEnum::Set,
+                                                             DefaultBlock2CTileMap>;
 
                     Run(kernel);
                 }
@@ -192,7 +197,8 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                     const auto kernel =
                         kernel_gemm_xdlops_v2r4r2_simplified<GridwiseGemm,
                                                              false,
-                                                             InMemoryDataOperationEnum::AtomicAdd>;
+                                                             InMemoryDataOperationEnum::AtomicAdd,
+                                                             DefaultBlock2CTileMap>;
 
                     Run(kernel);
                 }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
new file mode 100644
index 000000000..26a4319ea
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
@@ -0,0 +1,612 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+
+#include "ck/ck.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+#include "ck/host_utility/hip_check_error.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/utility/tuple.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_gemm_splitk.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+template <typename GridwiseGemm, // must provide GetSharedMemoryNumberOfByte() and Run<...>()
+          typename GemmDesc, // needs karg_, block_2_ctile_map_, block_start_ and block_end_
+          bool HasMainKBlockLoop,
+          InMemoryDataOperationEnum CGlobalMemoryDataOperation>
+__global__ void
+#if CK_USE_LAUNCH_BOUNDS
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+#endif
+        kernel_grouped_gemm_xdl_splitk(const void CK_CONSTANT_ADDRESS_SPACE* gemm_descs_const,
+                                       const index_t group_count) // initial upper bound of the search below
+{
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+    constexpr index_t shared_size = GridwiseGemm::GetSharedMemoryNumberOfByte();
+    __shared__ uint8_t p_shared[shared_size]; // scratch buffer forwarded to GridwiseGemm::Run
+
+    const index_t block_id = get_block_1d_id();
+    const auto gemm_desc_ptr = // gemm_descs_const reinterpreted as an array of GemmDesc
+        reinterpret_cast<const GemmDesc*>(cast_pointer_to_generic_address_space(gemm_descs_const));
+
+    index_t left     = 0; // bisection over descriptor indices, starting at the midpoint
+    index_t right    = group_count;
+    index_t group_id = index_t((left + right) / 2);
+    while((!(block_id >= gemm_desc_ptr[group_id].block_start_ && // stop once block_id lies in
+             block_id < gemm_desc_ptr[group_id].block_end_)) &&  // [block_start_, block_end_)
+          left <= right) // NOTE(review): bounds are only ever set to group_id, so this looks always true - confirm
+    {
+        if(block_id < gemm_desc_ptr[group_id].block_start_)
+        {
+            right = group_id; // target group is at a lower index
+        }
+        else
+        {
+            left = group_id; // target group is at a higher index
+        }
+        group_id = index_t((left + right) / 2);
+    }
+
+    GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation>( // run the selected group's GEMM
+        gemm_desc_ptr[group_id].karg_,
+        static_cast<void*>(p_shared),
+        gemm_desc_ptr[group_id].block_2_ctile_map_);
+#else
+    ignore = gemm_descs_const; // other device targets: body compiled out, parameters only referenced
+    ignore = group_count;
+#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
+}
+
+template <typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CShuffleDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation,
+          GemmSpecialization GemmSpec,
+          ck::index_t NumPrefetch,
+          ck::index_t BlockSize,
+          ck::index_t MPerBlock,
+          ck::index_t NPerBlock,
+          ck::index_t KPerBlock,
+          ck::index_t AK1,
+          ck::index_t BK1,
+          ck::index_t MPerXDL,
+          ck::index_t NPerXDL,
+          ck::index_t MXdlPerWave,
+          ck::index_t NXdlPerWave,
+          typename ABlockTransferThreadClusterLengths_K0_M_K1,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          ck::index_t ABlockTransferSrcVectorDim,
+          ck::index_t ABlockTransferSrcScalarPerVector,
+          ck::index_t ABlockTransferDstScalarPerVector_K1,
+          bool ABlockLdsExtraM,
+          typename BBlockTransferThreadClusterLengths_K0_N_K1,
+          typename BBlockTransferThreadClusterArrangeOrder,
+          typename BBlockTransferSrcAccessOrder,
+          ck::index_t BBlockTransferSrcVectorDim,
+          ck::index_t BBlockTransferSrcScalarPerVector,
+          ck::index_t BBlockTransferDstScalarPerVector_K1,
+          bool BBlockLdsExtraN,
+          index_t CShuffleMXdlPerWavePerShuffle,
+          index_t CShuffleNXdlPerWavePerShuffle,
+          typename CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          index_t CDEBlockTransferScalarPerVector_NPerBlock,
+          LoopScheduler LoopSched = make_default_loop_scheduler(),
+          // Current implementation does not support multiple D fusions.
+          enable_if_t<AK1 == BK1 && is_same_v<DsLayout, ck::Tuple<>> &&
+                          is_same_v<DsDataType, ck::Tuple<>>,
+                      bool> = false>
+struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayout,
+                                                                           BLayout,
+                                                                           DsLayout,
+                                                                           ELayout,
+                                                                           ADataType,
+                                                                           BDataType,
+                                                                           DsDataType,
+                                                                           EDataType,
+                                                                           AElementwiseOperation,
+                                                                           BElementwiseOperation,
+                                                                           CDEElementwiseOperation>
+{
+    static constexpr index_t NumDTensor = DsDataType::Size();
+
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+    static constexpr auto I2 = Number<2>{};
+    static constexpr auto I3 = Number<3>{};
+    static_assert(KPerBlock % AK1 == 0);
+    static constexpr index_t K0PerBlock = KPerBlock / AK1;
+
+    using GridwiseGemm = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2<
+        BlockSize,
+        ADataType, // TODO: distinguish A/B datatype
+        AccDataType,
+        EDataType,
+        ALayout,
+        BLayout,
+        ELayout,
+        AElementwiseOperation,
+        BElementwiseOperation,
+        CDEElementwiseOperation,
+        GemmSpec,
+        MPerBlock,
+        NPerBlock,
+        K0PerBlock,
+        MPerXDL,
+        NPerXDL,
+        AK1,
+        MXdlPerWave,
+        NXdlPerWave,
+        ABlockTransferThreadClusterLengths_K0_M_K1,
+        ABlockTransferThreadClusterArrangeOrder,
+        ABlockTransferSrcAccessOrder,
+        ABlockTransferSrcVectorDim,
+        ABlockTransferSrcScalarPerVector,
+        ABlockTransferDstScalarPerVector_K1,
+        false, // AThreadTransferSrcResetCoordinateAfterRun,
+        ABlockLdsExtraM,
+        BBlockTransferThreadClusterLengths_K0_N_K1,
+        BBlockTransferThreadClusterArrangeOrder,
+        BBlockTransferSrcAccessOrder,
+        BBlockTransferSrcVectorDim,
+        BBlockTransferSrcScalarPerVector,
+        BBlockTransferDstScalarPerVector_K1,
+        false, // BThreadTransferSrcResetCoordinateAfterRun,
+        BBlockLdsExtraN,
+        CShuffleMXdlPerWavePerShuffle,
+        CShuffleNXdlPerWavePerShuffle,
+        CDEBlockTransferScalarPerVector_NPerBlock,
+        CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock>;
+
+    using CGridDesc_M_N = typename GridwiseGemm::CGridDesc_M_N;
+    using Block2ETileMapKSplit =
+        BlockToCTileMap_KSplit_M00_N0_M01Adapt<MPerBlock, NPerBlock, CGridDesc_M_N>;
+    // Block2CTileMap configuration parameter.
+    static constexpr index_t B2E_M01 = 8;
+    using GroupedGemmBlock2ETileMap  = OffsettedBlockToCTileMap<Block2ETileMapKSplit>;
+    using KernelArgument             = typename GridwiseGemm::Argument;
+
+    struct GemmTransKernelArg // one entry per group: kernel argument, tile map and block-id range
+    {
+        KernelArgument karg_; // GridwiseGemm::Argument for this group
+        GroupedGemmBlock2ETileMap block_2_ctile_map_; // OffsettedBlockToCTileMap over the split-K tile map
+        index_t block_start_, block_end_; // block-id bounds; the kernel in this file tests them as [start, end)
+
+        GemmTransKernelArg() = default;
+        GemmTransKernelArg(KernelArgument&& karg, // NOTE(review): taken by rvalue reference but copied below
+                           GroupedGemmBlock2ETileMap&& b2c_map,
+                           index_t block_start,
+                           index_t block_end)
+            : karg_{karg}, // named rvalue reference is an lvalue here, so this copy-constructs
+              block_2_ctile_map_{b2c_map},
+              block_start_{block_start},
+              block_end_{block_end}
+        {
+        }
+    };
+
+    static constexpr index_t DefaultKBatch = 1;
+
+    // Argument
+    struct Argument : public BaseArgument
+    {
+
+        Argument(std::vector<const void*>& p_As,
+                 std::vector<const void*>& p_Bs,
+                 std::vector<void*>& p_Es,
+                 std::vector<GemmDesc>& gemm_descs)
+            : Argument(p_As, p_Bs, p_Es, gemm_descs, DefaultKBatch)
+        {
+            // TODO: use occupancy api to calculate appropriate batch size.
+        }
+
+        Argument(std::vector<const void*>& p_As,
+                 std::vector<const void*>& p_Bs,
+                 std::vector<void*>& p_Es,
+                 std::vector<GemmDesc>& gemm_descs,
+                 index_t kbatch)
+            : K_BATCH{kbatch}
+        {
+            // Create one GemmTransKernelArg per group whose M is non-zero and lay the groups out
+            // back to back in a single grid; groups with M == 0 are only counted as skipped.
+            grid_size_           = 0;
+            skipped_group_count_ = 0;
+            group_count_         = ck::type_convert<ck::index_t>(gemm_descs.size());
+
+            const bool pointer_counts_match =
+                group_count_ == ck::type_convert<ck::index_t>(p_As.size()) &&
+                group_count_ == ck::type_convert<ck::index_t>(p_Bs.size()) &&
+                group_count_ == ck::type_convert<ck::index_t>(p_Es.size());
+            if(!pointer_counts_match)
+            {
+                throw std::runtime_error("wrong! group_count_ != p_As/b/c.size");
+            }
+
+            gemm_kernel_args_.reserve(group_count_);
+
+            for(std::size_t grp = 0; grp < gemm_descs.size(); ++grp)
+            {
+                const auto& desc = gemm_descs[grp];
+                const index_t M  = desc.M_;
+
+                if(M == 0)
+                {
+                    ++skipped_group_count_;
+                    continue;
+                }
+
+                const index_t N = desc.N_;
+                const index_t K = desc.K_;
+
+                const index_t stride_a = desc.stride_A_;
+                const index_t stride_b = desc.stride_B_;
+                const index_t stride_c = desc.stride_C_;
+
+                const index_t m_padded = GridwiseGemm::CalculateMPadded(M);
+                const index_t n_padded = GridwiseGemm::CalculateNPadded(N);
+                const index_t k_padded = GridwiseGemm::CalculateKPadded(K, K_BATCH);
+                const index_t k0       = GridwiseGemm::CalculateK0(K, K_BATCH);
+
+                const auto e_desc_m_n =
+                    GridwiseGemm::MakeCGridDescriptor_M_N(M, N, m_padded, n_padded, stride_c);
+                const auto tile_map = Block2ETileMapKSplit{e_desc_m_n, B2E_M01, K_BATCH};
+                const index_t group_blocks = tile_map.CalculateGridSize(e_desc_m_n);
+
+                // This group starts where the previous one ended in the accumulated grid.
+                const index_t first_block = grid_size_;
+                const index_t last_block  = first_block + group_blocks;
+                grid_size_                = last_block;
+
+                gemm_kernel_args_.emplace_back(
+                    KernelArgument{type_convert<const ADataType*>(p_As[grp]),
+                                   type_convert<const BDataType*>(p_Bs[grp]),
+                                   type_convert<EDataType*>(p_Es[grp]),
+                                   M,
+                                   N,
+                                   K,
+                                   stride_a,
+                                   stride_b,
+                                   stride_c,
+                                   m_padded,
+                                   n_padded,
+                                   k_padded,
+                                   k0,
+                                   K_BATCH},
+                    GroupedGemmBlock2ETileMap(tile_map, first_block), // block-to-e-tile map
+                    first_block,
+                    last_block);
+            }
+        }
+
+        /**
+         * @brief      Re-derive all split-K dependent state for a new kbatch value.
+         *
+         *             For every stored group this recomputes KPadded, K0, the block-to-tile map and
+         *             the block range, and it rebuilds the total grid size. M, N, their padded values
+         *             and StrideC are taken from the already stored kernel arguments.
+         *
+         * @note       Invoker::Run uploads gemm_kernel_args_ on every call, so the new values reach
+         *             the device with the next run.
+         *
+         * @param[in]  kbatch  The new splitK parameter value.
+         */
+        void UpdateKBatch(index_t kbatch)
+        {
+            K_BATCH    = kbatch;
+            grid_size_ = 0;
+
+            for(auto& group : gemm_kernel_args_)
+            {
+                auto& karg = group.karg_;
+
+                const index_t k_padded = GridwiseGemm::CalculateKPadded(karg.K, K_BATCH);
+                const index_t k0       = GridwiseGemm::CalculateK0(karg.K, K_BATCH);
+
+                const auto e_desc_m_n = GridwiseGemm::MakeCGridDescriptor_M_N(
+                    karg.M, karg.N, karg.MPadded, karg.NPadded, karg.StrideC);
+                const auto tile_map = Block2ETileMapKSplit{e_desc_m_n, B2E_M01, K_BATCH};
+                const index_t group_blocks = tile_map.CalculateGridSize(e_desc_m_n);
+
+                // Groups are laid out back to back: this one starts where the previous one ended.
+                const index_t first_block = grid_size_;
+                const index_t last_block  = first_block + group_blocks;
+                grid_size_                = last_block;
+
+                karg.KPadded             = k_padded;
+                karg.K0                  = k0;
+                karg.k_batch             = K_BATCH;
+                group.block_2_ctile_map_ = GroupedGemmBlock2ETileMap(tile_map, first_block);
+                group.block_start_       = first_block;
+                group.block_end_         = last_block;
+            }
+        }
+
+        //  private:
+        index_t K_BATCH;              // split-K factor applied to every group
+        index_t group_count_;         // gemm_descs.size() at construction
+        index_t skipped_group_count_; // groups left out because their M is 0
+
+        std::vector<GemmTransKernelArg> gemm_kernel_args_; // one entry per non-skipped group
+        index_t grid_size_;                                // sum of the per-group grid sizes
+    };
+
+    // Invoker
+    struct Invoker : public BaseInvoker
+    {
+        // Checks every group, copies the per-group arguments to the workspace and launches the
+        // grouped split-K kernel once. Returns 0 without launching when no group is left to run;
+        // throws std::runtime_error when a group is invalid or inconsistent with group 0.
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            // All groups were skipped (M == 0): there is no element 0 and nothing to launch.
+            if(arg.gemm_kernel_args_.empty())
+            {
+                return 0;
+            }
+
+            index_t K0                       = arg.gemm_kernel_args_[0].karg_.K0;
+            bool all_have_kbatch_gt_one      = arg.gemm_kernel_args_[0].karg_.k_batch > 1;
+            bool all_have_main_k0_block_loop = GridwiseGemm::CalculateHasMainK0BlockLoop(K0);
+
+            for(std::size_t i = 0; i < arg.gemm_kernel_args_.size(); ++i)
+            {
+                const auto& karg = arg.gemm_kernel_args_[i].karg_;
+                if(stream_config.log_level_ > 0)
+                {
+                    karg.Print();
+                }
+
+                if(!GridwiseGemm::CheckValidity(karg))
+                {
+                    std::ostringstream err;
+                    err << "Group id: " << i << " has invalid GridwiseGemm settings!" << __FILE__
+                        << ":" << __LINE__ << ", in function: " << __func__;
+                    throw std::runtime_error(err.str());
+                }
+
+                K0 = karg.K0;
+                bool not_all_have_main_k0_block_loop_same =
+                    all_have_main_k0_block_loop xor GridwiseGemm::CalculateHasMainK0BlockLoop(K0);
+                bool not_all_have_kbatch_value_same = all_have_kbatch_gt_one xor (karg.k_batch > 1);
+
+                if(not_all_have_main_k0_block_loop_same)
+                {
+                    std::ostringstream err;
+                    err << "Not all gemms have same value for main_k0_block_loop! in " << __FILE__
+                        << ":" << __LINE__ << ", in function: " << __func__;
+                    throw std::runtime_error(err.str());
+                }
+
+                if(not_all_have_kbatch_value_same)
+                {
+                    std::ostringstream err;
+                    err << "Not all gemms have same kbatch value (=1 or >1)! "
+                        << "group [" << i << "], kbatch: " << karg.k_batch
+                        << ", group [0], kbatch: " << arg.gemm_kernel_args_[0].karg_.k_batch
+                        << " in " << __FILE__ << ":" << __LINE__ << ", in function: " << __func__;
+                    throw std::runtime_error(err.str());
+                }
+            }
+
+            hip_check_error(hipMemcpy(arg.p_workspace_,
+                                      arg.gemm_kernel_args_.data(),
+                                      arg.gemm_kernel_args_.size() * sizeof(GemmTransKernelArg),
+                                      hipMemcpyHostToDevice));
+
+            float ave_time = 0;
+            const auto Run = [&](const auto& kernel) {
+                if(all_have_kbatch_gt_one)
+                {
+                    // AtomicAdd accumulates into E, so each output buffer is cleared first.
+                    // NOTE(review): clears M * N contiguous elements - assumes E is packed; confirm.
+                    for(const auto& trans_arg : arg.gemm_kernel_args_)
+                    {
+                        const auto& karg = trans_arg.karg_;
+                        // Count elements in size_t: M * N evaluated in index_t can overflow.
+                        const std::size_t e_elems = static_cast<std::size_t>(karg.M) * karg.N;
+                        hip_check_error(hipMemset(karg.p_c_grid, 0, e_elems * sizeof(EDataType)));
+                    }
+                }
+
+                ave_time =
+                    launch_and_time_kernel(stream_config,
+                                           kernel,
+                                           dim3(arg.grid_size_),
+                                           dim3(BlockSize),
+                                           0,
+                                           cast_pointer_to_constant_address_space(arg.p_workspace_),
+                                           arg.gemm_kernel_args_.size());
+            };
+
+            // Kernel template arguments: <..., has main K0 block loop, how E is written>.
+            if(all_have_main_k0_block_loop && all_have_kbatch_gt_one)
+            {
+                const auto kernel =
+                    kernel_grouped_gemm_xdl_splitk<GridwiseGemm,
+                                                   GemmTransKernelArg,
+                                                   true,
+                                                   InMemoryDataOperationEnum::AtomicAdd>;
+                Run(kernel);
+            }
+            else if(all_have_main_k0_block_loop)
+            {
+                const auto kernel =
+                    kernel_grouped_gemm_xdl_splitk<GridwiseGemm,
+                                                   GemmTransKernelArg,
+                                                   true,
+                                                   InMemoryDataOperationEnum::Set>;
+                Run(kernel);
+            }
+            else if(all_have_kbatch_gt_one)
+            {
+                const auto kernel =
+                    kernel_grouped_gemm_xdl_splitk<GridwiseGemm,
+                                                   GemmTransKernelArg,
+                                                   false,
+                                                   InMemoryDataOperationEnum::AtomicAdd>;
+                Run(kernel);
+            }
+            else
+            {
+                const auto kernel =
+                    kernel_grouped_gemm_xdl_splitk<GridwiseGemm,
+                                                   GemmTransKernelArg,
+                                                   false,
+                                                   InMemoryDataOperationEnum::Set>;
+                Run(kernel);
+            }
+
+            return ave_time;
+        }
+
+        // polymorphic: p_arg must point to this operation's Argument; the cast result is not checked
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        }
+    };
+
+    static constexpr bool IsValidCompilationParameter()
+    {
+        // TODO: properly implement this check
+        return true; // placeholder: no template parameter is inspected yet
+    }
+
+    static bool IsSupportedArgument(const Argument& arg)
+    {
+        // Every input group must be either stored as a kernel argument or counted as skipped.
+        const auto stored_groups = ck::type_convert<ck::index_t>(arg.gemm_kernel_args_.size());
+        if(stored_groups + arg.skipped_group_count_ != arg.group_count_)
+        {
+            return false;
+        }
+        // No early exit, so that DEBUG_LOG builds report every unsupported group.
+        bool all_groups_valid = true;
+        for(std::size_t grp = 0; grp < arg.gemm_kernel_args_.size(); ++grp)
+        {
+            const auto& karg = arg.gemm_kernel_args_[grp].karg_;
+            if(!GridwiseGemm::CheckValidity(karg))
+            {
+                all_groups_valid = false;
+#if DEBUG_LOG
+                std::cout << "[" << __func__ << "] group id: " << grp << " is not supported!\n";
+                karg.Print();
+#endif // DEBUG_LOG
+            }
+        }
+        return all_groups_valid;
+    }
+
+    bool IsSupportedArgument(const BaseArgument* p_arg) override // polymorphic
+    {
+        const auto* arg = dynamic_cast<const Argument*>(p_arg); // nullptr if p_arg is not ours
+        return arg != nullptr && IsSupportedArgument(*arg);     // unsupported, never dereferenced
+    }
+
+    static auto MakeArgument(std::vector<const void*>& p_As,
+                             std::vector<const void*>& p_Bs,
+                             std::vector<std::array<const void*, NumDTensor>>&, // D pointers: unused
+                             std::vector<void*>& p_Es,
+                             std::vector<GemmDesc> gemm_descs, // taken by value, i.e. copied
+                             AElementwiseOperation, // element-wise operations: unused
+                             BElementwiseOperation,
+                             CDEElementwiseOperation)
+    {
+        return Argument{p_As, p_Bs, p_Es, gemm_descs}; // four-argument form: kbatch = DefaultKBatch
+    }
+
+    static auto MakeInvoker() { return Invoker{}; } // by-value counterpart of MakeInvokerPointer
+
+    // polymorphic: like MakeArgument, but heap-allocated and returned through the base type
+    std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(std::vector<const void*>& p_As,
+                        std::vector<const void*>& p_Bs,
+                        std::vector<std::array<const void*, NumDTensor>>&, // D pointers: unused
+                        std::vector<void*>& p_Es,
+                        std::vector<GemmDesc>& gemm_descs,
+                        AElementwiseOperation, // element-wise operations: unused
+                        BElementwiseOperation,
+                        CDEElementwiseOperation) override
+    {
+        return std::make_unique<Argument>(p_As, p_Bs, p_Es, gemm_descs); // kbatch = DefaultKBatch
+    }
+
+    // polymorphic: returns a heap-allocated Invoker through the base-class pointer
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+    {
+        return std::make_unique<Invoker>();
+    }
+
+    // polymorphic: readable instance name built from the layout initials and tuning parameters
+    std::string GetTypeString() const override
+    {
+        std::ostringstream type_name;
+
+        // clang-format off
+        type_name << "DeviceGroupedGemm_XdlSplitK"
+                  << "<"
+                  // first character of the A, B and E layout names
+                  << std::string(ALayout::name)[0] << ","
+                  << std::string(BLayout::name)[0] << ","
+                  << std::string(ELayout::name)[0] << ","
+                  // block size and per-block tile sizes
+                  << BlockSize << ", "
+                  << MPerBlock << ", " << NPerBlock << ", " << KPerBlock << ", "
+                  << AK1 << ", " << BK1 << ", "
+                  // per-XDL sizes and XDL count per wave
+                  << MPerXDL << ", " << NPerXDL << ", "
+                  << MXdlPerWave << ", " << NXdlPerWave << ", "
+                  // scalar-per-vector of the A and B source transfers
+                  << ABlockTransferSrcScalarPerVector << ", "
+                  << BBlockTransferSrcScalarPerVector << ", "
+                  // C-shuffle XDL-per-wave-per-shuffle in M and N
+                  << CShuffleMXdlPerWavePerShuffle << ", "
+                  << CShuffleNXdlPerWavePerShuffle << ", "
+                  << getGemmSpecializationString(GemmSpec)
+                  << ">";
+        // clang-format on
+
+        return type_name.str();
+    }
+
+    size_t GetWorkSpaceSize(const BaseArgument* p_arg) const override // bytes Run copies to workspace
+    {
+        return dynamic_cast<const Argument*>(p_arg)->gemm_kernel_args_.size() * // cast is unchecked
+               sizeof(GemmTransKernelArg);
+    }
+
+    static void SetKBatchSize(Argument& arg, index_t kbatch) { arg.UpdateKBatch(kbatch); } // forwards
+
+    // polymorphic: p_arg must point to this operation's Argument; the cast result is not checked
+    void SetKBatchSize(BaseArgument* p_arg, index_t kbatch) const override
+    {
+        return SetKBatchSize(*dynamic_cast<Argument*>(p_arg), kbatch); // static overload above
+    }
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
index fe4dce2b9..9bd860f39 100644
--- a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
+++ b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
@@ -587,4 +587,52 @@ struct OffsettedBlockToCTileMap
     index_t block_start_;
 };
 
+/**
+ * @brief      Simple tile mapping which creates a 3D grid of thread blocks.
+ *
+ * @paragraph  Description
+ *             This Block-to-C-tile-map creates a 3D grid (n_blocks, m_blocks, z_blocks) of thread
+ *             blocks. The first two dimensions are regular 2D tiles created by division of output
+ *             GEMM dimensions by corresponding tile size. The third dimension (Z) is a k-split
+ *             dimension: the number of blocks the work on the GEMM K dimension is divided onto.
+ *
+ * @tparam     MPerBlock  Output block tile size in M dimension.
+ * @tparam     NPerBlock  Output block tile size in N dimension.
+ */
+template <index_t MPerBlock, index_t NPerBlock>
+struct BlockToCTileMap_3DGrid_KSplit
+{
+
+    __host__ __device__ BlockToCTileMap_3DGrid_KSplit() = default;
+
+    __host__ __device__ constexpr auto
+    CalculateGridSize(index_t M, index_t N, index_t k_split) const
+    {
+        // Tiles per output dimension, rounded up so that partial tiles also get a block.
+        const auto M0 = math::integer_divide_ceil(M, MPerBlock);
+        const auto N0 = math::integer_divide_ceil(N, NPerBlock);
+        // Ordered (N0, M0, k_split) to mirror blockIdx (x, y, z) as read in CalculateBottomIndex.
+        return std::make_tuple(N0, M0, k_split);
+    }
+
+    template <typename TopIdx>
+    __device__ constexpr auto CalculateBottomIndex(const TopIdx&) const
+    {
+        return make_tuple(blockIdx.z, blockIdx.y, blockIdx.x); // argument ignored; (z, y, x) order
+    }
+
+    template <typename CTileIdx, typename CTileDim>
+    __host__ __device__ bool ValidCTileIndex(const CTileIdx& /* c_tile_idx */,
+                                             const CTileDim& /* c_tile_dim */) const
+    {
+        return true; // always valid provided that user gets grid size from CalculateGridSize()
+    }
+
+    template <typename CGridDesc_M_N>
+    __host__ bool CheckValidity(const CGridDesc_M_N& /* c_grid_desc_m_n */) const
+    {
+        return true; // the map holds no state, so there is nothing to validate
+    }
+};
+
 } // namespace ck
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
index 727f180e9..4a2a77ce0 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
@@ -15,16 +15,20 @@
 #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp"
+
 namespace ck {
 
 template <typename GridwiseGemm,
           bool HasMainKBlockLoop,
-          InMemoryDataOperationEnum CGlobalMemoryDataOperation>
+          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+          typename Block2CTileMap>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
     __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
-        kernel_gemm_xdlops_v2r4r2_simplified(typename GridwiseGemm::Argument karg)
+        kernel_gemm_xdlops_v2r4r2_simplified(typename GridwiseGemm::Argument karg,
+                                             const Block2CTileMap& b2c_map)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
     constexpr index_t shared_size = GridwiseGemm::GetSharedMemoryNumberOfByte();
@@ -32,9 +36,10 @@ __global__ void
     __shared__ uint8_t p_shared[shared_size];
 
     GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation>(
-        karg, static_cast<void*>(p_shared));
+        karg, static_cast<void*>(p_shared), b2c_map);
 #else
     ignore = karg;
+    ignore = b2c_map;
 #endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
 }
 
@@ -478,8 +483,21 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
                        Number<CShuffleNRepeatPerShuffle * NWave * NPerXDL>{}));
     }
 
-    template <bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation>
-    __device__ static void Run(const Argument& karg, void* __restrict__ p_shared_block)
+    // Default block-to-C-tile map: a BlockToCTileMap_3DGrid_KSplit for this kernel's tile sizes.
+    __host__ __device__ static constexpr auto MakeDefaultBlock2CTileMap()
+    {
+        return BlockToCTileMap_3DGrid_KSplit<MPerBlock, NPerBlock>();
+    }
+
+    using CGridDesc_M_N         = remove_cvref_t<decltype(MakeCGridDescriptor_M_N(1, 1, 1, 1, 1))>;
+    using DefaultBlock2CTileMap = remove_cvref_t<decltype(MakeDefaultBlock2CTileMap())>;
+
+    template <bool HasMainKBlockLoop,
+              InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+              typename Block2CTileMap>
+    __device__ static void Run(const Argument& karg,
+                               void* __restrict__ p_shared_block,
+                               const Block2CTileMap& block_2_ctile_map)
     {
         const FloatAB* p_a_grid          = karg.p_a_grid;
         const FloatAB* p_b_grid          = karg.p_b_grid;
@@ -504,11 +522,21 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
         auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
 
-        const auto K0 = a_b_k0_m_k1_grid_desc.GetLength(I1);
+        // divide block work by [KBatch, M, N]
+        const auto block_work_idx =
+            block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id()));
 
-        const index_t block_m_id = __builtin_amdgcn_readfirstlane(blockIdx.y);
-        const index_t block_n_id = __builtin_amdgcn_readfirstlane(blockIdx.x);
-        const index_t k_batch_id = __builtin_amdgcn_readfirstlane(blockIdx.z);
+        if(!block_2_ctile_map.ValidCTileIndex(
+               block_work_idx,
+               make_tuple(c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0),
+                          c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2))))
+        {
+            return;
+        }
+
+        const index_t block_m_id = __builtin_amdgcn_readfirstlane(block_work_idx[I1]);
+        const index_t block_n_id = __builtin_amdgcn_readfirstlane(block_work_idx[I2]);
+        const index_t k_batch_id = __builtin_amdgcn_readfirstlane(block_work_idx[I0]);
 
         // HACK: this force m/n_block_data_idx_on_grid into SGPR
         const index_t m_block_data_idx_on_grid =
@@ -651,6 +679,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
         //     c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
         //       register
         // sanity check
+#if 1
         auto blockwise_gemm =
             BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
                                                                 FloatAB,
@@ -662,6 +691,20 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
                                                                 MRepeat,
                                                                 NRepeat,
                                                                 K1>{};
+#else
+        auto blockwise_gemm = BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<
+            BlockSize,
+            FloatAB,
+            FloatAcc,
+            decltype(a_k0_m_k1_block_desc),
+            decltype(b_k0_n_k1_block_desc),
+            MPerXDL,
+            NPerXDL,
+            MRepeat,
+            NRepeat,
+            K1>{};
+
+#endif
 
         auto c_thread_buf = blockwise_gemm.GetCThreadBuffer();
 
@@ -680,6 +723,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
         auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
             p_b_block, b_k0_n_k1_block_desc.GetElementSpaceSize());
 
+#if 0
         // preload data into LDS
         {
             a_blockwise_copy.RunRead(a_b_k0_m_k1_grid_desc, a_grid_buf);
@@ -725,6 +769,31 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
 
             blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
         }
+#else
+        // gridwise GEMM pipeline
+        const auto gridwise_gemm_pipeline =
+            GridwiseGemmPipeline_Selector<PipelineVersion::v2, 1, LoopScheduler::Default>();
+
+        const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(
+            (a_b_k0_m_k1_grid_desc.GetLength(I1) * a_b_k0_m_k1_grid_desc.GetLength(I3)) /
+            (K0PerBlock * K1));
+
+        gridwise_gemm_pipeline.template Run<HasMainKBlockLoop>(a_b_k0_m_k1_grid_desc,
+                                                               a_b_k0_m_k1_block_desc,
+                                                               a_blockwise_copy,
+                                                               a_grid_buf,
+                                                               a_block_buf,
+                                                               a_block_slice_copy_step,
+                                                               b_b_k0_n_k1_grid_desc,
+                                                               b_b_k0_n_k1_block_desc,
+                                                               b_blockwise_copy,
+                                                               b_grid_buf,
+                                                               b_block_buf,
+                                                               b_block_slice_copy_step,
+                                                               blockwise_gemm,
+                                                               c_thread_buf,
+                                                               num_k_block_main_loop);
+#endif
 
         // output: register to global memory
         {
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp
index fa7462dbd..e38dad165 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp
@@ -68,6 +68,58 @@ void add_device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instances(
                                                   PassThrough,
                                                   PassThrough>>>& instances);
 
+void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Col,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  F16,
+                                                  F16,
+                                                  Empty_Tuple,
+                                                  F16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
+void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Row,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  F16,
+                                                  F16,
+                                                  Empty_Tuple,
+                                                  F16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
+void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Col,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  F16,
+                                                  F16,
+                                                  Empty_Tuple,
+                                                  F16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
+void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Row,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  F16,
+                                                  F16,
+                                                  Empty_Tuple,
+                                                  F16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
 template <typename ALayout,
           typename BLayout,
           typename ELayout,
@@ -109,11 +161,17 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                          is_same_v<ELayout, Row>)
             {
                 add_device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
+                add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
+                add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instances(
+                    op_ptrs);
             }
             else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
                               is_same_v<ELayout, Row>)
             {
                 add_device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
+                add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
+                add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instances(
+                    op_ptrs);
             }
             else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
                               is_same_v<ELayout, Row>)
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt
index 82beb2ace..b973b70aa 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt
@@ -3,4 +3,8 @@ add_instance_library(device_grouped_gemm_instance
    device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp
    device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp
    device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp
+   device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp
+   device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp
+   device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp
+   device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp
 )
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp
new file mode 100644
index 000000000..764ec0619
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using Empty_Tuple = ck::Tuple<>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+// a[m, k] * b[k, n] = e[m, n]
+using device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances = std::tuple<
+    // clang-format off
+        //################################|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|      DsData| EData|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //################################| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|        Type|  Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //################################|       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //################################|       |       |            |       |      |      |        |         |            |      |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        // Currently AK1 must equal BK1; the disabled instances directly below use AK1 = 8, BK1 = 2.
+        // DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   256,   128,    32,   8,   2,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 8, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,              8>,
+        // DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   256,    32,   8,   2,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,              8>,
+        // DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,   128,   128,    32,   8,   2,   32,   32,    4,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              2,         0,           1,           1,               S<1, 16, 1, 8>,              8>,
+        // DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   128,    32,   8,   2,   32,   32,    2,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 8, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,              8>,
+        // DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,   128,    64,    32,   8,   2,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 8, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              2,         0,           1,           1,               S<1, 32, 1, 4>,              8>,
+        // DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,    64,   128,    32,   8,   2,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              2,         0,           1,           1,               S<1, 16, 1, 8>,              8>,
+        // DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,    64,    32,   8,   2,   32,   32,    2,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 16,16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,              8>,
+        // DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,    64,   128,    32,   8,   2,   32,   32,    1,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 8, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>
+    // clang-format on
+    >;
+// Registration hook: passes the mk_kn_mn split-K instance tuple to add_device_operation_instances.
+void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Row,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  F16,
+                                                  F16,
+                                                  Empty_Tuple,
+                                                  F16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances, // presumably appends to the vector; TODO confirm
+                                   device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp
new file mode 100644
index 000000000..0385b0fc0
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp
@@ -0,0 +1,87 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using Empty_Tuple = ck::Tuple<>;
+
+using PassThrough                    = ck::tensor_operation::element_wise::PassThrough;
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+using device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_tile_instances = std::tuple<
+    // clang-format off
+        //################################|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|      DsData| EData|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //################################| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|        Type|  Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //################################|       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //################################|       |       |            |       |      |      |        |         |            |      |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        // Currently AK1 must equal BK1; the four disabled instances directly below use AK1 = 8, BK1 = 2.
+     // DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   2,   32,   32,    2,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 16,16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,              8>,
+     // DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   2,   32,   32,    1,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 8, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,              8>,
+     // DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   2,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 8, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              2,         0,           1,           1,               S<1, 32, 1, 4>,              8>,
+     // DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   2,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              2,         0,           1,           1,               S<1, 16, 1, 8>,              8>,
+        
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   192,    64,    32,   8,   8,   32,   32,    3,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
+//      DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,    64,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 48, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
+//      DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8>,
+//      DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 24, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   192,    32,    32,   8,   8,   32,   32,    3,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,    64,    32,   8,   8,   32,   32,    1,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    64,    32,    32,   8,   8,   32,   32,    1,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    64,    64,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8>
+    // clang-format on
+    >;
+// Registration hook: passes the MNKPadding irregular-tile tuple to add_device_operation_instances.
+void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Row,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  F16,
+                                                  F16,
+                                                  Empty_Tuple,
+                                                  F16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances)
+{
+    add_device_operation_instances( // presumably appends to `instances`; TODO confirm
+        instances, device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_tile_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp
new file mode 100644
index 000000000..cffb0fce1
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using Empty_Tuple = ck::Tuple<>;
+
+using PassThrough                 = ck::tensor_operation::element_wise::PassThrough;
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+// a[m, k] * b[n, k] = e[m, n]
+using device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances = std::tuple<
+    // clang-format off
+        //################################|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|      DsData| EData|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //################################| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|        Type|  Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //################################|       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //################################|       |       |            |       |      |      |        |         |            |      |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>
+    // clang-format on
+    >;
+
+void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Col,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  F16,
+                                                  F16,
+                                                  Empty_Tuple,
+                                                  F16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp
new file mode 100644
index 000000000..5933ff61e
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using Empty_Tuple = ck::Tuple<>;
+
+using PassThrough                    = ck::tensor_operation::element_wise::PassThrough;
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+using device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_tile_instances = std::tuple<
+    // clang-format off
+        //################################|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|      DsData| EData|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //################################| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|        Type|  Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //################################|       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //################################|       |       |            |       |      |      |        |         |            |      |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   192,    64,    32,   8,   8,   32,   32,    3,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
+//      DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,    64,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 48, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              4>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,    
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   192,    32,    32,   8,   8,   32,   32,    3,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,
+//      DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,   256,    32,   8,   8,   32,   32,    1,    4,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,    64,    32,   8,   8,   32,   32,    1,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    64,    32,    32,   8,   8,   32,   32,    1,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    64,    64,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8>
+    // clang-format on
+    >;
+
+void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Col,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  F16,
+                                                  F16,
+                                                  Empty_Tuple,
+                                                  F16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances, device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_tile_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/profiler/include/profiler/profile_grouped_gemm_impl.hpp b/profiler/include/profiler/profile_grouped_gemm_impl.hpp
index 2c28850e6..23dca244d 100644
--- a/profiler/include/profiler/profile_grouped_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_gemm_impl.hpp
@@ -8,6 +8,7 @@
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_grouped_gemm.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_gemm_splitk.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
 #include "ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp"
@@ -39,7 +40,8 @@ bool profile_grouped_gemm_impl(int do_verification,
                                const std::vector<int>& Ks,
                                const std::vector<int>& StrideAs,
                                const std::vector<int>& StrideBs,
-                               const std::vector<int>& StrideCs)
+                               const std::vector<int>& StrideCs,
+                               int kbatch = 1)
 {
 
     bool pass = true;
@@ -96,8 +98,6 @@ bool profile_grouped_gemm_impl(int do_verification,
             a_m_k[i].GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0}, num_thread);
             b_k_n[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5}, num_thread);
         }
-
-        c_m_n_device_results[i].GenerateTensorValue(GeneratorTensor_0<CDataType>{}, num_thread);
     }
 
     using AElementOp = ck::tensor_operation::element_wise::PassThrough;
@@ -132,13 +132,12 @@ bool profile_grouped_gemm_impl(int do_verification,
             std::make_unique<DeviceMem>(sizeof(ADataType) * a_m_k[i].mDesc.GetElementSpaceSize()));
         b_device_buf.emplace_back(
             std::make_unique<DeviceMem>(sizeof(BDataType) * b_k_n[i].mDesc.GetElementSpaceSize()));
-
         c_device_buf.emplace_back(std::make_unique<DeviceMem>(
             sizeof(CDataType) * c_m_n_device_results[i].mDesc.GetElementSpaceSize()));
 
         a_device_buf[i]->ToDevice(a_m_k[i].mData.data());
         b_device_buf[i]->ToDevice(b_k_n[i].mData.data());
-        c_device_buf[i]->ToDevice(c_m_n_device_results[i].mData.data());
+        c_device_buf[i]->SetZero();
 
         gemm_descs.push_back({Ms[i], Ns[i], Ks[i], StrideAs[i], StrideBs[i], StrideCs[i], {}});
 
@@ -197,6 +196,28 @@ bool profile_grouped_gemm_impl(int do_verification,
         {
             std::string gemm_name = gemm_ptr->GetTypeString();
 
+            if(kbatch > 1)
+            {
+                using DeviceOpSplitK =
+                    ck::tensor_operation::device::DeviceGroupedGemmSplitK<ALayout,
+                                                                          BLayout,
+                                                                          ck::Tuple<>,
+                                                                          CLayout,
+                                                                          ADataType,
+                                                                          BDataType,
+                                                                          ck::Tuple<>,
+                                                                          CDataType,
+                                                                          AElementOp,
+                                                                          BElementOp,
+                                                                          CElementOp>;
+
+                if(dynamic_cast<DeviceOpSplitK*>(gemm_ptr.get()) != nullptr)
+                {
+                    dynamic_cast<DeviceOpSplitK*>(gemm_ptr.get())
+                        ->SetKBatchSize(argument_ptr.get(), kbatch);
+                }
+            }
+
             float ave_time =
                 invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
 
diff --git a/profiler/src/profile_grouped_gemm.cpp b/profiler/src/profile_grouped_gemm.cpp
index 871b2edfd..34647adab 100644
--- a/profiler/src/profile_grouped_gemm.cpp
+++ b/profiler/src/profile_grouped_gemm.cpp
@@ -52,20 +52,24 @@ std::vector<int> argToIntArray(char* input)
 
 int profile_grouped_gemm(int argc, char* argv[])
 {
-    if(!(argc == 14))
+    if(argc < 14)
     {
-        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
-        printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n");
-        printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
-        printf("                     1: A[m, k] * B[n, k] = C[m, n];\n");
-        printf("                     2: A[k, m] * B[k, n] = C[m, n];\n");
-        printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
-        printf("arg4: verification (0: no; 1: yes)\n");
-        printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
-        printf("arg6: print tensor value (0: no; 1: yes)\n");
-        printf("arg7: time kernel (0=n0, 1=yes)\n");
-        printf("arg8 to 13: Ms, Ns, Ks, StrideAs, StrideBs, StrideCs (e.g., 256,256 128,128 64,64 "
-               "64,64 64,64 128,128)\n");
+        std::cout
+            << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
+            << "arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n"
+            << "arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n"
+            << "                     1: A[m, k] * B[n, k] = C[m, n];\n"
+            << "                     2: A[k, m] * B[k, n] = C[m, n];\n"
+            << "                     3: A[k, m] * B[n, k] = C[m, n])\n"
+            << "arg4: verification (0: no; 1: yes)\n"
+            << "arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"
+            << "arg6: print tensor value (0: no; 1: yes)\n"
+            << "arg7: time kernel (0=no, 1=yes)\n"
+            << "arg8 to 13: Ms, Ns, Ks, StrideAs, StrideBs, StrideCs (e.g., 256,256 128,128 64,64 "
+               "64,64 64,64 128,128)\n"
+            << "arg14: kbatch value (default 1)\n" // optional: parsed from argv[14] below
+            << std::endl;
+
         exit(1);
     }
 
@@ -83,6 +87,7 @@ int profile_grouped_gemm(int argc, char* argv[])
     const auto StrideAs = argToIntArray(argv[11]);
     const auto StrideBs = argToIntArray(argv[12]);
     const auto StrideCs = argToIntArray(argv[13]);
+    const int kbatch    = argc == 15 ? std::stoi(argv[14]) : 1;
 
     if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
     {
@@ -101,7 +106,8 @@ int profile_grouped_gemm(int argc, char* argv[])
                                                                                    Ks,
                                                                                    StrideAs,
                                                                                    StrideBs,
-                                                                                   StrideCs);
+                                                                                   StrideCs,
+                                                                                   kbatch);
     }
     else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
     {
@@ -120,7 +126,8 @@ int profile_grouped_gemm(int argc, char* argv[])
                                                                                    Ks,
                                                                                    StrideAs,
                                                                                    StrideBs,
-                                                                                   StrideCs);
+                                                                                   StrideCs,
+                                                                                   kbatch);
     }
     else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN)
     {
@@ -139,7 +146,8 @@ int profile_grouped_gemm(int argc, char* argv[])
                                                                                    Ks,
                                                                                    StrideAs,
                                                                                    StrideBs,
-                                                                                   StrideCs);
+                                                                                   StrideCs,
+                                                                                   kbatch);
     }
     else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN)
     {
@@ -158,7 +166,8 @@ int profile_grouped_gemm(int argc, char* argv[])
                                                                                    Ks,
                                                                                    StrideAs,
                                                                                    StrideBs,
-                                                                                   StrideCs);
+                                                                                   StrideCs,
+                                                                                   kbatch);
     }
     else
     {
-- 
GitLab


From 7613c1d9b9dc612a5de79ab968c534ea58e7cbe4 Mon Sep 17 00:00:00 2001
From: Jun Liu <Liu.Jun@amd.com>
Date: Wed, 26 Apr 2023 13:41:03 -0700
Subject: [PATCH 29/71] [CK] suppress unsafe buffer warn (#687)

incomplete fix from https://github.com/ROCmSoftwarePlatform/composable_kernel/pull/670

The warning is triggered not only in gtest but also in CK's own code.

We need to fix these as a quality improvement, but for now we suppress this warning for the immediate releases:
http://compiler-ci.amd.com/blue/rest/organizations/jenkins/pipelines/compiler-psdb-amd-stg-open/runs/2540/nodes/282/steps/3202/log/?start=0

e.g.
```
[2023-04-26T17:26:31.524Z] /jenkins/workspace/compiler-psdb-amd-stg-open/Libs/MIOpen/deps_hip/cget/build/tmp-a3db5da587a64213bde99fb856db1b43/composable_kernel-0f98035df1cc5ba3e90ab03187e672b426a25b00/include/ck/utility/generic_memory_space_atomic.hpp:52:19: error: unsafe pointer arithmetic [-Werror,-Wunsafe-buffer-usage]
[2023-04-26T17:26:31.524Z]         atomicAdd(c_style_pointer_cast<float*>(p_dst) + 1, vx.template AsType<float>()[I1]);
[2023-04-26T17:26:31.524Z]                   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
```
```
[2023-04-26T17:26:31.523Z] /jenkins/workspace/compiler-psdb-amd-stg-open/Libs/MIOpen/deps_hip/cget/build/tmp-a3db5da587a64213bde99fb856db1b43/composable_kernel-0f98035df1cc5ba3e90ab03187e672b426a25b00/include/ck/utility/amd_inline_asm.hpp:62:20: error: 'p_a_half2' is an unsafe pointer used for buffer access [-Werror,-Wunsafe-buffer-usage]
[2023-04-26T17:26:31.523Z]     const half2_t* p_a_half2  = c_style_pointer_cast<const half2_t*>(&a);
[2023-04-26T17:26:31.523Z]     ~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
```
---
 cmake/EnableCompilerWarnings.cmake | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cmake/EnableCompilerWarnings.cmake b/cmake/EnableCompilerWarnings.cmake
index 87bcb08e8..369cd0b54 100644
--- a/cmake/EnableCompilerWarnings.cmake
+++ b/cmake/EnableCompilerWarnings.cmake
@@ -92,6 +92,7 @@ else()
                 -Wno-unused-command-line-argument
                 -Wno-weak-vtables
                 -Wno-covered-switch-default
+                -Wno-unsafe-buffer-usage
             )
         else()
             if (CMAKE_${COMPILER}_COMPILER_ID MATCHES "GNU" AND ${COMPILER} MATCHES "CXX")
-- 
GitLab


From 54c90aae13622fa5cc9457af22bd55540a6f794b Mon Sep 17 00:00:00 2001
From: Haocong WANG <haocwang@amd.com>
Date: Thu, 27 Apr 2023 04:58:57 +0800
Subject: [PATCH 30/71] add vector load check (#680)

Co-authored-by: zjing14 <zhangjing14@gmail.com>
---
 .../device_gemm_multiple_d_wmma_cshuffle.hpp  | 89 ++++++++++++++++++-
 .../gpu/device/impl/device_gemm_wmma.hpp      | 71 ++++++++++++++-
 2 files changed, 158 insertions(+), 2 deletions(-)

diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp
index b59bb2b30..750df31a3 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp
@@ -273,7 +273,10 @@ struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleD<ALayout,
               N01_{N01},
               a_element_op_{a_element_op},
               b_element_op_{b_element_op},
-              cde_element_op_{cde_element_op}
+              cde_element_op_{cde_element_op},
+              MRaw_{M},
+              NRaw_{N},
+              KRaw_{K}
         {
             a_grid_desc_k0_m_k1_ = DeviceOp::MakeAGridDescriptor_K0_M_K1(M, K, StrideA);
             b_grid_desc_k0_n_k1_ = DeviceOp::MakeBGridDescriptor_K0_N_K1(K, N, StrideB);
@@ -335,6 +338,11 @@ struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleD<ALayout,
         AElementwiseOperation a_element_op_;
         BElementwiseOperation b_element_op_;
         CDEElementwiseOperation cde_element_op_;
+
+        // for checking vector load/store
+        index_t MRaw_;
+        index_t NRaw_;
+        index_t KRaw_;
     };
 
     // Invoker
@@ -488,6 +496,85 @@ struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleD<ALayout,
         {
             return false;
         }
+        // check vector load/store
+        {
+            using Row = ck::tensor_layout::gemm::RowMajor;
+            using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+            // check vector load of A
+            if constexpr(is_same_v<ALayout, Row> && ABlockTransferSrcVectorDim == 2)
+            {
+                if(arg.KRaw_ % ABlockTransferSrcScalarPerVector != 0)
+                {
+                    return false;
+                }
+            }
+            else if constexpr(is_same_v<ALayout, Col> && ABlockTransferSrcVectorDim == 1)
+            {
+                // FIXME: not rigorous
+                if(arg.MRaw_ % ABlockTransferSrcScalarPerVector != 0)
+                {
+                    return false;
+                }
+            }
+            else
+            {
+                return false;
+            }
+
+            // check vector load of B
+            if constexpr(is_same_v<BLayout, Col> && BBlockTransferSrcVectorDim == 2)
+            {
+                if(arg.KRaw_ % BBlockTransferSrcScalarPerVector != 0)
+                {
+                    return false;
+                }
+            }
+            else if constexpr(is_same_v<BLayout, Row> && BBlockTransferSrcVectorDim == 1)
+            {
+                // FIXME: not rigorous
+                if(arg.NRaw_ % BBlockTransferSrcScalarPerVector != 0)
+                {
+                    return false;
+                }
+            }
+            else
+            {
+                return false;
+            }
+
+            // check vector load of Ds
+            // only support RowMajor for now
+            bool all_valid = true;
+
+            static_for<0, NumDTensor, 1>{}([&](auto i) {
+                using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
+
+                if constexpr(!is_same_v<DLayout, Row>)
+                {
+                    all_valid = false;
+                }
+            });
+
+            if(!all_valid)
+            {
+                return false;
+            }
+
+            // check vector store of E
+            // only support RowMajor for now
+            if constexpr(is_same_v<ELayout, Row>)
+            {
+                if(arg.NRaw_ % CDEShuffleBlockTransferScalarPerVector_NPerBlock != 0)
+                {
+                    return false;
+                }
+            }
+            else
+            {
+                return false;
+            }
+        }
 
         return GridwiseOp::CheckValidity(arg.a_grid_desc_k0_m_k1_,
                                          arg.b_grid_desc_k0_n_k1_,
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp
index 9b0ff3f46..03ffcf8e5 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp
@@ -239,7 +239,10 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm<ALayout,
               N01_{N01},
               a_element_op_{a_element_op},
               b_element_op_{b_element_op},
-              c_element_op_{c_element_op}
+              c_element_op_{c_element_op},
+              MRaw_{M},
+              NRaw_{N},
+              KRaw_{K}
         {
             a_grid_desc_k0_m_k1_ =
                 DeviceGemmWmma_CShuffle::MakeAGridDescriptor_K0_M_K1(M, K, StrideA);
@@ -276,6 +279,10 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm<ALayout,
         AElementwiseOperation a_element_op_;
         BElementwiseOperation b_element_op_;
         CElementwiseOperation c_element_op_;
+        // for checking vector load/store
+        index_t MRaw_;
+        index_t NRaw_;
+        index_t KRaw_;
     };
 
     // Invoker
@@ -417,6 +424,68 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm<ALayout,
             return false;
         }
 
+        // check vector load/store
+        {
+            using Row = ck::tensor_layout::gemm::RowMajor;
+            using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+            // check vector load of A
+            if constexpr(is_same_v<ALayout, Row> && ABlockTransferSrcVectorDim == 2)
+            {
+                if(arg.KRaw_ % ABlockTransferSrcScalarPerVector != 0)
+                {
+                    return false;
+                }
+            }
+            else if constexpr(is_same_v<ALayout, Col> && ABlockTransferSrcVectorDim == 1)
+            {
+                // FIXME: not rigorous
+                if(arg.MRaw_ % ABlockTransferSrcScalarPerVector != 0)
+                {
+                    return false;
+                }
+            }
+            else
+            {
+                return false;
+            }
+
+            // check vector load of B
+            if constexpr(is_same_v<BLayout, Col> && BBlockTransferSrcVectorDim == 2)
+            {
+                if(arg.KRaw_ % BBlockTransferSrcScalarPerVector != 0)
+                {
+                    return false;
+                }
+            }
+            else if constexpr(is_same_v<BLayout, Row> && BBlockTransferSrcVectorDim == 1)
+            {
+                // FIXME: not rigorous
+                if(arg.NRaw_ % BBlockTransferSrcScalarPerVector != 0)
+                {
+                    return false;
+                }
+            }
+            else
+            {
+                return false;
+            }
+
+            // check vector store of C
+            // only support RowMajor for now
+            if constexpr(is_same_v<CLayout, Row>)
+            {
+                if(arg.NRaw_ % CShuffleBlockTransferScalarPerVector_NPerBlock != 0)
+                {
+                    return false;
+                }
+            }
+            else
+            {
+                return false;
+            }
+        }
+
         return GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_,
                                            arg.b_grid_desc_k0_n_k1_,
                                            arg.c_grid_desc_m_n_,
-- 
GitLab


From 4feebedd413212fcd455da4a2bfcb40e9e6cab5a Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Fri, 28 Apr 2023 16:22:59 -0700
Subject: [PATCH 31/71] Syncing up from internal repo to enable MI300. (#690)

* enable gfx940

* switch between intrinsic mfma routines on mi100/200 and mi300

* fix mfma_int8 on MI300

* disable 2 int8 examples on MI300

* Update cmake-ck-dev.sh

* restore gitignore file

* modify Jenkinsfile to the internal repo

---------

Co-authored-by: Jing Zhang <jizha@amd.com>
Co-authored-by: zjing14 <zhangjing14@gmail.com>
---
 example/31_batched_gemm_gemm/CMakeLists.txt   |  4 +-
 .../41_grouped_conv_conv_fwd/CMakeLists.txt   |  5 +-
 include/ck/ck.hpp                             | 18 +++---
 .../device_gemm_xdl_waveletmodel_cshuffle.hpp |  6 +-
 ...gemm_softmax_gemm_permute_xdl_cshuffle.hpp |  6 +-
 ...tk_contraction_multiple_d_xdl_cshuffle.hpp |  6 +-
 ...ed_contraction_multiple_d_xdl_cshuffle.hpp |  6 +-
 .../device_batched_gemm_e_permute_xdl.hpp     |  3 +-
 .../device_batched_gemm_gemm_xdl_cshuffle.hpp |  6 +-
 .../impl/device_batched_gemm_multi_d_xdl.hpp  |  6 +-
 ...ultiple_d_gemm_multiple_d_xdl_cshuffle.hpp |  6 +-
 ...evice_batched_gemm_reduce_xdl_cshuffle.hpp |  3 +-
 ...gemm_softmax_gemm_permute_xdl_cshuffle.hpp |  6 +-
 ...batched_gemm_softmax_gemm_xdl_cshuffle.hpp |  6 +-
 .../device/impl/device_batched_gemm_xdl.hpp   |  3 +-
 ...ce_contraction_multiple_d_xdl_cshuffle.hpp |  6 +-
 ...evice_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp |  3 +-
 .../impl/device_gemm_bias_e_permute_xdl.hpp   |  6 +-
 ...gemm_multiple_d_layernorm_xdl_cshuffle.hpp |  6 +-
 ...emm_multiple_d_multiple_r_xdl_cshuffle.hpp |  6 +-
 .../device_gemm_multiple_d_xdl_cshuffle.hpp   |  6 +-
 .../gpu/device/impl/device_gemm_xdl.hpp       |  2 +-
 .../device/impl/device_gemm_xdl_cshuffle.hpp  |  3 +-
 .../device_gemm_xdl_layernorm_cshuffle.hpp    |  3 +-
 ...ed_contraction_multiple_d_xdl_cshuffle.hpp |  6 +-
 ...nv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp |  3 +-
 ...bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp |  3 +-
 ...fwd_multiple_d_multiple_r_xdl_cshuffle.hpp |  5 +-
 ...ouped_conv_fwd_multiple_d_xdl_cshuffle.hpp |  5 +-
 .../device/impl/device_grouped_gemm_xdl.hpp   |  3 +-
 ...e_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp |  3 +-
 .../gridwise_gemm_reduce_xdl_cshuffle_v1.hpp  |  3 +-
 .../grid/gridwise_gemm_xdl_cshuffle_v1.hpp    |  3 +-
 ...ridwise_gemm_xdl_layernorm_cshuffle_v1.hpp |  3 +-
 .../grid/gridwise_gemm_xdlops_bwd_weight.hpp  |  3 +-
 .../gridwise_gemm_xdlops_skip_b_lds_v1.hpp    |  3 +-
 .../gpu/grid/gridwise_gemm_xdlops_v2r3.hpp    |  3 +-
 .../gpu/grid/gridwise_gemm_xdlops_v2r4.hpp    |  3 +-
 .../gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp  |  3 +-
 .../gpu/grid/gridwise_gemm_xdlops_v3r1.hpp    |  3 +-
 .../gpu/grid/gridwise_gemm_xdlops_v3r2.hpp    |  3 +-
 .../gpu/grid/gridwise_gemm_xdlops_v3r3.hpp    |  3 +-
 .../tensor_operation/gpu/warp/xdlops_gemm.hpp | 60 ++++++++++++++++++-
 include/ck/utility/amd_xdlops.hpp             | 40 ++++++++++++-
 include/ck/utility/data_type.hpp              |  2 +
 script/cmake-ck-dev.sh                        |  3 +-
 script/cmake-ck-release.sh                    |  3 +-
 47 files changed, 228 insertions(+), 72 deletions(-)

diff --git a/example/31_batched_gemm_gemm/CMakeLists.txt b/example/31_batched_gemm_gemm/CMakeLists.txt
index d79248251..ad40c96b4 100644
--- a/example/31_batched_gemm_gemm/CMakeLists.txt
+++ b/example/31_batched_gemm_gemm/CMakeLists.txt
@@ -1,7 +1,9 @@
 add_example_executable(example_batched_gemm_gemm_xdl_fp32 batched_gemm_gemm_xdl_fp32.cpp)
 add_example_executable(example_batched_gemm_gemm_xdl_fp16 batched_gemm_gemm_xdl_fp16.cpp)
 add_example_executable(example_batched_gemm_gemm_xdl_bf16 batched_gemm_gemm_xdl_bf16.cpp)
-add_example_executable(example_batched_gemm_gemm_xdl_int8 batched_gemm_gemm_xdl_int8.cpp)
+if(NOT GPU_TARGETS MATCHES "gfx940")
+	add_example_executable(example_batched_gemm_gemm_xdl_int8 batched_gemm_gemm_xdl_int8.cpp)
+endif()
 
 if(USE_BITINT_EXTENSION_INT4)
 add_example_executable(example_batched_gemm_gemm_xdl_int4 batched_gemm_gemm_xdl_int4.cpp)
diff --git a/example/41_grouped_conv_conv_fwd/CMakeLists.txt b/example/41_grouped_conv_conv_fwd/CMakeLists.txt
index 9cb30f617..4eb79371a 100644
--- a/example/41_grouped_conv_conv_fwd/CMakeLists.txt
+++ b/example/41_grouped_conv_conv_fwd/CMakeLists.txt
@@ -1,8 +1,9 @@
 add_example_executable(example_grouped_conv_conv_fwd_xdl_fp32 grouped_conv_conv_fwd_xdl_fp32.cpp)
 add_example_executable(example_grouped_conv_conv_fwd_xdl_fp16 grouped_conv_conv_fwd_xdl_fp16.cpp)
 add_example_executable(example_grouped_conv_conv_fwd_xdl_bf16 grouped_conv_conv_fwd_xdl_bf16.cpp)
-add_example_executable(example_grouped_conv_conv_fwd_xdl_int8 grouped_conv_conv_fwd_xdl_int8.cpp)
-
+if(NOT GPU_TARGETS MATCHES "gfx940")
+	add_example_executable(example_grouped_conv_conv_fwd_xdl_int8 grouped_conv_conv_fwd_xdl_int8.cpp)
+endif()
 if(USE_BITINT_EXTENSION_INT4)
 add_example_executable(example_grouped_conv_conv_fwd_xdl_int4 grouped_conv_conv_fwd_xdl_int4.cpp)
 endif(USE_BITINT_EXTENSION_INT4)
diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp
index 036ca24a4..cb20ea249 100644
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -31,7 +31,7 @@
 #ifndef __HIP_DEVICE_COMPILE__ // for host code
 #define CK_BUFFER_RESOURCE_3RD_DWORD -1
 #elif defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) || defined(__gfx908__) || \
-    defined(__gfx90a__) // for GPU code
+    defined(__gfx90a__) || defined(__gfx940__) // for GPU code
 #define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000
 #elif defined(__gfx1030__) // for GPU code
 #define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
@@ -43,8 +43,8 @@
 #ifndef __HIP_DEVICE_COMPILE__                   // for host code, define nothing
 #elif defined(__gfx803__) || defined(__gfx900__) // for GPU code
 #define CK_USE_AMD_V_MAC_F32
-#elif defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx1030__) // for GPU code
+#elif defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__) || \
+    defined(__gfx940__) // for GPU code
 #define CK_USE_AMD_V_FMAC_F32
 #define CK_USE_AMD_V_DOT2_F32_F16
 #define CK_USE_AMD_V_DOT4_I32_I8
@@ -53,14 +53,18 @@
 // MFMA instruction
 #ifndef __HIP_DEVICE_COMPILE__ // for host code
 #define CK_USE_AMD_MFMA
-#elif defined(__gfx908__) || defined(__gfx90a__) // for GPU code
+#elif defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) // for GPU code
 #define CK_USE_AMD_MFMA
 #endif
 
-#if defined(__gfx90a__)
+#if(defined(__gfx90a__) || defined(__gfx940__))
 #define CK_USE_AMD_MFMA_BF16_1K_OP
 #endif
 
+#if defined(__gfx940__)
+#define CK_USE_AMD_MFMA_GFX940
+#endif
+
 // WMMA instruction
 #ifndef __HIP_DEVICE_COMPILE__ // for host code
 #define CK_USE_AMD_WMMA
@@ -80,13 +84,13 @@
 // buffer atomic add: floating point
 #ifndef __HIP_DEVICE_COMPILE__ // for host code
 #define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1
-#elif defined(__gfx908__) || defined(__gfx90a__) // for GPU code
+#elif defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) // for GPU code
 #define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1
 #else // for GPU code
 #define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 0
 #endif
 
-#if defined(__gfx90a__) // for GPU code
+#if(defined(__gfx90a__) || defined(__gfx940__)) // for GPU code
 #define CK_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 1
 #else
 #define CK_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 0
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_waveletmodel_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_waveletmodel_cshuffle.hpp
index d985d0f92..af38f1425 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_waveletmodel_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_waveletmodel_cshuffle.hpp
@@ -47,7 +47,8 @@ __global__ void
                 e_grid_desc_mblock_mperblock_nblock_nperblock,
             const Block2ETileMap block_2_etile_map)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
@@ -416,7 +417,8 @@ struct DeviceGemm_Xdl_WaveletModel_CShuffle : public DeviceGemm<ALayout,
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
+        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
+             ck::get_device_name() == "gfx940"))
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
index 70f3d0277..0b1db2846 100644
--- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
@@ -43,7 +43,8 @@ __global__ void
             const B1ElementwiseOperation b1_element_op,
             const CElementwiseOperation c_element_op)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     const index_t block_id = get_block_1d_id();
@@ -678,7 +679,8 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
+        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
+             ck::get_device_name() == "gfx940"))
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d_xdl_cshuffle.hpp
index 8eab1cdee..70990e795 100644
--- a/include/ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d_xdl_cshuffle.hpp
@@ -56,7 +56,8 @@ __global__ void
             const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
             const Block2ETileMap block_2_etile_map)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     const index_t num_blocks_per_batch =
@@ -938,7 +939,8 @@ struct DeviceSplitKContractionMultipleD_Xdl_CShuffle
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
+        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
+             ck::get_device_name() == "gfx940"))
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp
index 2237ad944..9bf8f5ccd 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp
@@ -56,7 +56,8 @@ __global__ void
             const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
             const Block2ETileMap block_2_etile_map)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     const index_t num_blocks_per_batch =
@@ -839,7 +840,8 @@ struct DeviceBatchedContractionMultipleD_Xdl_CShuffle
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
+        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
+             ck::get_device_name() == "gfx940"))
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_e_permute_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_e_permute_xdl.hpp
index 01f5e17d9..201844586 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_e_permute_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_e_permute_xdl.hpp
@@ -74,7 +74,8 @@ __global__ void
                                           const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
                                           const Block2ETileMap block_2_etile_map)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
     const index_t num_blocks_per_batch =
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
     const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp
index 3b87e5633..20e9920d9 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp
@@ -60,7 +60,8 @@ __global__ void
             const index_t batch_count,
             const ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
     const index_t num_blocks_per_batch =
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
@@ -588,7 +589,8 @@ struct DeviceBatchedGemmGemm_Xdl_CShuffle : public DeviceBatchedGemmGemm<ALayout
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
+        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
+             ck::get_device_name() == "gfx940"))
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp
index e34c19bdf..0df346094 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp
@@ -83,7 +83,8 @@ __global__ void
                                 const Block2ETileMap block_2_etile_map)
 {
 
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
     const index_t num_blocks_per_batch =
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
     const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
@@ -579,7 +580,8 @@ struct DeviceBatchedGemmMultiD_Xdl : public DeviceBatchedGemmMultiD<ALayout,
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
+        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
+             ck::get_device_name() == "gfx940"))
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp
index 4fc8e69d2..196dc86da 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp
@@ -68,7 +68,8 @@ __global__ void
             const index_t batch_count,
             const ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
     const index_t num_blocks_per_batch =
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
@@ -804,7 +805,8 @@ struct DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
+        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
+             ck::get_device_name() == "gfx940"))
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp
index 080e26ea8..ef9b90ba7 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp
@@ -59,7 +59,8 @@ __global__ void
             const ComputeBasePrtOfBatch compute_base_ptr_of_batch_,
             const Block2CTileMap block_2_ctile_map)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
     const index_t num_blocks_per_batch =
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
     const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
index 037867d5f..0c6c0ef7a 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
@@ -67,7 +67,8 @@ __global__ void
             const ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch,
             const C0MatrixMask c0_matrix_mask)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
     const index_t num_blocks_per_batch =
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
@@ -714,7 +715,8 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
         arg.Print();
 #endif
 
-        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
+        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
+             ck::get_device_name() == "gfx940"))
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp
index 1f21f2d71..84edde63e 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp
@@ -62,7 +62,8 @@ __global__ void
             const ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch,
             const C0MatrixMask c0_matrix_mask)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
     const index_t num_blocks_per_batch =
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
@@ -612,7 +613,8 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
+        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
+             ck::get_device_name() == "gfx940"))
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp
index 48a224456..d35f19417 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp
@@ -75,7 +75,8 @@ __global__ void
             const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
             const Block2CTileMap block_2_ctile_map)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
     const index_t num_blocks_per_batch =
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
     const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp
index 7a4c8bf26..1eaffe705 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp
@@ -52,7 +52,8 @@ __global__ void
                 e_grid_desc_mblock_mperblock_nblock_nperblock,
             const Block2ETileMap block_2_etile_map)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
@@ -581,7 +582,8 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
+        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
+             ck::get_device_name() == "gfx940"))
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp
index 31c761e09..d52879cd9 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp
@@ -55,7 +55,8 @@ __global__ void
             const CElementwiseOperation c_element_op,
             const Block2CTileMap block_2_ctile_map)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
     const index_t num_blocks_per_batch =
         __builtin_amdgcn_readfirstlane(get_grid_size() / num_batches);
     const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_bias_e_permute_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_bias_e_permute_xdl.hpp
index e95bf2f81..9f9fe0f1c 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_bias_e_permute_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_bias_e_permute_xdl.hpp
@@ -51,7 +51,8 @@ __global__ void
                                        e_grid_desc_mblock_mperblock_nblock_nperblock,
                                    const Block2ETileMap block_2_etile_map)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
@@ -456,7 +457,8 @@ struct DeviceGemmBiasEPermute_Xdl : public DeviceGemmBiasCPermute<AElementwiseOp
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
+        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
+             ck::get_device_name() == "gfx940"))
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_layernorm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_layernorm_xdl_cshuffle.hpp
index b53927a9e..a383b0bb7 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_layernorm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_layernorm_xdl_cshuffle.hpp
@@ -63,7 +63,8 @@ __global__ void
             const Block2ETileMap block_2_etile_map,
             index_t NRaw)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
     __shared__ char p_shared[GridwiseGemmWelford::GetSharedMemoryNumberOfByte()];
 
     GridwiseGemmWelford::template Run<HasMainKBlockLoop>(
@@ -854,7 +855,8 @@ struct DeviceGemmMultipleDLayernorm_Xdl_CShuffle
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
+        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
+             ck::get_device_name() == "gfx940"))
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp
index f1185357a..4c1c3ab7b 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp
@@ -60,7 +60,8 @@ __global__ void
             const RsGridDescriptor_MBlock_MPerBlock rs_grid_desc_mblock_mperblock,
             const Block2ETileMap block_2_etile_map)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
@@ -554,7 +555,8 @@ struct DeviceGemmMultipleDMultipleR_Xdl_CShuffle
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
+        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
+             ck::get_device_name() == "gfx940"))
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp
index 36e810512..248810148 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp
@@ -51,7 +51,8 @@ __global__ void
                                                 e_grid_desc_mblock_mperblock_nblock_nperblock,
                                             const Block2ETileMap block_2_etile_map)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
@@ -490,7 +491,8 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
+        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
+             ck::get_device_name() == "gfx940"))
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp
index 48fb64637..a5051455b 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp
@@ -428,7 +428,7 @@ struct DeviceGemmXdl : public DeviceGemm<ALayout,
                 return false;
             }
         }
-        else if(ck::get_device_name() == "gfx90a")
+        else if(ck::get_device_name() == "gfx90a" || ck::get_device_name() == "gfx940")
         {
             if constexpr(!(is_same_v<AccDataType, float> || is_same_v<AccDataType, float> ||
                            is_same_v<AccDataType, int32_t> || is_same_v<AccDataType, double>))
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp
index 6621f25bd..7cd0ff72e 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp
@@ -574,7 +574,8 @@ struct DeviceGemm_Xdl_CShuffle : public DeviceGemm<ALayout,
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
+        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
+             ck::get_device_name() == "gfx940"))
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp
index 2a3e74a4c..8ee138f82 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp
@@ -648,7 +648,8 @@ struct DeviceGemmLayerNorm_Xdl_CShuffle : public BaseOperator
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
+        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
+             ck::get_device_name() == "gfx940"))
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp
index 03d9e26a4..76dd5a366 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp
@@ -37,7 +37,8 @@ __global__ void
             const BElementwiseOperation b_element_op,
             const CDEElementwiseOperation cde_element_op)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     const index_t block_id = get_block_1d_id();
@@ -703,7 +704,8 @@ struct DeviceGroupedContractionMultipleD_Xdl_CShuffle
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
+        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
+             ck::get_device_name() == "gfx940"))
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
index 4cef40444..92c20a308 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
@@ -130,7 +130,8 @@ __global__ void
             const Block2ETileMap block_2_ctile_map,
             const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
     // offset base pointer for each work-group
     const index_t num_blocks_per_batch =
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp
index 34db9f2a5..c921c9f1b 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp
@@ -78,7 +78,8 @@ __global__ void
             const Block2CTileMap block_2_ctile_map,
             const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
     const index_t num_blocks_per_batch =
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
     const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp
index b60f94d14..de40d7129 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp
@@ -155,7 +155,8 @@ __global__ void
             const Block2ETileMap block_2_ctile_map,
             const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
     const index_t num_blocks_per_batch =
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
     const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
@@ -810,7 +811,7 @@ struct DeviceGroupedConvFwdMultipleDMultipleR_Xdl_CShuffle
                 return false;
             }
         }
-        else if(get_device_name() == "gfx90a")
+        else if(get_device_name() == "gfx90a" || get_device_name() == "gfx940")
         {
             if constexpr(!(is_same_v<AccDataType, float> || is_same_v<AccDataType, float> ||
                            is_same_v<AccDataType, int32_t> || is_same_v<AccDataType, double>))
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp
index 7bab2d040..02458bf02 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp
@@ -135,7 +135,8 @@ __global__ void
             const Block2ETileMap block_2_ctile_map,
             const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
     // offset base pointer for each work-group
     const index_t num_blocks_per_batch =
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
@@ -684,7 +685,7 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
                 return false;
             }
         }
-        else if(get_device_name() == "gfx90a")
+        else if(get_device_name() == "gfx90a" || get_device_name() == "gfx940")
         {
             if constexpr(!(is_same_v<AccDataType, float> || is_same_v<AccDataType, float> ||
                            is_same_v<AccDataType, int32_t> || is_same_v<AccDataType, double>))
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp
index a009bcb3b..e3795060b 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp
@@ -38,7 +38,8 @@ __global__ void
                                 const BElementwiseOperation b_element_op,
                                 const CDEElementwiseOperation c_element_op)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     const index_t block_id = get_block_1d_id();
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp
index 16ba23280..bebcdceb4 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp
@@ -66,7 +66,8 @@ __global__ void
             const ReduceGridDescriptor_MBlock_MPerBlock reduce_grid_desc_mblock_mperblock,
             const Block2CTileMap block_2_ctile_map)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp
index 2fe550684..a3f532471 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp
@@ -54,7 +54,8 @@ __global__ void
             const ReduceGridDescriptor_MBlock_MPerBlock reduce_grid_desc_mblock_mperblock,
             const Block2CTileMap block_2_ctile_map)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
index ecc528a7e..1213cdc26 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
@@ -44,7 +44,8 @@ __global__ void
                                         c_grid_desc_mblock_mperblock_nblock_nperblock,
                                     const Block2CTileMap block_2_ctile_map)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp
index 94e181cd4..2d4ebe707 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp
@@ -57,7 +57,8 @@ __global__ void
             const C0GridDescriptor_NBlock_NPerBlock c0_grid_desc_nblock_nperblock,
             const Block2CTileMap block_2_ctile_map)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     // TODO ANT: separate into MMA + Epilogue
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp
index 2da92466b..1979331d0 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp
@@ -165,7 +165,8 @@ __global__ void
                                       const CElementwiseOperation c_element_op,
                                       const CBlockClusterAdaptor c_block_cluster_adaptor)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_skip_b_lds_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_skip_b_lds_v1.hpp
index 2aad7128f..8d86f3c1d 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_skip_b_lds_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_skip_b_lds_v1.hpp
@@ -44,7 +44,8 @@ __global__ void
             const CElementwiseOperation c_element_op,
             const Block2CTileMap block_2_ctile_map)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     GridwiseGemm::template Run<HasMainK0BlockLoop>(p_a_grid,
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
index 51c578385..775b77118 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
@@ -43,7 +43,8 @@ __global__ void
             const CElementwiseOperation c_element_op,
             const Block2CTileMap block_2_ctile_map)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp
index 949d56483..55f465a03 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp
@@ -42,7 +42,8 @@ __global__ void
                                 const CElementwiseOperation c_element_op,
                                 const CBlockClusterAdaptor c_block_cluster_adaptor)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
     constexpr index_t shared_block_size =
         GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
index 4a2a77ce0..b393c4897 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
@@ -30,7 +30,8 @@ __global__ void
         kernel_gemm_xdlops_v2r4r2_simplified(typename GridwiseGemm::Argument karg,
                                              const Block2CTileMap& b2c_map)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
     constexpr index_t shared_size = GridwiseGemm::GetSharedMemoryNumberOfByte();
 
     __shared__ uint8_t p_shared[shared_size];
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp
index ffb2926c8..8259927fe 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp
@@ -46,7 +46,8 @@ __global__ void
             const CElementwiseOperation c_element_op,
             const Block2CTileMap block_2_ctile_map)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     GridwiseGemm::template Run<HasMainK0BlockLoop>(
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp
index 7e6dbb3b2..5d5fdae17 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp
@@ -49,7 +49,8 @@ __global__ void
             const CElementwiseOperation c_element_op,
             const Block2CTileMap block_2_ctile_map)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     GridwiseGemm::template Run<HasMainKBlockLoop>(
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp
index fb1e34b98..dc83f8e98 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp
@@ -53,7 +53,8 @@ __global__ void
             const CElementwiseOperation c_element_op,
             const Block2CTileMap block_2_ctile_map)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     GridwiseGemm::template Run<HasMainKBlockLoop>(
diff --git a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
index 4d53f0d81..319487bc0 100644
--- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
@@ -27,6 +27,8 @@ enum struct MfmaInstr
     mfma_f32_16x16x8bf16,
     mfma_i32_32x32x8i8,
     mfma_i32_16x16x16i8,
+    mfma_i32_32x32x16i8,
+    mfma_i32_16x16x32i8,
     mfma_f64_16x16x4f64
 };
 
@@ -386,6 +388,50 @@ struct mfma_type<MfmaInstr::mfma_i32_16x16x16i8>
     }
 };
 
+template <>
+struct mfma_type<MfmaInstr::mfma_i32_32x32x16i8>
+{
+    static constexpr index_t group_size          = 4;
+    static constexpr index_t num_groups_per_blk  = 4;
+    static constexpr index_t num_regs_per_blk    = 16;
+    static constexpr index_t num_threads_per_blk = 32;
+    static constexpr index_t wave_size           = 64;
+    static constexpr index_t num_input_blks      = 2;
+    static constexpr index_t num_output_blks     = 1;
+    static constexpr index_t m_per_blk           = 32;
+    static constexpr index_t n_per_blk           = 32;
+    static constexpr index_t k_per_blk           = 8;
+    static constexpr bool is_k_reduction         = true;
+
+    template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    {
+        intrin_mfma_i32_32x32x16i8<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
+    }
+};
+
+template <>
+struct mfma_type<MfmaInstr::mfma_i32_16x16x32i8>
+{
+    static constexpr index_t group_size          = 4;
+    static constexpr index_t num_groups_per_blk  = 1;
+    static constexpr index_t num_regs_per_blk    = 4;
+    static constexpr index_t num_threads_per_blk = 16;
+    static constexpr index_t wave_size           = 64;
+    static constexpr index_t num_input_blks      = 4;
+    static constexpr index_t num_output_blks     = 1;
+    static constexpr index_t m_per_blk           = 16;
+    static constexpr index_t n_per_blk           = 16;
+    static constexpr index_t k_per_blk           = 8;
+    static constexpr bool is_k_reduction         = true;
+
+    template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    {
+        intrin_mfma_i32_16x16x32i8<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
+    }
+};
+
 template <>
 struct mfma_type<MfmaInstr::mfma_f64_16x16x4f64>
 {
@@ -524,17 +570,29 @@ struct MfmaSelector
 #endif
     }
 
+#if defined(CK_USE_AMD_MFMA_GFX940)
+    template <>
+    static constexpr auto GetMfma<int8_t, 32, 32>()
+    {
+        return MfmaInstr::mfma_i32_32x32x16i8;
+    }
+    template <>
+    static constexpr auto GetMfma<int8_t, 16, 16>()
+    {
+        return MfmaInstr::mfma_i32_16x16x32i8;
+    }
+#else
     template <>
     static constexpr auto GetMfma<int8_t, 32, 32>()
     {
         return MfmaInstr::mfma_i32_32x32x8i8;
     }
-
     template <>
     static constexpr auto GetMfma<int8_t, 16, 16>()
     {
         return MfmaInstr::mfma_i32_16x16x16i8;
     }
+#endif
 
     static constexpr auto selected_mfma = mfma_type<GetMfma<base_type, MPerXdlops, NPerXdlops>()>{};
 
diff --git a/include/ck/utility/amd_xdlops.hpp b/include/ck/utility/amd_xdlops.hpp
index b4be0cbee..a742496fc 100644
--- a/include/ck/utility/amd_xdlops.hpp
+++ b/include/ck/utility/amd_xdlops.hpp
@@ -297,6 +297,44 @@ struct intrin_mfma_i32_16x16x16i8<16, 16>
     }
 };
 
+template <index_t MPerWave, index_t NPerWave>
+struct intrin_mfma_i32_32x32x16i8;
+
+template <>
+struct intrin_mfma_i32_32x32x16i8<32, 32>
+{
+    template <class FloatC>
+    __device__ static void Run(const int8x8_t& reg_a, const int8x8_t& reg_b, FloatC& reg_c)
+    {
+        reg_c.template AsType<int32x16_t>()(Number<0>{}) =
+            __builtin_amdgcn_mfma_i32_32x32x16_i8(bit_cast<int64_t>(reg_a),
+                                                  bit_cast<int64_t>(reg_b),
+                                                  reg_c.template AsType<int32x16_t>()[Number<0>{}],
+                                                  0,
+                                                  0,
+                                                  0);
+    }
+};
+
+template <index_t MPerWave, index_t NPerWave>
+struct intrin_mfma_i32_16x16x32i8;
+
+template <>
+struct intrin_mfma_i32_16x16x32i8<16, 16> // wrapper around the 16x16x32 int8 MFMA builtin
+{
+    template <class FloatC>
+    __device__ static void Run(const int8x8_t& reg_a, const int8x8_t& reg_b, FloatC& reg_c)
+    {
+        reg_c.template AsType<int32x4_t>()(Number<0>{}) =
+            __builtin_amdgcn_mfma_i32_16x16x32_i8(bit_cast<int64_t>(reg_a),
+                                                  bit_cast<int64_t>(reg_b),
+                                                  reg_c.template AsType<int32x4_t>()[Number<0>{}],
+                                                  0,
+                                                  0,
+                                                  0);
+    }
+};
+
 template <index_t MPerWave, index_t NPerWave>
 struct intrin_mfma_f64_16x16x4f64;
 
@@ -306,7 +344,7 @@ struct intrin_mfma_f64_16x16x4f64<16, 16>
     template <class FloatC>
     __device__ static void Run(const double& reg_a, const double& reg_b, FloatC& reg_c)
     {
-#ifdef __gfx90a__
+#if defined(__gfx90a__) || defined(__gfx940__)
         reg_c.template AsType<double4_t>()(Number<0>{}) = __builtin_amdgcn_mfma_f64_16x16x4f64(
             reg_a, reg_b, reg_c.template AsType<double4_t>()[Number<0>{}], 0, 0, 0);
 #else
diff --git a/include/ck/utility/data_type.hpp b/include/ck/utility/data_type.hpp
index 079b0cb86..101061191 100644
--- a/include/ck/utility/data_type.hpp
+++ b/include/ck/utility/data_type.hpp
@@ -898,6 +898,8 @@ struct vector_type<T, 256>
     }
 };
 
+using int64_t = long;
+
 // fp64
 using double2_t = typename vector_type<double, 2>::type;
 using double4_t = typename vector_type<double, 4>::type;
diff --git a/script/cmake-ck-dev.sh b/script/cmake-ck-dev.sh
index 8f462237f..426f68d44 100755
--- a/script/cmake-ck-dev.sh
+++ b/script/cmake-ck-dev.sh
@@ -12,9 +12,8 @@ cmake
 -save-temps=$PWD"                                                                                 \
 -D CMAKE_BUILD_TYPE=Release                                                                       \
 -D BUILD_DEV=ON                                                                                   \
--D GPU_TARGETS="gfx908;gfx90a"                                                                    \
+-D GPU_TARGETS="gfx908;gfx90a;gfx940"                                                             \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                                 \
 -D USE_BITINT_EXTENSION_INT4=OFF                                                                  \
 ${MY_PROJECT_SOURCE}
 
-#-D AMDGPU_TARGETS=gfx90a;gfx908
diff --git a/script/cmake-ck-release.sh b/script/cmake-ck-release.sh
index 268b1ebf9..787eabbf9 100755
--- a/script/cmake-ck-release.sh
+++ b/script/cmake-ck-release.sh
@@ -11,9 +11,8 @@ cmake
 -D CMAKE_CXX_FLAGS="-O3"                                                                          \
 -D CMAKE_BUILD_TYPE=Release                                                                       \
 -D BUILD_DEV=OFF                                                                                  \
--D GPU_TARGETS="gfx908;gfx90a"                                                                      \
+-D GPU_TARGETS="gfx908;gfx90a;gfx940"                                                             \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                                 \
 -D USE_BITINT_EXTENSION_INT4=OFF                                                                  \
 ${MY_PROJECT_SOURCE}
 
-#-D AMDGPU_TARGETS=gfx90a;gfx908
-- 
GitLab


From f53ede26e5cb58cb90d9c6c96dafc9a61d1dccc8 Mon Sep 17 00:00:00 2001
From: zjing14 <zhangjing14@gmail.com>
Date: Tue, 2 May 2023 10:30:23 -0500
Subject: [PATCH 32/71] fixed init range (#691)

---
 profiler/include/profiler/profile_gemm_splitk_impl.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/profiler/include/profiler/profile_gemm_splitk_impl.hpp b/profiler/include/profiler/profile_gemm_splitk_impl.hpp
index 233fb15c0..4cc62509d 100644
--- a/profiler/include/profiler/profile_gemm_splitk_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_splitk_impl.hpp
@@ -72,8 +72,8 @@ bool profile_gemm_splitk_impl(int do_verification,
     {
     case 0: break;
     case 1:
-        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{0, 1});
-        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-1, 1});
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-1, 2});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-1, 2});
         break;
     default:
         a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
@@ -94,7 +94,7 @@ bool profile_gemm_splitk_impl(int do_verification,
 
     a_device_buf.ToDevice(a_m_k.mData.data());
     b_device_buf.ToDevice(b_k_n.mData.data());
-    c_device_buf.ToDevice(c_m_n_device_result.mData.data());
+    c_device_buf.SetZero();
 
     using DeviceOp = ck::tensor_operation::device::DeviceGemmSplitK<ALayout,
                                                                     BLayout,
-- 
GitLab


From 86e0190ec9abcf19b8864b6d4a35dc22f4be6350 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Wed, 3 May 2023 08:18:10 -0700
Subject: [PATCH 33/71] update daily build from rocm 5.4.3 to 5.5 (#693)

---
 Jenkinsfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 6cb458031..83559c223 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -19,7 +19,7 @@ def runShell(String command){
 
 def getDockerImageName(){
     def img
-    if (params.ROCMVERSION != "5.5" && params.ROCMVERSION != "5.6"){
+    if (params.ROCMVERSION != "5.6"){
        if (params.COMPILER_VERSION == "") {
            img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}"
        }
@@ -597,7 +597,7 @@ def process_results(Map conf=[:]){
 
 //launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
 CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true
-                                              0 21 * * * % ROCMVERSION=5.4.3;COMPILER_VERSION=release;COMPILER_COMMIT=
+                                              0 21 * * * % ROCMVERSION=5.5;COMPILER_VERSION=release;COMPILER_COMMIT=
                                               0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-stg-open;COMPILER_COMMIT=''' : ""
 
 pipeline {
-- 
GitLab


From 4a51d2da9de3524e96f2abed5f843ff9da535db3 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Wed, 3 May 2023 08:25:25 -0700
Subject: [PATCH 34/71] Fix grouped_gemm_splitk kernels on MI300. (#694)

* replace amd_buffer_atomic_add with hip_atomic_add

* fix grouped_gemm_splitk kernels on mi300

* fix syntax

* revert experimental atomic_add changes

---------

Co-authored-by: Jing Zhang <jizhan@amd.com>
---
 example/15_grouped_gemm/run_grouped_gemm_example.inc           | 2 +-
 .../device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp    | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/example/15_grouped_gemm/run_grouped_gemm_example.inc b/example/15_grouped_gemm/run_grouped_gemm_example.inc
index bceff29b6..320870e0d 100644
--- a/example/15_grouped_gemm/run_grouped_gemm_example.inc
+++ b/example/15_grouped_gemm/run_grouped_gemm_example.inc
@@ -147,7 +147,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
 #else
         a_tensors_device[i]->ToDevice(a_tensors[i].mData.data());
         b_tensors_device[i]->ToDevice(b_tensors[i].mData.data());
-	c_tensors_device[i]->SetZero();
+        c_tensors_device[i]->SetZero();
 #endif
 
         p_a.push_back(a_tensors_device[i]->GetDeviceBuffer());
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
index 26a4319ea..467a8429a 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
@@ -34,7 +34,8 @@ __global__ void
         kernel_grouped_gemm_xdl_splitk(const void CK_CONSTANT_ADDRESS_SPACE* gemm_descs_const,
                                        const index_t group_count)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
     constexpr index_t shared_size = GridwiseGemm::GetSharedMemoryNumberOfByte();
     __shared__ uint8_t p_shared[shared_size];
 
-- 
GitLab


From b8635a25b2ce87f70433b32e00858c6ae0f39fde Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Wed, 3 May 2023 16:27:04 -0700
Subject: [PATCH 35/71] Fix the group of quantization_int8 kernels on MI300.
 (#695)

* replace amd_buffer_atomic_add with hip_atomic_add

* fix grouped_gemm_splitk kernels on mi300

* fix syntax

* revert experimental atomic_add changes

* fix the group of kernels from ticket 723 on MI300

---------

Co-authored-by: Jing Zhang <jizhan@amd.com>
---
 .../device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp | 5 +++--
 .../gpu/device/impl/device_gemm_multiple_d_dl.hpp            | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
index 66b47abb6..5880f5f60 100644
--- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
@@ -135,7 +135,7 @@ __global__ void
             const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx1030__) || \
-    defined(__gfx90a__) || defined(__gfx908__))
+    defined(__gfx90a__) || defined(__gfx908__) || defined(__gfx940__))
     // offset base pointer for each work-group
     const index_t num_blocks_per_batch =
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
@@ -710,7 +710,8 @@ struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
 
         // check device
         if(!(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx1030" ||
-             ck::get_device_name() == "gfx90a" || ck::get_device_name() == "gfx908"))
+             ck::get_device_name() == "gfx90a" || ck::get_device_name() == "gfx908" ||
+             ck::get_device_name() == "gfx940"))
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp
index 636846930..4397b6f99 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp
@@ -51,7 +51,7 @@ __global__ void
             const Block2CTileMap block_2_ctile_map)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \
-    defined(__gfx90a__) || defined(__gfx1030__))
+    defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx1030__))
 
     constexpr index_t shared_block_size =
         GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(ABDataType);
@@ -552,7 +552,8 @@ struct DeviceGemmMultipleD_Dl : public DeviceGemmMultipleD<ALayout,
     static bool IsSupportedArgument(const Argument& arg)
     {
         if(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx908" ||
-           ck::get_device_name() == "gfx90a" || ck::get_device_name() == "gfx1030")
+           ck::get_device_name() == "gfx90a" || ck::get_device_name() == "gfx1030" ||
+           ck::get_device_name() == "gfx940")
         {
             return GridwiseGemm::CheckValidity(
                 arg.a_grid_desc_k0_m_k1_, arg.b_grid_desc_k0_n_k1_, arg.e_grid_desc_m_n_);
-- 
GitLab


From b076a02ad25892192be9fefb08a53e89fa48160c Mon Sep 17 00:00:00 2001
From: Rostyslav Geyyer <46627076+geyyer@users.noreply.github.com>
Date: Thu, 4 May 2023 10:25:47 -0500
Subject: [PATCH 36/71] Optimize bf16 conversion (#664)

* Add TypeConvert class and start refactoring

* Refactor TypeConvert as a struct

* Get back to template functions type_convert

* Add a type_convert_bf16_rtn, set rtz as default

* Clean up

* Add UnaryConvertPrecision struct for high-precision workloads

* Format

* Update type_convert to UnaryConvert on threadwise level

* Update UnaryConvertPrecision

* Format

* Fix chmod

* Add a flag to pick conversion method

* Format

* Remove the added flag

* Merge elementwise op with type conversion

* Move type_convert to elemwise op, update the op

* Update type_convert_precision -> bf16_convert_rtn

* Clean up

* Update comments

* Update the CK_WORKAROUND_DENORM_FIX flag handling

* Update the unneeded op to work but warn user

* Remove the message

* Use a PassThrough instead of ConvertBF16RTN to calculate reference

* Format

* Add missing include
---
 include/ck/ck.hpp                             |  5 +-
 .../element/unary_element_wise_operation.hpp  | 23 +++++
 .../gridwise_gemm_multiple_d_xdl_cshuffle.hpp |  2 +-
 .../grid/gridwise_gemm_xdlops_bwd_weight.hpp  |  2 +-
 .../gpu/grid/gridwise_gemm_xdlops_v2r3.hpp    |  2 +-
 .../threadwise_tensor_slice_transfer_v3r1.hpp | 27 ++----
 include/ck/utility/data_type.hpp              | 88 ++++++++++++-------
 .../cpu/reference_gemm.hpp                    | 23 ++++-
 8 files changed, 116 insertions(+), 56 deletions(-)

diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp
index cb20ea249..1626597ed 100644
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -175,7 +175,10 @@
 // denorm test fix, required to work around dissue
 #ifndef CK_WORKAROUND_DENORM_FIX
 #define CK_WORKAROUND_DENORM_FIX 0
-#endif
+#elif !defined(__gfx90a__) // enable only on MI200: force the workaround off on other targets
+#undef CK_WORKAROUND_DENORM_FIX
+#define CK_WORKAROUND_DENORM_FIX 0
+#endif // CK_WORKAROUND_DENORM_FIX
 
 namespace ck {
 
diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
index 2987def02..ef250b8bf 100644
--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -56,6 +56,12 @@ struct PassThrough
         y = type_convert<bhalf_t>(x);
     }
 
+    template <>
+    __host__ __device__ void operator()<bhalf_t, half_t>(bhalf_t& y, const half_t& x) const
+    {
+        y = type_convert<bhalf_t>(x);
+    }
+
     template <>
     __host__ __device__ void operator()<int8_t, int8_t>(int8_t& y, const int8_t& x) const
     {
@@ -86,6 +92,23 @@ struct UnaryConvert
     }
 };
 
+struct ConvertBF16RTN
+{
+    // convert to bf16 using round to nearest (rtn)
+    template <typename Y, typename X>
+    __host__ __device__ void operator()(Y& y, const X& x) const
+    {
+        // check Y datatype
+        static_assert(is_same<Y, bhalf_t>::value, "Data type is not supported by this operation!");
+
+        // check X datatype
+        static_assert(is_same<X, float>::value || is_same<X, half_t>::value,
+                      "Data type is not supported by this operation!");
+
+        y = bf16_convert_rtn<Y>(x);
+    }
+};
+
 struct Scale
 {
     __host__ __device__ Scale(float scale) : scale_(scale) {}
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp
index 98a71a7c2..ec1cc5399 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp
@@ -96,7 +96,7 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
     // we convert fp16->fp32->bf16 and execute bf16 mfma instruction
     // when mfma if fixed, remove this section and update
     // ABDataTypeAdjusted -> ABDataType throughout this file
-#if CK_WORKAROUND_DENORM_FIX && defined(__gfx90a__)
+#if CK_WORKAROUND_DENORM_FIX
     using ABDataTypeAdjusted =
         conditional_t<is_same_v<ABDataType, ck::half_t>, ck::bhalf_t, ABDataType>;
 #else
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp
index 1979331d0..da7ad1cac 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp
@@ -266,7 +266,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
     // we convert fp16->fp32->bf16 and execute bf16 mfma instruction
     // when mfma if fixed, remove this section and update
     // FloatABAdjusted -> FloatAB throughout this file
-#if CK_WORKAROUND_DENORM_FIX && defined(__gfx90a__)
+#if CK_WORKAROUND_DENORM_FIX
     using FloatABAdjusted = conditional_t<is_same_v<FloatAB, ck::half_t>, ck::bhalf_t, FloatAB>;
 #else
     using FloatABAdjusted = FloatAB;
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
index 775b77118..f4504a940 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
@@ -136,7 +136,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
     // we convert fp16->fp32->bf16 and execute bf16 mfma instruction
     // when mfma if fixed, remove this section and update
     // FloatABAdjusted -> FloatAB throughout this file
-#if CK_WORKAROUND_DENORM_FIX && defined(__gfx90a__)
+#if CK_WORKAROUND_DENORM_FIX
     using FloatABAdjusted = conditional_t<is_same_v<FloatAB, ck::half_t>, ck::bhalf_t, FloatAB>;
 #else
     using FloatABAdjusted = FloatAB;
diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
index cba06f8e8..6665d765f 100644
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
@@ -6,6 +6,7 @@
 #include "ck/utility/common_header.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
 #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
 #include "ck/tensor/static_tensor.hpp"
 
@@ -207,15 +208,6 @@ struct ThreadwiseTensorSliceTransfer_v3r1
             auto src_vector_container = src_vector_type{
                 src_buf.template Get<src_vector_t>(src_coord_.GetOffset(), is_src_valid)};
 
-            // apply SrcElementwiseOperation on src_vector_container
-            static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
-                SrcData src_v;
-
-                src_element_op_(src_v, src_vector_container.template AsType<SrcData>()[i]);
-
-                src_vector_container.template AsType<SrcData>()(i) = src_v;
-            });
-
             // copy data from src_vector_container into src_thread_scratch_
             src_thread_scratch_tuple_(thread_scratch_id)
                 .template SetAsType<src_vector_t>(
@@ -318,7 +310,6 @@ struct ThreadwiseTensorSliceTransfer_v3r1
                 constexpr auto data_idx_seq = generate_sequence_v2(
                     [&](auto i) { return Number<data_idx[i]>{}; }, Number<nDim>{});
 
-                // TODO type_convert is not used yet!!!!!
                 using src_vector_t = vector_type_maker_t<SrcData, SrcScalarPerVector>;
                 using dst_vector_t = vector_type_maker_t<DstData, DstScalarPerVector>;
 
@@ -342,19 +333,17 @@ struct ThreadwiseTensorSliceTransfer_v3r1
                     Number<num_dst_vector>{});
 
                 // do data transpose
-                // TODO type_convert is not used yet!!!!!
                 transpose_vectors<SrcData, DstScalarPerVector, SrcScalarPerVector>{}(
                     src_vector_refs, dst_vector_refs);
             });
         }
-        else
-        {
-            static_ford<SliceLengths>{}([&](auto idx) {
-                // convert from SrcData to DstData here
-                dst_thread_scratch_(idx) =
-                    type_convert<DstData>(src_thread_scratch_tuple_[thread_scratch_id][idx]);
-            });
-        }
+
+        static_ford<SliceLengths>{}([&](auto idx) {
+            // apply the src elementwise op and convert to DstData under the hood if needed
+            DstData dst_v;
+            src_element_op_(dst_v, src_thread_scratch_tuple_[thread_scratch_id][idx]);
+            dst_thread_scratch_(idx) = dst_v;
+        });
 #endif
     }
 
diff --git a/include/ck/utility/data_type.hpp b/include/ck/utility/data_type.hpp
index 101061191..d43af8a2e 100644
--- a/include/ck/utility/data_type.hpp
+++ b/include/ck/utility/data_type.hpp
@@ -976,37 +976,6 @@ inline __host__ __device__ constexpr bhalf_t type_convert<bhalf_t, float>(float
         uint32_t int32;
     } u = {x};
 
-    // When the exponent bits are not all 1s, then the value is zero, normal,
-    // or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus
-    // 1 if the least significant bit of the bfloat16 mantissa is 1 (odd).
-    // This causes the bfloat16's mantissa to be incremented by 1 if the 16
-    // least significant bits of the float mantissa are greater than 0x8000,
-    // or if they are equal to 0x8000 and the least significant bit of the
-    // bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when
-    // the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already
-    // has the value 0x7f, then incrementing it causes it to become 0x00 and
-    // the exponent is incremented by one, which is the next higher FP value
-    // to the unrounded bfloat16 value. When the bfloat16 value is subnormal
-    // with an exponent of 0x00 and a mantissa of 0x7f, it may be rounded up
-    // to a normal value with an exponent of 0x01 and a mantissa of 0x00.
-    // When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F,
-    // incrementing it causes it to become an exponent of 0xFF and a mantissa
-    // of 0x00, which is Inf, the next higher value to the unrounded value.
-    bool flag0 = ~u.int32 & 0x7f800000;
-
-    // When all of the exponent bits are 1, the value is Inf or NaN.
-    // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero
-    // mantissa bit. Quiet NaN is indicated by the most significant mantissa
-    // bit being 1. Signaling NaN is indicated by the most significant
-    // mantissa bit being 0 but some other bit(s) being 1. If any of the
-    // lower 16 bits of the mantissa are 1, we set the least significant bit
-    // of the bfloat16 mantissa, in order to preserve signaling NaN in case
-    // the bfloat16's mantissa bits are all 0.
-    bool flag1 = !flag0 && (u.int32 & 0xffff);
-
-    u.int32 += flag0 ? 0x7fff + ((u.int32 >> 16) & 1) : 0; // Round to nearest, round to even
-    u.int32 |= flag1 ? 0x10000 : 0x0;                      // Preserve signaling NaN
-
     return uint16_t(u.int32 >> 16);
 }
 
@@ -1064,6 +1033,63 @@ inline __host__ __device__ constexpr bhalf_t type_convert<bhalf_t, int8_t>(int8_
     return type_convert<bhalf_t>(x_fp32);
 }
 
+// Declare a template function for bf16 conversion using RTN
+template <typename Y, typename X>
+__host__ __device__ constexpr Y bf16_convert_rtn(X x);
+
+// Convert fp32 to bf16 with RTN if higher precision is needed
+template <>
+inline __host__ __device__ constexpr bhalf_t bf16_convert_rtn<bhalf_t, float>(float x)
+{
+    union
+    {
+        float fp32;
+        uint32_t int32;
+    } u = {x};
+
+    // When the exponent bits are not all 1s, then the value is zero, normal,
+    // or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus
+    // 1 if the least significant bit of the bfloat16 mantissa is 1 (odd).
+    // This causes the bfloat16's mantissa to be incremented by 1 if the 16
+    // least significant bits of the float mantissa are greater than 0x8000,
+    // or if they are equal to 0x8000 and the least significant bit of the
+    // bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when
+    // the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already
+    // has the value 0x7f, then incrementing it causes it to become 0x00 and
+    // the exponent is incremented by one, which is the next higher FP value
+    // to the unrounded bfloat16 value. When the bfloat16 value is subnormal
+    // with an exponent of 0x00 and a mantissa of 0x7f, it may be rounded up
+    // to a normal value with an exponent of 0x01 and a mantissa of 0x00.
+    // When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F,
+    // incrementing it causes it to become an exponent of 0xFF and a mantissa
+    // of 0x00, which is Inf, the next higher value to the unrounded value.
+    bool flag0 = ~u.int32 & 0x7f800000;
+
+    // When all of the exponent bits are 1, the value is Inf or NaN.
+    // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero
+    // mantissa bit. Quiet NaN is indicated by the most significant mantissa
+    // bit being 1. Signaling NaN is indicated by the most significant
+    // mantissa bit being 0 but some other bit(s) being 1. If any of the
+    // lower 16 bits of the mantissa are 1, we set the least significant bit
+    // of the bfloat16 mantissa, in order to preserve signaling NaN in case
+    // the bfloat16's mantissa bits are all 0.
+    bool flag1 = !flag0 && (u.int32 & 0xffff);
+
+    u.int32 += flag0 ? 0x7fff + ((u.int32 >> 16) & 1) : 0; // Round to nearest, round to even
+    u.int32 |= flag1 ? 0x10000 : 0x0;                      // Preserve signaling NaN
+
+    return uint16_t(u.int32 >> 16);
+}
+
+// convert fp16 to bfp16 via fp32 with RTN if higher precision is needed
+template <>
+inline __host__ __device__ constexpr bhalf_t bf16_convert_rtn<bhalf_t, half_t>(half_t x)
+{
+    float x_fp32 = static_cast<float>(x);
+
+    return bf16_convert_rtn<bhalf_t>(x_fp32);
+}
+
 template <typename T>
 struct NumericLimits
 {
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
index 6728bb1f4..be69f297b 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
@@ -6,6 +6,7 @@
 #include <iostream>
 #include <sstream>
 
+#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
 #include "ck/tensor_operation/gpu/device/device_base.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 
@@ -66,8 +67,26 @@ struct ReferenceGemm : public device::BaseOperator
                     ADataType v_a;
                     BDataType v_b;
 
-                    arg.a_element_op_(v_a, arg.a_m_k_(m, k));
-                    arg.b_element_op_(v_b, arg.b_k_n_(k, n));
+                    // use PassThrough instead of ConvertBF16RTN for reference calculation
+                    if constexpr(is_same_v<AElementwiseOperation,
+                                           ck::tensor_operation::element_wise::ConvertBF16RTN>)
+                    {
+                        ck::tensor_operation::element_wise::PassThrough{}(v_a, arg.a_m_k_(m, k));
+                    }
+                    else
+                    {
+                        arg.a_element_op_(v_a, arg.a_m_k_(m, k));
+                    }
+                    // same for B matrix
+                    if constexpr(is_same_v<BElementwiseOperation,
+                                           ck::tensor_operation::element_wise::ConvertBF16RTN>)
+                    {
+                        ck::tensor_operation::element_wise::PassThrough{}(v_b, arg.b_k_n_(k, n));
+                    }
+                    else
+                    {
+                        arg.b_element_op_(v_b, arg.b_k_n_(k, n));
+                    }
 
                     v_acc +=
                         ck::type_convert<AccDataType>(v_a) * ck::type_convert<AccDataType>(v_b);
-- 
GitLab


From a1e344b1aef6e08298e4d8167aafdaa3a1f5f741 Mon Sep 17 00:00:00 2001
From: rocking <ChunYu.Lai@amd.com>
Date: Thu, 11 May 2023 20:15:02 +0800
Subject: [PATCH 37/71] Normalization/split k (#615)

---
 example/27_layernorm/CMakeLists.txt           |   3 +-
 example/27_layernorm/common.hpp               |  22 +
 example/27_layernorm/layernorm_fp16.cpp       |  39 ++
 .../27_layernorm/layernorm_splitk_fp16.cpp    |  40 ++
 ...lockwise.cpp => run_layernorm_example.inc} |  63 +-
 example/42_groupnorm/CMakeLists.txt           |   1 +
 example/42_groupnorm/common.hpp               |   1 +
 .../42_groupnorm/groupnorm_splitk_fp16.cpp    |  40 ++
 .../42_groupnorm/run_groupnorm_example.inc    |   4 +
 ...gemm_multiple_d_layernorm_xdl_cshuffle.hpp |   2 +-
 .../device/impl/device_normalization_impl.hpp |  49 +-
 .../impl/device_normalization_splitk_impl.hpp | 658 ++++++++++++++++++
 .../gridwise_normalization_naive_variance.hpp |   0
 .../gridwise_normalization_selector.hpp       |   4 +-
 .../gridwise_normalization_splitk_1st.hpp     | 252 +++++++
 .../gridwise_normalization_splitk_2nd.hpp     | 418 +++++++++++
 ...ridwise_normalization_welford_variance.hpp |   0
 17 files changed, 1514 insertions(+), 82 deletions(-)
 create mode 100644 example/27_layernorm/common.hpp
 create mode 100644 example/27_layernorm/layernorm_fp16.cpp
 create mode 100644 example/27_layernorm/layernorm_splitk_fp16.cpp
 rename example/27_layernorm/{layernorm_blockwise.cpp => run_layernorm_example.inc} (58%)
 create mode 100644 example/42_groupnorm/groupnorm_splitk_fp16.cpp
 create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_normalization_splitk_impl.hpp
 rename include/ck/tensor_operation/gpu/grid/{ => normalization}/gridwise_normalization_naive_variance.hpp (100%)
 rename include/ck/tensor_operation/gpu/grid/{ => normalization}/gridwise_normalization_selector.hpp (98%)
 create mode 100644 include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_1st.hpp
 create mode 100644 include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_2nd.hpp
 rename include/ck/tensor_operation/gpu/grid/{ => normalization}/gridwise_normalization_welford_variance.hpp (100%)

diff --git a/example/27_layernorm/CMakeLists.txt b/example/27_layernorm/CMakeLists.txt
index d96deae45..94c23ce77 100644
--- a/example/27_layernorm/CMakeLists.txt
+++ b/example/27_layernorm/CMakeLists.txt
@@ -1 +1,2 @@
-add_example_executable(example_layernorm_blockwise layernorm_blockwise.cpp)
+add_example_executable(example_layernorm_fp16 layernorm_fp16.cpp)
+add_example_executable(example_layernorm_splitk_fp16 layernorm_splitk_fp16.cpp)
diff --git a/example/27_layernorm/common.hpp b/example/27_layernorm/common.hpp
new file mode 100644
index 000000000..8d833a3ae
--- /dev/null
+++ b/example/27_layernorm/common.hpp
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <getopt.h>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_normalization_splitk_impl.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_common_util.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
diff --git a/example/27_layernorm/layernorm_fp16.cpp b/example/27_layernorm/layernorm_fp16.cpp
new file mode 100644
index 000000000..c15ffabf5
--- /dev/null
+++ b/example/27_layernorm/layernorm_fp16.cpp
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+using XDataType       = ck::half_t;
+using GammaDataType   = ck::half_t;
+using BetaDataType    = ck::half_t;
+using YDataType       = ck::half_t;
+using ComputeDataType = float;
+using PassThrough     = ck::tensor_operation::element_wise::PassThrough;
+
+constexpr int Rank         = 2;
+constexpr int NumReduceDim = 1;
+
+using DeviceInstance =
+    ck::tensor_operation::device::DeviceNormalizationImpl<XDataType,
+                                                          GammaDataType,
+                                                          BetaDataType,
+                                                          ComputeDataType,
+                                                          YDataType,
+                                                          PassThrough,
+                                                          Rank,
+                                                          NumReduceDim,
+                                                          256, // BlockSize
+                                                          8,   // ClusterM
+                                                          32,  // ClusterK
+                                                          1,   // SliceM
+                                                          8,   // SliceK
+                                                          1,   // XYVectorDim (0=M, 1=K)
+                                                          8,   // SrcScalarPerVector
+                                                          1,   // GammaVecDim (0=M, 1=K)
+                                                          8,   // GammaScalarPerVector
+                                                          1,   // BetaVecDim (0=M, 1=K)
+                                                          8,   // BetaScalarPerVector
+                                                          8>;  // OutScalarPerVector
+#include "run_layernorm_example.inc"
+
+int main() { return run_groupnorm_example<DeviceInstance>(); }
diff --git a/example/27_layernorm/layernorm_splitk_fp16.cpp b/example/27_layernorm/layernorm_splitk_fp16.cpp
new file mode 100644
index 000000000..01ee7161e
--- /dev/null
+++ b/example/27_layernorm/layernorm_splitk_fp16.cpp
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+using XDataType       = ck::half_t;
+using GammaDataType   = ck::half_t;
+using BetaDataType    = ck::half_t;
+using YDataType       = ck::half_t;
+using ComputeDataType = float;
+using PassThrough     = ck::tensor_operation::element_wise::PassThrough;
+
+constexpr int Rank         = 2;
+constexpr int NumReduceDim = 1;
+
+using DeviceInstance =
+    ck::tensor_operation::device::DeviceNormalizationSplitKImpl<XDataType,
+                                                                GammaDataType,
+                                                                BetaDataType,
+                                                                ComputeDataType,
+                                                                YDataType,
+                                                                PassThrough,
+                                                                Rank,
+                                                                NumReduceDim,
+                                                                256, // BlockSize
+                                                                8,   // ClusterM
+                                                                32,  // ClusterK
+                                                                1,   // SliceM
+                                                                8,   // SliceK
+                                                                1,   // XYVectorDim (0=M, 1=K)
+                                                                8,   // XScalarPerVector
+                                                                1,   // GammaVecDim (0=M, 1=K)
+                                                                8,   // GammaScalarPerVector
+                                                                1,   // BetaVecDim (0=M, 1=K)
+                                                                8,   // BetaScalarPerVector
+                                                                8>;  // YScalarPerVector
+
+#include "run_layernorm_example.inc"
+
+int main() { return run_groupnorm_example<DeviceInstance>(); }
diff --git a/example/27_layernorm/layernorm_blockwise.cpp b/example/27_layernorm/run_layernorm_example.inc
similarity index 58%
rename from example/27_layernorm/layernorm_blockwise.cpp
rename to example/27_layernorm/run_layernorm_example.inc
index 7d91b69d0..678d8df28 100644
--- a/example/27_layernorm/layernorm_blockwise.cpp
+++ b/example/27_layernorm/run_layernorm_example.inc
@@ -1,58 +1,10 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 
-#include <iostream>
-#include <numeric>
-#include <initializer_list>
-#include <cstdlib>
-#include <getopt.h>
-
-#include "ck/ck.hpp"
-#include "ck/utility/reduction_enums.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp"
-#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
-
-#include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/device_memory.hpp"
-#include "ck/library/utility/host_common_util.hpp"
-#include "ck/library/utility/host_tensor.hpp"
-#include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/utility/literals.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
-
-using XDataType       = ck::half_t;
-using GammaDataType   = ck::half_t;
-using BetaDataType    = ck::half_t;
-using YDataType       = ck::half_t;
-using ComputeDataType = float;
-using PassThrough     = ck::tensor_operation::element_wise::PassThrough;
-
-constexpr int Rank         = 2;
-constexpr int NumReduceDim = 1;
-
-using DeviceInstance =
-    ck::tensor_operation::device::DeviceNormalizationImpl<XDataType,
-                                                          GammaDataType,
-                                                          BetaDataType,
-                                                          ComputeDataType,
-                                                          YDataType,
-                                                          PassThrough,
-                                                          Rank,
-                                                          NumReduceDim,
-                                                          256, // BlockSize
-                                                          8,   // ClusterM
-                                                          32,  // ClusterK
-                                                          1,   // SliceM
-                                                          8,   // SliceK
-                                                          1,   // SrcVecDim (0=M, 1=K)
-                                                          8,   // SrcScalarPerVector
-                                                          1,   // GammaVecDim (0=M, 1=K)
-                                                          8,   // GammaScalarPerVector
-                                                          1,   // BetaVecDim (0=M, 1=K)
-                                                          8,   // BetaScalarPerVector
-                                                          8>;  // OutScalarPerVector
-
-int main()
+#pragma once
+
+template <typename DeviceInstance>
+int run_groupnorm_example()
 {
     bool time_kernel = false;
 
@@ -111,6 +63,10 @@ int main()
         return 1;
     };
 
+    size_t workspace_sz = device_instance.GetWorkSpaceSize(argument_ptr.get());
+    DeviceMem workspace_dev(workspace_sz);
+    device_instance.SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
+
     auto invoker_ptr = device_instance.MakeInvokerPointer();
     invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
 
@@ -133,7 +89,8 @@ int main()
         ref_invoker.Run(ref_argument);
 
         y_dev.FromDevice(y.mData.data());
-        pass &= ck::utils::check_err(y, host_y, "Error: Incorrect results d1", 1e-3, 1e-3);
+        pass &= ck::utils::check_err(y, host_y, "Error: Incorrect results", 1e-3, 1e-3);
     }
+
     return (pass ? 0 : 1);
 }
diff --git a/example/42_groupnorm/CMakeLists.txt b/example/42_groupnorm/CMakeLists.txt
index a9990c5d8..e8c306ac5 100644
--- a/example/42_groupnorm/CMakeLists.txt
+++ b/example/42_groupnorm/CMakeLists.txt
@@ -1,2 +1,3 @@
 add_example_executable(example_groupnorm_sigmoid_mul_fp16 groupnorm_sigmoid_mul_fp16.cpp)
+add_example_executable(example_groupnorm_splitk_fp16 groupnorm_splitk_fp16.cpp)
 add_example_executable(example_groupnorm_swish_fp16 groupnorm_swish_fp16.cpp)
diff --git a/example/42_groupnorm/common.hpp b/example/42_groupnorm/common.hpp
index e159abf3e..780154b26 100644
--- a/example/42_groupnorm/common.hpp
+++ b/example/42_groupnorm/common.hpp
@@ -12,6 +12,7 @@
 #include "ck/ck.hpp"
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_normalization_splitk_impl.hpp"
 #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
 
 #include "ck/library/utility/fill.hpp"
diff --git a/example/42_groupnorm/groupnorm_splitk_fp16.cpp b/example/42_groupnorm/groupnorm_splitk_fp16.cpp
new file mode 100644
index 000000000..fd4bfe380
--- /dev/null
+++ b/example/42_groupnorm/groupnorm_splitk_fp16.cpp
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+constexpr int Rank         = 5;
+constexpr int NumReduceDim = 3;
+
+using XDataType       = ck::half_t;
+using GammaDataType   = ck::half_t;
+using BetaDataType    = ck::half_t;
+using YDataType       = ck::half_t;
+using ComputeDataType = float;
+using YElementOp      = ck::tensor_operation::element_wise::Swish;
+
+using DeviceInstance =
+    ck::tensor_operation::device::DeviceNormalizationSplitKImpl<XDataType,
+                                                                GammaDataType,
+                                                                BetaDataType,
+                                                                ComputeDataType,
+                                                                YDataType,
+                                                                YElementOp,
+                                                                Rank,
+                                                                NumReduceDim,
+                                                                256, // BlockSize
+                                                                1,   // ClusterM
+                                                                256, // ClusterK
+                                                                1,   // SliceM
+                                                                16,  // SliceK
+                                                                1,   // SrcVecDim (0=M, 1=K)
+                                                                2,   // SrcScalarPerVector
+                                                                1,   // GammaVecDim (0=M, 1=K)
+                                                                2,   // GammaScalarPerVector
+                                                                1,   // BetaVecDim (0=M, 1=K)
+                                                                2,   // BetaScalarPerVector
+                                                                2>;  // OutScalarPerVector
+
+#include "run_groupnorm_example.inc"
+
+int main(int argc, char* argv[]) { return run_groupnorm_example(argc, argv); }
diff --git a/example/42_groupnorm/run_groupnorm_example.inc b/example/42_groupnorm/run_groupnorm_example.inc
index bd7eb98ca..d1016a3b1 100644
--- a/example/42_groupnorm/run_groupnorm_example.inc
+++ b/example/42_groupnorm/run_groupnorm_example.inc
@@ -73,6 +73,10 @@ int run_groupnorm_example(int argc, char* argv[])
         return 1;
     };
 
+    size_t workspace_sz = device_instance.GetWorkSpaceSize(argument_ptr.get());
+    DeviceMem workspace_dev(workspace_sz);
+    device_instance.SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
+
     auto invoker_ptr = device_instance.MakeInvokerPointer();
     float ave_time   = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true, true});
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_layernorm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_layernorm_xdl_cshuffle.hpp
index a383b0bb7..580087e00 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_layernorm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_layernorm_xdl_cshuffle.hpp
@@ -807,7 +807,7 @@ struct DeviceGemmMultipleDLayernorm_Xdl_CShuffle
         // workspace for welford intermediate mean
         workspace_size += gemm_welford_size * sizeof(EMeanVarDataType) + 64;
 
-        // workspace for welford intermediate mean
+        // workspace for welford intermediate variance
         workspace_size += gemm_welford_size * sizeof(EMeanVarDataType) + 64;
 
         // workspace for welford intermediate count
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp
index bb62332d1..6a8037a32 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp
@@ -10,8 +10,7 @@
 #include "ck/tensor_operation/gpu/device/device_normalization.hpp"
 #include "ck/tensor_operation/gpu/device/device_reduce.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp"
-#include "ck/tensor_operation/gpu/grid/gridwise_normalization_selector.hpp"
-#include "ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp"
+#include "ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_selector.hpp"
 #include "ck/host_utility/device_prop.hpp"
 #include "ck/host_utility/kernel_launch.hpp"
 
@@ -20,6 +19,10 @@ namespace tensor_operation {
 namespace device {
 
 // Y = Normalization(X, Beta, Gamma)
+// M: Invariant length
+// K: Reduce length (Calculate mean and variance along K dimension)
+// e.g. Length = [N, C, H, W], reduce dim = [C, H, W]
+// Then, M = N, K = C * H * W
 template <typename XDataType,
           typename GammaDataType,
           typename BetaDataType,
@@ -68,7 +71,6 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
 
     static auto MakeSrc2dDescriptor(const std::vector<index_t>& inLengths,
                                     const std::vector<index_t>& inStrides,
-                                    int blkGroupSize,
                                     int numBlockTileIteration)
     {
         constexpr index_t NumInvariantDim  = Rank - NumReduceDim;
@@ -117,10 +119,9 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
         const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{});
         const auto reduceLength    = in_grid_desc_m_k.GetLength(Number<1>{});
 
-        const int reduceSizePerBlock = K_BlockTileSize * numBlockTileIteration;
         const auto inPad_M =
             math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
-        const auto inPad_K = reduceSizePerBlock * blkGroupSize - reduceLength;
+        const auto inPad_K = K_BlockTileSize * numBlockTileIteration - reduceLength;
 
         auto in_grid_desc_m_k_padded = transform_tensor_descriptor(
             in_grid_desc_m_k,
@@ -132,7 +133,7 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
         return (in_grid_desc_m_k_padded);
     };
 
-    using GridDesc_M_K = decltype(MakeSrc2dDescriptor({1}, {1}, 1, 1));
+    using GridDesc_M_K = decltype(MakeSrc2dDescriptor({1}, {1}, 1));
 
     struct Argument : public BaseArgument
     {
@@ -162,26 +163,22 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
             gammaStrides_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(gammaStrides, reduceDims);
             betaStrides_  = shuffle_tensor_dimensions<Rank, NumReduceDim>(betaStrides, reduceDims);
 
-            long_index_t invariant_total_length;
-            long_index_t reduce_total_length;
+            long_index_t invariant_length;
+            long_index_t reduce_length;
 
-            std::tie(invariant_total_length, reduce_total_length) =
+            std::tie(invariant_length, reduce_length) =
                 get_2d_lengths<Rank, NumReduceDim>(Lengths_);
 
-            blkGroupSize_          = 1;
-            numBlockTileIteration_ = (reduce_total_length + K_BlockTileSize - 1) / K_BlockTileSize;
+            numBlockTileIteration_ = math::integer_divide_ceil(reduce_length, K_BlockTileSize);
 
-            gridSize_ = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) /
-                        M_BlockTileSize * blkGroupSize_;
+            gridSize_ = math::integer_divide_ceil(invariant_length, M_BlockTileSize);
 
-            x_grid_desc_m_k_ =
-                MakeSrc2dDescriptor(Lengths_, xStrides_, blkGroupSize_, numBlockTileIteration_);
+            x_grid_desc_m_k_ = MakeSrc2dDescriptor(Lengths_, xStrides_, numBlockTileIteration_);
             gamma_grid_desc_m_k_ =
-                MakeSrc2dDescriptor(Lengths_, gammaStrides_, blkGroupSize_, numBlockTileIteration_);
+                MakeSrc2dDescriptor(Lengths_, gammaStrides_, numBlockTileIteration_);
             beta_grid_desc_m_k_ =
-                MakeSrc2dDescriptor(Lengths_, betaStrides_, blkGroupSize_, numBlockTileIteration_);
-            y_grid_desc_m_k_ =
-                MakeSrc2dDescriptor(Lengths_, yStrides_, blkGroupSize_, numBlockTileIteration_);
+                MakeSrc2dDescriptor(Lengths_, betaStrides_, numBlockTileIteration_);
+            y_grid_desc_m_k_ = MakeSrc2dDescriptor(Lengths_, yStrides_, numBlockTileIteration_);
 
             isSweeponce_ =
                 x_grid_desc_m_k_.GetLength(Number<1>{}) <= KThreadClusterSize * KThreadSliceSize;
@@ -202,7 +199,6 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
 
         YElementwiseOperation y_elementwise_op_;
 
-        int blkGroupSize_;
         int numBlockTileIteration_;
         size_t gridSize_;
 
@@ -286,6 +282,9 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
 
                 if(p_arg_->invariant_lowest_length % XSrcVectorSize != 0)
                     return false;
+
+                if(p_arg_->invariant_lowest_length % YDstVectorSize != 0)
+                    return false;
             };
         }
         else
@@ -295,12 +294,12 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
 
             if(p_arg_->Lengths_[Rank - 1] % XSrcVectorSize != 0)
                 return false;
-        };
 
-        if(p_arg_->Lengths_[Rank - 1] % YDstVectorSize != 0)
-        {
-            return false;
-        }
+            if(p_arg_->Lengths_[Rank - 1] % YDstVectorSize != 0)
+            {
+                return false;
+            }
+        };
 
         // if fastest dim is not reduced
         if constexpr(GammaSrcVectorDim == 0)
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_normalization_splitk_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_normalization_splitk_impl.hpp
new file mode 100644
index 000000000..0026a8759
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/device/impl/device_normalization_splitk_impl.hpp
@@ -0,0 +1,658 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+
+#include "ck/utility/reduction_operator.hpp"
+#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
+#include "ck/tensor_operation/gpu/device/device_normalization.hpp"
+#include "ck/tensor_operation/gpu/device/device_reduce.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp"
+#include "ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_1st.hpp"
+#include "ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_2nd.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+
+namespace ck {
+template <typename GridwiseWelford,
+          typename XDataType,
+          typename MeanVarDataType,
+          typename ComputeDataType,
+          typename XGridDesc_M_K,
+          typename MeanVarGridDesc_M_KBlock>
+__global__ void
+kernel_normalizationSplitK1st(const XGridDesc_M_K x_grid_desc_m_k,
+                              const MeanVarGridDesc_M_KBlock mean_var_grid_desc_m_kblock,
+                              index_t num_k_block_tile_iteration,
+                              const XDataType* const __restrict__ p_x_global,
+                              MeanVarDataType* const __restrict__ p_welford_mean,
+                              MeanVarDataType* const __restrict__ p_welford_variance,
+                              int32_t* const __restrict__ p_welford_count)
+{
+    GridwiseWelford::Run(x_grid_desc_m_k,
+                         mean_var_grid_desc_m_kblock,
+                         num_k_block_tile_iteration,
+                         p_x_global,
+                         p_welford_mean,
+                         p_welford_variance,
+                         p_welford_count);
+};
+
+template <typename GridwiseWelfordNormalization,
+          typename MeanVarDataType,
+          typename XDataType,
+          typename GammaDataType,
+          typename BetaDataType,
+          typename YDataType,
+          typename ComputeDataType,
+          typename YElementwiseOperation,
+          typename MeanVarGridDesc_M_KBlock,
+          typename CountGridDesc_M_KBlock,
+          typename XYGammaBetaGridDesc_M_K>
+__global__ void
+kernel_normalizationSplitK2nd(const MeanVarGridDesc_M_KBlock mean_var_grid_desc_m_kblock,
+                              const CountGridDesc_M_KBlock count_grid_desc_m_kblock,
+                              const XYGammaBetaGridDesc_M_K x_grid_desc_m_k,
+                              const XYGammaBetaGridDesc_M_K gamma_grid_desc_m_k,
+                              const XYGammaBetaGridDesc_M_K beta_grid_desc_m_k,
+                              const XYGammaBetaGridDesc_M_K y_grid_desc_m_k,
+                              index_t num_k_mean_var_count_iteration,
+                              index_t num_k_block_tile_iteration,
+                              index_t k_grid_size,
+                              ComputeDataType epsilon,
+                              const MeanVarDataType* const p_mean_global,
+                              const MeanVarDataType* const p_variance_global,
+                              const int32_t* const p_welford_count_global,
+                              const XDataType* const __restrict__ p_x_global,
+                              const GammaDataType* const __restrict__ p_gamma_global,
+                              const BetaDataType* const __restrict__ p_beta_global,
+                              YDataType* const __restrict__ p_y_global,
+                              const YElementwiseOperation y_elementwise_op)
+{
+    GridwiseWelfordNormalization::Run(mean_var_grid_desc_m_kblock,
+                                      count_grid_desc_m_kblock,
+                                      x_grid_desc_m_k,
+                                      gamma_grid_desc_m_k,
+                                      beta_grid_desc_m_k,
+                                      y_grid_desc_m_k,
+                                      num_k_mean_var_count_iteration,
+                                      num_k_block_tile_iteration,
+                                      k_grid_size,
+                                      epsilon,
+                                      p_mean_global,
+                                      p_variance_global,
+                                      p_welford_count_global,
+                                      p_x_global,
+                                      p_gamma_global,
+                                      p_beta_global,
+                                      p_y_global,
+                                      y_elementwise_op);
+};
+} // namespace ck
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+// Y = Normalization(X, Beta, Gamma)
+// M: Invariant length
+// K: Reduce length (Calculate mean and variance along K dimension)
+// e.g. Length = [N, C, H, W], reduce dim = [C, H, W]
+// Then, M = N, K = C * H * W
+template <typename XDataType,
+          typename GammaDataType,
+          typename BetaDataType,
+          typename ComputeDataType,
+          typename YDataType,
+          typename YElementwiseOperation,
+          index_t Rank,
+          index_t NumReduceDim,
+          index_t BlockSize,
+          index_t MThreadClusterSize,
+          index_t KThreadClusterSize,
+          index_t MThreadSliceSize,
+          index_t KThreadSliceSize,
+          index_t XYVectorDim,
+          index_t XSrcVectorSize,
+          index_t GammaSrcVectorDim,
+          index_t GammaSrcVectorSize,
+          index_t BetaSrcVectorDim,
+          index_t BetaSrcVectorSize,
+          index_t YDstVectorSize>
+struct DeviceNormalizationSplitKImpl : public DeviceNormalization<XDataType,
+                                                                  GammaDataType,
+                                                                  BetaDataType,
+                                                                  ComputeDataType,
+                                                                  YDataType,
+                                                                  YElementwiseOperation,
+                                                                  Rank,
+                                                                  NumReduceDim>
+{
+    using MeanVarDataType = ComputeDataType;
+
+    static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize);
+    static_assert(
+        ((GammaSrcVectorDim == 0 && MThreadSliceSize % GammaSrcVectorSize == 0) ||
+         (GammaSrcVectorDim == 1 && KThreadSliceSize % GammaSrcVectorSize == 0)),
+        "Invalid thread slice sizes and/or gamma vector sizes configuration, please check!");
+
+    static_assert(
+        ((BetaSrcVectorDim == 0 && MThreadSliceSize % BetaSrcVectorSize == 0) ||
+         (BetaSrcVectorDim == 1 && KThreadSliceSize % BetaSrcVectorSize == 0)),
+        "Invalid thread slice sizes and/or beta vector sizes configuration, please check!");
+
+    using PassThrough = tensor_operation::element_wise::PassThrough;
+
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+
+    static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
+    static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
+
+    static auto MakeSrc2dDescriptor(const std::vector<index_t>& inLengths, // per-dimension lengths
+                                    const std::vector<index_t>& inStrides, // per-dimension strides
+                                    int kBlockSize, // block count along K (Argument passes kGridSize_)
+                                    int numBlockTileIteration) // K tiles handled by each block
+    { // Builds the right-padded 2-D (M, K) descriptor of a Rank-D tensor.
+        constexpr index_t NumInvariantDim  = Rank - NumReduceDim;
+        static constexpr index_t numSrcDim = Rank;
+        static constexpr bool reduceAllDim = (NumInvariantDim == 0);
+
+        const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<numSrcDim>{});
+        const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<numSrcDim>{});
+
+        const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
+
+        const auto in_grid_desc_m_k = [&]() {
+            if constexpr(reduceAllDim)
+            { // no invariant dims: merge every dim into one, then unmerge it as (1, total)
+                const auto one_dim_inDesc = transform_tensor_descriptor(
+                    inDesc,
+                    make_tuple(make_merge_transform(tupleSrcLengths)),
+                    make_tuple(typename arithmetic_sequence_gen<0, numSrcDim, 1>::type{}),
+                    make_tuple(Sequence<0>{}));
+
+                return transform_tensor_descriptor(one_dim_inDesc,
+                                                   make_tuple(make_unmerge_transform(make_tuple(
+                                                       1, one_dim_inDesc.GetLength(Number<0>{})))),
+                                                   make_tuple(Sequence<0>{}),
+                                                   make_tuple(Sequence<0, 1>{}));
+            }
+            else
+            { // dims [0, NumInvariantDim) merge into M, dims [NumInvariantDim, Rank) merge into K
+                using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type;
+                using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;
+
+                const auto reduceDimLengths =
+                    make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
+                const auto invariantDimLengths =
+                    make_tuple_from_array_and_index_seq(inLengths, InvariantDims{});
+
+                return transform_tensor_descriptor(
+                    inDesc,
+                    make_tuple(make_merge_transform(invariantDimLengths),
+                               make_merge_transform(reduceDimLengths)),
+                    make_tuple(InvariantDims{}, ReduceDims{}),
+                    make_tuple(Sequence<0>{}, Sequence<1>{}));
+            }
+        }();
+
+        const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{});
+        const auto reduceLength    = in_grid_desc_m_k.GetLength(Number<1>{});
+
+        const int reduceSizePerBlock = K_BlockTileSize * numBlockTileIteration; // K extent owned by one block
+        const auto inPad_M = // padding that brings M up to a multiple of M_BlockTileSize
+            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
+        const auto inPad_K = reduceSizePerBlock * kBlockSize - reduceLength; // padding up to the K extent of all blocks
+
+        auto in_grid_desc_m_k_padded = transform_tensor_descriptor(
+            in_grid_desc_m_k,
+            make_tuple(make_right_pad_transform(invariantLength, inPad_M),
+                       make_right_pad_transform(reduceLength, inPad_K)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+        return (in_grid_desc_m_k_padded);
+    };
+
+    template <typename DoPads, index_t MPerTile, index_t KPerTile>
+    static auto MakeMeanVarDescriptor_M_K(index_t M, index_t K)
+    {
+        // Packed (M, K) layout: stride K along M, unit stride along K; padded per DoPads.
+        const auto unpadded = make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(K, I1));
+        return PadTensorDescriptor(unpadded, make_tuple(MPerTile, KPerTile), DoPads{});
+    }
+
+    template <typename DoPads, index_t MPerTile, index_t KPerTile>
+    static auto MakeCountDescriptor_M_K(index_t M, index_t K)
+    {
+        // Stride I0 (zero) along M: every row maps onto the same K entries; padded per DoPads.
+        const auto unpadded = make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I0, I1));
+        return PadTensorDescriptor(unpadded, make_tuple(MPerTile, KPerTile), DoPads{});
+    }
+
+    using SrcGridDesc_M_K = decltype(MakeSrc2dDescriptor({1}, {1}, 1, 1));
+    using Kernel1MeanVarGridDesc_M_KBlock =
+        decltype(MakeMeanVarDescriptor_M_K<Sequence<true, false>, 1, 1>(1, 1));
+
+    using Kernel2MeanVarGridDesc_M_KBlock =
+        decltype(MakeMeanVarDescriptor_M_K<Sequence<true, true>, 1, 1>(1, 1));
+
+    using Kernel2CountGridDesc_M_KBlock =
+        decltype(MakeCountDescriptor_M_K<Sequence<true, true>, 1, 1>(1, 1));
+
+    using GridwiseWelford = GridwiseNormalizationSplitK1st<XDataType,
+                                                           ComputeDataType,
+                                                           MeanVarDataType,
+                                                           SrcGridDesc_M_K,
+                                                           Kernel1MeanVarGridDesc_M_KBlock,
+                                                           BlockSize,
+                                                           MThreadClusterSize,
+                                                           KThreadClusterSize,
+                                                           MThreadSliceSize,
+                                                           KThreadSliceSize,
+                                                           XYVectorDim,
+                                                           XSrcVectorSize>;
+
+    using GridwiseWelfordNormalization =
+        GridwiseNormalizationSplitK2nd<MeanVarDataType,
+                                       XDataType,
+                                       GammaDataType,
+                                       BetaDataType,
+                                       YDataType,
+                                       ComputeDataType,
+                                       YElementwiseOperation,
+                                       Kernel2MeanVarGridDesc_M_KBlock,
+                                       Kernel2CountGridDesc_M_KBlock,
+                                       SrcGridDesc_M_K,
+                                       BlockSize,
+                                       MThreadClusterSize,
+                                       KThreadClusterSize,
+                                       MThreadSliceSize,
+                                       KThreadSliceSize,
+                                       XYVectorDim,
+                                       XSrcVectorSize,
+                                       GammaSrcVectorDim,
+                                       GammaSrcVectorSize,
+                                       BetaSrcVectorDim,
+                                       BetaSrcVectorSize,
+                                       XYVectorDim,
+                                       YDstVectorSize>;
+
+    struct Argument : public BaseArgument
+    { // Host-side state for one split-K normalization launch: pointers, grid sizes, descriptors.
+        Argument(const std::vector<index_t> lengths,
+                 const std::vector<index_t> xStrides,
+                 const std::vector<index_t> gammaStrides,
+                 const std::vector<index_t> betaStrides,
+                 const std::vector<index_t> yStrides,
+                 const std::vector<index_t> reduceDims,
+                 YElementwiseOperation y_elementwise_op,
+                 double epsilon,
+                 const XDataType* p_x,
+                 const GammaDataType* p_gamma,
+                 const BetaDataType* p_beta,
+                 YDataType* p_y)
+            : p_x_(p_x),
+              p_gamma_(p_gamma),
+              p_beta_(p_beta),
+              p_y_(p_y),
+              p_workspace_mean_{nullptr}, // workspace pointers stay null until SetWorkSpacePointer()
+              p_workspace_var_{nullptr},
+              p_workspace_count_{nullptr},
+              y_elementwise_op_(y_elementwise_op)
+        {
+            epsilon_ = static_cast<ComputeDataType>(epsilon);
+
+            // NOTE(review): presumably reorders dims so invariant dims precede reduce dims - confirm
+            Lengths_      = shuffle_tensor_dimensions<Rank, NumReduceDim>(lengths, reduceDims);
+            xStrides_     = shuffle_tensor_dimensions<Rank, NumReduceDim>(xStrides, reduceDims);
+            yStrides_     = shuffle_tensor_dimensions<Rank, NumReduceDim>(yStrides, reduceDims);
+            gammaStrides_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(gammaStrides, reduceDims);
+            betaStrides_  = shuffle_tensor_dimensions<Rank, NumReduceDim>(betaStrides, reduceDims);
+
+            std::tie(MRaw_, KRaw_) = get_2d_lengths<Rank, NumReduceDim>(Lengths_);
+
+            numBlockTileIteration_ = 1; // K tiles per block; grown below until the K grid is small enough
+            while(true)
+            {
+                int testKGridSize =
+                    math::integer_divide_ceil(KRaw_, K_BlockTileSize * numBlockTileIteration_);
+
+                // stop as soon as the resulting K grid size is no more than 128
+                if(testKGridSize <= 128)
+                    break;
+
+                ++numBlockTileIteration_;
+            };
+
+            kGridSize_ = math::integer_divide_ceil(KRaw_, K_BlockTileSize * numBlockTileIteration_);
+            gridSize_  = math::integer_divide_ceil(MRaw_, M_BlockTileSize) * kGridSize_;
+
+            // Mean, var and count are not vector-loaded, so their K tile is just the K cluster size
+            static constexpr index_t K_MeanVarCountBlockTileSize = KThreadClusterSize;
+
+            numMeanVarCountIteration_ =
+                math::integer_divide_ceil(kGridSize_, K_MeanVarCountBlockTileSize);
+
+            x_grid_desc_m_k_ =
+                MakeSrc2dDescriptor(Lengths_, xStrides_, kGridSize_, numBlockTileIteration_);
+            gamma_grid_desc_m_k_ =
+                MakeSrc2dDescriptor(Lengths_, gammaStrides_, kGridSize_, numBlockTileIteration_);
+            beta_grid_desc_m_k_ =
+                MakeSrc2dDescriptor(Lengths_, betaStrides_, kGridSize_, numBlockTileIteration_);
+            y_grid_desc_m_k_ =
+                MakeSrc2dDescriptor(Lengths_, yStrides_, kGridSize_, numBlockTileIteration_);
+
+            // We don't need to pad in K dimension for Welford1. Set KPerTile 1.
+            kernel1_mean_var_grid_desc_m_kblock_ =
+                MakeMeanVarDescriptor_M_K<Sequence<true, false>, M_BlockTileSize, 1>(MRaw_,
+                                                                                     kGridSize_);
+
+            kernel2_mean_var_grid_desc_m_kblock_ =
+                MakeMeanVarDescriptor_M_K<Sequence<true, true>,
+                                          M_BlockTileSize,
+                                          K_MeanVarCountBlockTileSize>(MRaw_, kGridSize_);
+
+            kernel2_count_grid_desc_m_kblock_ =
+                MakeCountDescriptor_M_K<Sequence<true, true>,
+                                        M_BlockTileSize,
+                                        K_MeanVarCountBlockTileSize>(MRaw_, kGridSize_);
+        }
+
+        ComputeDataType epsilon_;
+
+        const XDataType* p_x_;
+        const GammaDataType* p_gamma_;
+        const BetaDataType* p_beta_;
+        YDataType* p_y_;
+        void* p_workspace_mean_;
+        void* p_workspace_var_;
+        void* p_workspace_count_;
+
+        std::vector<index_t> Lengths_; // lengths/strides below are stored after shuffle_tensor_dimensions
+        std::vector<index_t> xStrides_;
+        std::vector<index_t> gammaStrides_;
+        std::vector<index_t> betaStrides_;
+        std::vector<index_t> yStrides_;
+
+        YElementwiseOperation y_elementwise_op_;
+
+        int kGridSize_;                // ceil(KRaw_ / (K_BlockTileSize * numBlockTileIteration_))
+        int numMeanVarCountIteration_; // ceil(kGridSize_ / KThreadClusterSize)
+        int numBlockTileIteration_;    // K tiles per block
+        size_t gridSize_;              // ceil(MRaw_ / M_BlockTileSize) * kGridSize_
+
+        SrcGridDesc_M_K x_grid_desc_m_k_;
+        SrcGridDesc_M_K gamma_grid_desc_m_k_;
+        SrcGridDesc_M_K beta_grid_desc_m_k_;
+        SrcGridDesc_M_K y_grid_desc_m_k_;
+
+        Kernel1MeanVarGridDesc_M_KBlock kernel1_mean_var_grid_desc_m_kblock_;
+        Kernel2MeanVarGridDesc_M_KBlock kernel2_mean_var_grid_desc_m_kblock_;
+        Kernel2CountGridDesc_M_KBlock kernel2_count_grid_desc_m_kblock_;
+
+        index_t MRaw_; // invariant length
+        index_t KRaw_; // reduce length
+    };
+
+    struct Invoker : public BaseInvoker
+    { // Launches kernel1 then kernel2 on the same grid and returns the sum of their reported times.
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            if(arg.p_workspace_mean_ == nullptr || arg.p_workspace_var_ == nullptr || // all three
+               arg.p_workspace_count_ == nullptr) // workspace pointers must be set before running
+                throw std::runtime_error("wrong! WorkSpace pointer has not been set");
+
+            auto kernel1 = kernel_normalizationSplitK1st<GridwiseWelford, // takes x + the 3 workspace buffers
+                                                         XDataType,
+                                                         MeanVarDataType,
+                                                         ComputeDataType,
+                                                         SrcGridDesc_M_K,
+                                                         Kernel1MeanVarGridDesc_M_KBlock>;
+
+            auto kernel2 = kernel_normalizationSplitK2nd<GridwiseWelfordNormalization, // takes workspace + x/gamma/beta/y
+                                                         MeanVarDataType,
+                                                         XDataType,
+                                                         GammaDataType,
+                                                         BetaDataType,
+                                                         YDataType,
+                                                         ComputeDataType,
+                                                         YElementwiseOperation,
+                                                         Kernel2MeanVarGridDesc_M_KBlock,
+                                                         Kernel2CountGridDesc_M_KBlock,
+                                                         SrcGridDesc_M_K>;
+
+            float avg_time = 0; // accumulates the value returned by each launch_and_time_kernel call
+            avg_time += launch_and_time_kernel(stream_config,
+                                               kernel1,
+                                               dim3(arg.gridSize_),
+                                               dim3(BlockSize),
+                                               0,
+                                               arg.x_grid_desc_m_k_,
+                                               arg.kernel1_mean_var_grid_desc_m_kblock_,
+                                               arg.numBlockTileIteration_,
+                                               arg.p_x_,
+                                               static_cast<MeanVarDataType*>(arg.p_workspace_mean_),
+                                               static_cast<MeanVarDataType*>(arg.p_workspace_var_),
+                                               static_cast<int32_t*>(arg.p_workspace_count_));
+
+            avg_time += launch_and_time_kernel(stream_config,
+                                               kernel2,
+                                               dim3(arg.gridSize_),
+                                               dim3(BlockSize),
+                                               0,
+                                               arg.kernel2_mean_var_grid_desc_m_kblock_,
+                                               arg.kernel2_count_grid_desc_m_kblock_,
+                                               arg.x_grid_desc_m_k_,
+                                               arg.gamma_grid_desc_m_k_,
+                                               arg.beta_grid_desc_m_k_,
+                                               arg.y_grid_desc_m_k_,
+                                               arg.numMeanVarCountIteration_,
+                                               arg.numBlockTileIteration_,
+                                               arg.kGridSize_,
+                                               arg.epsilon_,
+                                               static_cast<MeanVarDataType*>(arg.p_workspace_mean_),
+                                               static_cast<MeanVarDataType*>(arg.p_workspace_var_),
+                                               static_cast<int32_t*>(arg.p_workspace_count_),
+                                               arg.p_x_,
+                                               arg.p_gamma_,
+                                               arg.p_beta_,
+                                               arg.p_y_,
+                                               arg.y_elementwise_op_);
+
+            return avg_time;
+        };
+
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
+        { // NOTE(review): the dynamic_cast result is dereferenced without a null check
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        };
+    };
+
+    size_t GetWorkSpaceSize(const BaseArgument* pArg) const override
+    {
+        const auto* arg = dynamic_cast<const Argument*>(pArg);
+
+        // one intermediate mean and one intermediate variance per (M row, K block) pair
+        const int num_partials = arg->MRaw_ * arg->kGridSize_;
+
+        // every buffer carries 64 extra bytes on top of its payload
+        const size_t mean_bytes = num_partials * sizeof(MeanVarDataType) + 64;
+
+        const size_t var_bytes = num_partials * sizeof(MeanVarDataType) + 64;
+
+        // the intermediate count buffer holds kGridSize_ int32_t entries
+        const size_t count_bytes = arg->kGridSize_ * sizeof(int32_t) + 64;
+
+        // total = mean buffer + variance buffer + count buffer
+
+        return mean_bytes + var_bytes + count_bytes;
+    };
+
+    void SetWorkSpacePointer(BaseArgument* pArg, void* p_workspace) const override
+    { // Carves p_workspace into the mean, variance and count buffers, in that order.
+        Argument* pArg_ = dynamic_cast<Argument*>(pArg);
+
+        pArg_->p_workspace_ = p_workspace; // not declared in Argument; presumably a BaseArgument member
+
+        int welford_size = pArg_->MRaw_ * pArg_->kGridSize_; // element count of the mean (and variance) buffer
+
+        // the intermediate welford mean buffer starts at the workspace base
+        pArg_->p_workspace_mean_ = static_cast<char*>(pArg_->p_workspace_);
+
+        index_t mean_space_sz = welford_size * sizeof(MeanVarDataType);
+        mean_space_sz         = math::integer_least_multiple(mean_space_sz, 64);
+
+        // the intermediate welford variance buffer follows the mean buffer (size passed through integer_least_multiple(., 64))
+        pArg_->p_workspace_var_ = reinterpret_cast<char*>(pArg_->p_workspace_mean_) + mean_space_sz;
+
+        index_t variance_space_sz = welford_size * sizeof(MeanVarDataType);
+        variance_space_sz         = math::integer_least_multiple(variance_space_sz, 64);
+
+        // the intermediate welford count buffer follows the variance buffer
+        pArg_->p_workspace_count_ =
+            reinterpret_cast<char*>(pArg_->p_workspace_var_) + variance_space_sz;
+    };
+
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    { // Returns true only when the strides/lengths in p_arg fit this instance's vector settings.
+        const Argument* p_arg_ = dynamic_cast<const Argument*>(p_arg);
+
+        constexpr index_t NumInvariantDim = Rank - NumReduceDim; // dims [0, NumInvariantDim) form M
+
+        if constexpr(XYVectorDim == 0) // X/Y are vectorized along the invariant (M) dimension
+        {
+            if constexpr(NumInvariantDim == 0)
+            {
+                return false;
+            }
+            else
+            {
+                if(p_arg_->xStrides_[NumInvariantDim - 1] != 1)
+                    return false;
+
+                if(p_arg_->Lengths_[NumInvariantDim - 1] % XSrcVectorSize != 0)
+                    return false;
+
+                if(p_arg_->Lengths_[NumInvariantDim - 1] % YDstVectorSize != 0)
+                    return false;
+            };
+        }
+        else // X/Y are vectorized along the reduce (K) dimension
+        {
+            if(p_arg_->xStrides_[Rank - 1] != 1)
+                return false;
+
+            if(p_arg_->Lengths_[Rank - 1] % XSrcVectorSize != 0)
+                return false;
+
+            if(p_arg_->Lengths_[Rank - 1] % YDstVectorSize != 0)
+                return false;
+        };
+
+        // gamma vectorized along M; NOTE(review): NumInvariantDim == 0 is not guarded here
+        if constexpr(GammaSrcVectorDim == 0)
+        {
+            if(p_arg_->gammaStrides_[NumInvariantDim - 1] != 1)
+                return false;
+
+            if(p_arg_->Lengths_[NumInvariantDim - 1] % GammaSrcVectorSize != 0)
+                return false;
+        }
+        else // gamma vectorized along K: check the innermost reduce dim
+        {
+            if(p_arg_->gammaStrides_[Rank - 1] != 1)
+                return false;
+
+            if(p_arg_->Lengths_[Rank - 1] % GammaSrcVectorSize != 0)
+                return false;
+        }
+
+        // beta vectorized along M; NOTE(review): NumInvariantDim == 0 is not guarded here
+        if constexpr(BetaSrcVectorDim == 0)
+        {
+            if(p_arg_->betaStrides_[NumInvariantDim - 1] != 1)
+                return false;
+
+            if(p_arg_->Lengths_[NumInvariantDim - 1] % BetaSrcVectorSize != 0)
+                return false;
+        }
+        else // beta vectorized along K: check the innermost reduce dim
+        {
+            if(p_arg_->betaStrides_[Rank - 1] != 1)
+                return false;
+
+            if(p_arg_->Lengths_[Rank - 1] % BetaSrcVectorSize != 0)
+                return false;
+        }
+
+        if(p_arg_->kGridSize_ <= 1) // this instance rejects problems that yield a single K block
+            return false;
+
+        return true;
+    };
+
+    std::unique_ptr<BaseArgument> // owns a newly built Argument for this instance
+    MakeArgumentPointer(const std::vector<index_t> lengths,
+                        const std::vector<index_t> xStrides,
+                        const std::vector<index_t> gammaStrides,
+                        const std::vector<index_t> betaStrides,
+                        const std::vector<index_t> yStrides,
+                        const std::vector<index_t> reduceDims,
+                        double epsilon,
+                        const void* p_x,
+                        const void* p_gamma,
+                        const void* p_beta,
+                        void* p_y,
+                        void* p_saveMean,   // currently ignored
+                        void* p_saveInvVar, // currently ignored
+                        YElementwiseOperation y_elementwise_op) override
+    {
+        // TODO
+        // Optionally caching the intermediate results (mean and InvVariance) during the
+        // forward pass could speed up the backward pass
+        ignore = p_saveMean;
+        ignore = p_saveInvVar;
+
+        return std::make_unique<Argument>(lengths, // order follows the Argument constructor, not this signature
+                                          xStrides,
+                                          gammaStrides,
+                                          betaStrides,
+                                          yStrides,
+                                          reduceDims,
+                                          y_elementwise_op,
+                                          epsilon,
+                                          static_cast<const XDataType*>(p_x),
+                                          static_cast<const GammaDataType*>(p_gamma),
+                                          static_cast<const BetaDataType*>(p_beta),
+                                          static_cast<YDataType*>(p_y));
+    };
+
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+    {
+        return std::make_unique<Invoker>();
+    };
+
+    std::string GetTypeString() const override
+    {
+        std::stringstream name; // instance description assembled from the template parameters
+
+        // clang-format off
+        name << "DeviceNormalizationSplitKImpl<" << BlockSize << ","
+             << "Cluster_MK_" << MThreadClusterSize << "_" << KThreadClusterSize << ","
+             << "Slice_MK_" << MThreadSliceSize << "_" << KThreadSliceSize << ","
+             << "XYSrcVectorDim_" << XYVectorDim << ","
+             << "VectorSize_X" << XSrcVectorSize << "_Gamma" << GammaSrcVectorSize << "_Beta" << BetaSrcVectorSize << "_Y" << YDstVectorSize << ">";
+        // clang-format on
+
+        return name.str();
+    }
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_normalization_naive_variance.hpp b/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_naive_variance.hpp
similarity index 100%
rename from include/ck/tensor_operation/gpu/grid/gridwise_normalization_naive_variance.hpp
rename to include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_naive_variance.hpp
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_normalization_selector.hpp b/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_selector.hpp
similarity index 98%
rename from include/ck/tensor_operation/gpu/grid/gridwise_normalization_selector.hpp
rename to include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_selector.hpp
index 37795fa56..632690e1e 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_normalization_selector.hpp
+++ b/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_selector.hpp
@@ -3,8 +3,8 @@
 
 #pragma once
 
-#include "ck/tensor_operation/gpu/grid/gridwise_normalization_naive_variance.hpp"
-#include "ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp"
+#include "ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_naive_variance.hpp"
+#include "ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_welford_variance.hpp"
 
 namespace ck {
 template <typename GridwiseReduction,
diff --git a/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_1st.hpp b/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_1st.hpp
new file mode 100644
index 000000000..129b4e116
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_1st.hpp
@@ -0,0 +1,252 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+#include "ck/utility/math.hpp"
+#include "ck/tensor_operation/gpu/block/blockwise_welford.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_welford.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck {
+
+template <typename XDataType,
+          typename ComputeDataType,
+          typename MeanVarDataType,
+          typename XGridDesc_M_K,
+          typename MeanVarGridDesc_M_KBlock,
+          index_t BlockSize,
+          index_t MThreadClusterSize,
+          index_t KThreadClusterSize,
+          index_t MThreadSliceSize,
+          index_t KThreadSliceSize,
+          index_t XSrcVectorDim,
+          index_t XSrcVectorSize>
+struct GridwiseNormalizationSplitK1st
+{
+    static_assert((XSrcVectorDim == 0 && MThreadSliceSize % XSrcVectorSize == 0) ||
+                      (XSrcVectorDim == 1 && KThreadSliceSize % XSrcVectorSize == 0),
+                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");
+
+    static constexpr bool reorder_thread_cluster = (XSrcVectorDim == 0);
+
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+    static constexpr auto I2 = Number<2>{};
+
+    using ThreadClusterLengths_M_K = Sequence<MThreadClusterSize, KThreadClusterSize>;
+
+    using ThreadBufferDimAccessOrder =
+        typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;
+
+    using ThreadClusterArrangeOrder =
+        typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;
+
+    static constexpr auto thread_cluster_desc =
+        make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});
+
+    using ThreadBufferLengths_M_K                = Sequence<MThreadSliceSize, XSrcVectorSize>;
+    static constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<MThreadSliceSize>{}, Number<XSrcVectorSize>{}));
+
+    using ThreadBufferLengths_M_1 = Sequence<MThreadSliceSize, 1>;
+    static constexpr auto thread_buffer_desc_m_1 =
+        make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}, I1));
+
+    using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed(
+        make_tuple(Number<MThreadSliceSize>{}, Number<XSrcVectorSize>{})));
+    using ThreadReduceDstDesc_M =
+        decltype(make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{})));
+
+    using ThreadwiseWelford =
+        ThreadwiseWelford<ComputeDataType, ThreadReduceSrcDesc_M_K, ThreadReduceDstDesc_M>;
+
+    using BlockwiseWelford = BlockwiseWelford<ComputeDataType,
+                                              BlockSize,
+                                              ThreadClusterLengths_M_K,
+                                              ThreadClusterArrangeOrder,
+                                              false>;
+
+    using PassThroughOp = tensor_operation::element_wise::PassThrough;
+
+    static constexpr index_t M_BlockTileSize     = MThreadClusterSize * MThreadSliceSize;
+    static constexpr index_t K_BlockTileSize     = KThreadClusterSize * KThreadSliceSize;
+    static constexpr index_t K_BlockTileStepSize = KThreadClusterSize * XSrcVectorSize;
+
+    static constexpr auto ThreadBufferNumber = Number<KThreadSliceSize / XSrcVectorSize>{};
+
+    __device__ static int
+    GetKPerThread(int kRaw, int kGridSize, int block_k_cluster_id, int thread_k_cluster_id)
+    {
+        bool is_rightmost_block = block_k_cluster_id == kGridSize - 1;
+
+        if(is_rightmost_block)
+        {
+            int left_kPerBlock = math::integer_divide_ceil(kRaw, kGridSize);
+            int kPerBlock      = kRaw % kGridSize == 0 ? left_kPerBlock : kRaw % left_kPerBlock;
+            int kPerThread =
+                kPerBlock < K_BlockTileSize ? 0 : KThreadSliceSize * (kPerBlock / K_BlockTileSize);
+            int kPerBlockTail = kPerBlock - kPerThread * KThreadClusterSize;
+
+            if(kPerBlockTail > 0)
+            {
+                static_for<0, ThreadBufferNumber, 1>{}([&](auto i) {
+                    int thread_max_len =
+                        (thread_k_cluster_id + 1) * XSrcVectorSize + K_BlockTileStepSize * i;
+                    int delta = thread_max_len - kPerBlockTail;
+                    delta     = math::clamp(thread_max_len - kPerBlockTail, 0, XSrcVectorSize);
+                    kPerThread += XSrcVectorSize - delta;
+                });
+            }
+
+            return kPerThread;
+        }
+        else
+        {
+            int kPerBlock = math::integer_divide_ceil(kRaw, kGridSize);
+            return KThreadSliceSize * (kPerBlock / K_BlockTileSize);
+        }
+    }
+
+    // Welford-reduce this block's K range into one partial mean/variance per row, plus a count
+    __device__ static void Run(const XGridDesc_M_K& x_grid_desc_m_k,
+                               const MeanVarGridDesc_M_KBlock& mean_var_grid_desc_m_kblock,
+                               index_t num_k_block_tile_iteration,
+                               const XDataType* const __restrict__ p_x_global,
+                               MeanVarDataType* const p_mean_global,
+                               MeanVarDataType* const p_variance_global,
+                               int32_t* const p_welford_count_global)
+    {
+        auto x_thread_buf = generate_tuple(
+            [&](auto) {
+                return StaticBuffer<AddressSpaceEnum::Vgpr,
+                                    ComputeDataType,
+                                    MThreadSliceSize * XSrcVectorSize,
+                                    true>{};
+            },
+            Number<ThreadBufferNumber>{});
+
+        StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MThreadSliceSize, true>
+            mean_thread_buf;
+        StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MThreadSliceSize, true>
+            var_thread_buf;
+
+        const index_t thread_local_id = get_thread_local_1d_id();
+        const index_t block_global_id = get_block_1d_id();
+        // The 1-D block id is split into (m, k) block coordinates; k spans GetLength(I1) blocks.
+        const index_t k_grid_size        = mean_var_grid_desc_m_kblock.GetLength(I1);
+        const index_t block_m_cluster_id = block_global_id / k_grid_size;
+        const index_t block_k_cluster_id = block_global_id % k_grid_size;
+
+        const auto thread_cluster_idx =
+            thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id));
+
+        const auto thread_m_cluster_id = thread_cluster_idx[I0];
+        const auto thread_k_cluster_id = thread_cluster_idx[I1];
+        // K extent assigned to each block; it is also the K offset between consecutive K blocks.
+        const index_t reduceSizePerBlock = K_BlockTileSize * num_k_block_tile_iteration;
+
+        auto threadwise_x_load = ThreadwiseTensorSliceTransfer_v2<XDataType,
+                                                                  ComputeDataType,
+                                                                  XGridDesc_M_K,
+                                                                  decltype(thread_buffer_desc_m_k),
+                                                                  ThreadBufferLengths_M_K,
+                                                                  ThreadBufferDimAccessOrder,
+                                                                  XSrcVectorDim,
+                                                                  XSrcVectorSize,
+                                                                  1,
+                                                                  true>(
+            x_grid_desc_m_k,
+            make_multi_index(
+                block_m_cluster_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize,
+                block_k_cluster_id * reduceSizePerBlock + thread_k_cluster_id * XSrcVectorSize));
+
+        auto mean_var_count_store_index = make_multi_index(
+            block_m_cluster_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize,
+            block_k_cluster_id); // one output column per K block
+
+        auto threadwise_welford_mean_var_store =
+            ThreadwiseTensorSliceTransfer_v1r3<ComputeDataType,
+                                               MeanVarDataType,
+                                               decltype(thread_buffer_desc_m_1),
+                                               MeanVarGridDesc_M_KBlock,
+                                               PassThroughOp,
+                                               ThreadBufferLengths_M_1,
+                                               Sequence<0, 1>,
+                                               1,
+                                               1,
+                                               InMemoryDataOperationEnum::Set,
+                                               1,
+                                               true>(
+                mean_var_grid_desc_m_kblock, mean_var_count_store_index, PassThroughOp{});
+
+        constexpr auto thread_copy_fwd_step_m_k = make_multi_index(0, K_BlockTileStepSize);
+
+        const auto x_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_x_global, x_grid_desc_m_k.GetElementSpaceSize());
+
+        auto mean_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_mean_global, mean_var_grid_desc_m_kblock.GetElementSpaceSize());
+
+        auto var_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_variance_global, mean_var_grid_desc_m_kblock.GetElementSpaceSize());
+        // NOTE(review): kRaw presumably is the unpadded K length (transform I2 of the desc) - confirm
+        auto threadwise_welford = ThreadwiseWelford();
+        int kRaw                = x_grid_desc_m_k.GetTransforms()[I2].GetUpperLengths()[I0];
+        threadwise_welford.max_count_ =
+            GetKPerThread(kRaw, k_grid_size, block_k_cluster_id, thread_k_cluster_id);
+        // NOTE(review): max_count_ presumably caps the samples this thread accumulates - confirm
+        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
+            mean_thread_buf(I) = type_convert<ComputeDataType>(0.0f);
+            var_thread_buf(I)  = type_convert<ComputeDataType>(0.0f);
+        });
+        // Each step: load one chunk, advance the window along K, fold the chunk into the stats.
+        for(index_t k = 0; k < num_k_block_tile_iteration; ++k)
+        {
+            static_for<0, ThreadBufferNumber, 1>{}([&](auto i) {
+                threadwise_x_load.Run(x_grid_desc_m_k,
+                                      x_global_val_buf,
+                                      thread_buffer_desc_m_k,
+                                      make_tuple(I0, I0),
+                                      x_thread_buf(i));
+                threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_fwd_step_m_k);
+                threadwise_welford.Run(x_thread_buf[i], mean_thread_buf, var_thread_buf);
+            });
+        }
+        // Combine the per-thread stats via BlockwiseWelford, one row I at a time.
+        int welford_count = 0;
+        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
+            if constexpr(I > 0)
+                block_sync_lds(); // NOTE(review): presumably guards LDS reuse between rows - confirm
+
+            int count = threadwise_welford.cur_count_;
+            BlockwiseWelford::Run(mean_thread_buf(I), var_thread_buf(I), count);
+
+            // The count is the same for every I, so keeping the one from the last row is enough
+            if constexpr(I == MThreadSliceSize - 1)
+                welford_count = count;
+        });
+        // Only threads with thread_k_cluster_id == 0 write results to global memory.
+        if(thread_k_cluster_id == 0)
+        {
+            threadwise_welford_mean_var_store.Run(thread_buffer_desc_m_1,
+                                                  make_tuple(I0, I0),
+                                                  mean_thread_buf,
+                                                  mean_var_grid_desc_m_kblock,
+                                                  mean_global_val_buf);
+
+            threadwise_welford_mean_var_store.Run(thread_buffer_desc_m_1,
+                                                  make_tuple(I0, I0),
+                                                  var_thread_buf,
+                                                  mean_var_grid_desc_m_kblock,
+                                                  var_global_val_buf);
+            // The count buffer is indexed by K block only, so one thread per K block writes it.
+            if(block_m_cluster_id == 0 && thread_m_cluster_id == 0)
+                p_welford_count_global[block_k_cluster_id] = welford_count;
+        }
+    }
+};
+
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_2nd.hpp b/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_2nd.hpp
new file mode 100644
index 000000000..d796d1afc
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_2nd.hpp
@@ -0,0 +1,418 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+#include "ck/utility/math.hpp"
+#include "ck/tensor_operation/gpu/block/blockwise_welford.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_welford.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck {
+
+template <typename MeanVarDataType,
+          typename XDataType,
+          typename GammaDataType,
+          typename BetaDataType,
+          typename YDataType,
+          typename ComputeDataType,
+          typename YElementwiseOperation,
+          typename MeanVarGridDesc_M_KBlock,
+          typename CountGridDesc_M_KBlock,
+          typename XYGammaBetaGridDesc_M_K,
+          index_t BlockSize,
+          index_t MThreadClusterSize,
+          index_t KThreadClusterSize,
+          index_t MThreadSliceSize,
+          index_t KThreadSliceSize,
+          index_t XSrcVectorDim,
+          index_t XSrcVectorSize,
+          index_t GammaSrcVectorDim,
+          index_t GammaSrcVectorSize,
+          index_t BetaSrcVectorDim,
+          index_t BetaSrcVectorSize,
+          index_t YDstVectorDim,
+          index_t YDstVectorSize>
+struct GridwiseNormalizationSplitK2nd // merges per-K-block Welford partials, then normalizes x
+{
+    static_assert((XSrcVectorDim == 0 && MThreadSliceSize % XSrcVectorSize == 0) ||
+                      (XSrcVectorDim == 1 && KThreadSliceSize % XSrcVectorSize == 0),
+                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");
+
+    static_assert((YDstVectorDim == 0 && MThreadSliceSize % YDstVectorSize == 0) ||
+                      (YDstVectorDim == 1 && KThreadSliceSize % YDstVectorSize == 0),
+                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");
+    // x, gamma, beta and y share one thread buffer layout, so their vector widths must match.
+    static_assert(XSrcVectorSize == YDstVectorSize);
+    static_assert(XSrcVectorSize == GammaSrcVectorSize);
+    static_assert(XSrcVectorSize == BetaSrcVectorSize);
+    // When vectorizing along M (dim 0), thread cluster and buffer access orders are swapped.
+    static constexpr bool reorder_thread_cluster = (XSrcVectorDim == 0);
+
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+
+    using ThreadClusterLengths_M_K = Sequence<MThreadClusterSize, KThreadClusterSize>;
+
+    using ThreadBufferDimAccessOrder =
+        typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;
+
+    using ThreadClusterArrangeOrder =
+        typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;
+
+    static constexpr auto thread_cluster_desc =
+        make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});
+
+    using ThreadBufferLengths_M_K                = Sequence<MThreadSliceSize, XSrcVectorSize>;
+    static constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<MThreadSliceSize>{}, Number<XSrcVectorSize>{}));
+
+    using ThreadBufferLengths_M_1 = Sequence<MThreadSliceSize, 1>;
+    static constexpr auto thread_buffer_desc_m_1 =
+        make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}, I1));
+
+    using ThreadWelfordSrcDesc_M_1 = decltype(thread_buffer_desc_m_1);
+    using ThreadWelfordDstDesc_M =
+        decltype(make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{})));
+
+    using ThreadwiseWelford =
+        ThreadwiseWelfordMerge<ComputeDataType, ThreadWelfordSrcDesc_M_1, ThreadWelfordDstDesc_M>;
+
+    using BlockwiseWelford = BlockwiseWelford<ComputeDataType,
+                                              BlockSize,
+                                              ThreadClusterLengths_M_K,
+                                              ThreadClusterArrangeOrder>;
+
+    using PassThroughOp = tensor_operation::element_wise::PassThrough;
+    // Block tile extents; one K step covers XSrcVectorSize elements per K-cluster thread.
+    static constexpr index_t M_BlockTileSize     = MThreadClusterSize * MThreadSliceSize;
+    static constexpr index_t K_BlockTileSize     = KThreadClusterSize * KThreadSliceSize;
+    static constexpr index_t K_BlockTileStepSize = KThreadClusterSize * XSrcVectorSize;
+    // Number of XSrcVectorSize-wide chunks that make up one thread's K slice.
+    static constexpr auto ThreadBufferNumber = Number<KThreadSliceSize / XSrcVectorSize>{};
+    // Kernel body: merge partial mean/var/count along the K-block axis, then write this block's y.
+    __device__ static void Run(const MeanVarGridDesc_M_KBlock& mean_var_grid_desc_m_kblock,
+                               const CountGridDesc_M_KBlock& count_grid_desc_m_kblock,
+                               const XYGammaBetaGridDesc_M_K& x_grid_desc_m_k,
+                               const XYGammaBetaGridDesc_M_K& gamma_grid_desc_m_k,
+                               const XYGammaBetaGridDesc_M_K& beta_grid_desc_m_k,
+                               const XYGammaBetaGridDesc_M_K& y_grid_desc_m_k,
+                               index_t num_k_mean_var_count_iteration,
+                               index_t num_k_block_tile_iteration,
+                               index_t k_grid_size,
+                               ComputeDataType epsilon,
+                               const MeanVarDataType* const p_mean_global,
+                               const MeanVarDataType* const p_variance_global,
+                               const int32_t* const p_welford_count_global,
+                               const XDataType* const __restrict__ p_x_global,
+                               const GammaDataType* const __restrict__ p_gamma_global,
+                               const BetaDataType* const __restrict__ p_beta_global,
+                               YDataType* const __restrict__ p_y_global,
+                               const YElementwiseOperation y_elementwise_op)
+    {
+        // Thread/Block id: the 1-D block id is split into (m, k) block coordinates by k_grid_size
+        const index_t thread_local_id    = get_thread_local_1d_id();
+        const index_t block_global_id    = get_block_1d_id();
+        const index_t block_m_cluster_id = block_global_id / k_grid_size;
+        const index_t block_k_cluster_id = block_global_id % k_grid_size;
+        const auto thread_cluster_idx =
+            thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id));
+
+        const auto thread_m_cluster_id = thread_cluster_idx[I0];
+        const auto thread_k_cluster_id = thread_cluster_idx[I1];
+
+        // Global Memory: wrap the raw pointers in buffers sized by their descriptors' element space
+        const auto mean_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_mean_global, mean_var_grid_desc_m_kblock.GetElementSpaceSize());
+
+        const auto var_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_variance_global, mean_var_grid_desc_m_kblock.GetElementSpaceSize());
+
+        const auto welford_count_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_welford_count_global, count_grid_desc_m_kblock.GetElementSpaceSize());
+
+        const auto x_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_x_global, x_grid_desc_m_k.GetElementSpaceSize());
+
+        const auto gamma_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_gamma_global, gamma_grid_desc_m_k.GetElementSpaceSize());
+
+        const auto beta_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_beta_global, beta_grid_desc_m_k.GetElementSpaceSize());
+
+        auto y_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_y_global, y_grid_desc_m_k.GetElementSpaceSize());
+
+        // VGPR: in_* hold the partials just loaded; the others accumulate the merged statistics
+        StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MThreadSliceSize, true>
+            in_mean_thread_buf;
+        StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MThreadSliceSize, true>
+            in_var_thread_buf;
+        StaticBuffer<AddressSpaceEnum::Vgpr, int32_t, MThreadSliceSize, true>
+            in_welford_count_thread_buf;
+        StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MThreadSliceSize, true>
+            mean_thread_buf;
+        StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MThreadSliceSize, true>
+            var_thread_buf;
+        StaticBuffer<AddressSpaceEnum::Vgpr, int32_t, MThreadSliceSize, true>
+            welford_count_thread_buf;
+
+        auto x_thread_buf = generate_tuple(
+            [&](auto) {
+                return StaticBuffer<AddressSpaceEnum::Vgpr,
+                                    ComputeDataType,
+                                    MThreadSliceSize * XSrcVectorSize,
+                                    true>{};
+            },
+            Number<ThreadBufferNumber>{});
+
+        auto gamma_thread_buf = generate_tuple(
+            [&](auto) {
+                return StaticBuffer<AddressSpaceEnum::Vgpr,
+                                    ComputeDataType,
+                                    MThreadSliceSize * GammaSrcVectorSize,
+                                    true>{};
+            },
+            Number<ThreadBufferNumber>{});
+        // Aliases: beta is loaded over gamma once gamma is applied; y is computed in place over x.
+        auto& beta_thread_buf = gamma_thread_buf;
+        auto& y_thread_buf    = x_thread_buf;
+
+        // IO: stats loaders index the K-block axis; x/gamma/beta/y start at this block's K offset
+        auto threadwise_mean_var_load_m_kblock =
+            ThreadwiseTensorSliceTransfer_v2<MeanVarDataType,
+                                             ComputeDataType,
+                                             MeanVarGridDesc_M_KBlock,
+                                             decltype(thread_buffer_desc_m_1),
+                                             ThreadBufferLengths_M_1,
+                                             Sequence<0, 1>,
+                                             1,
+                                             1,
+                                             1,
+                                             true>(
+                mean_var_grid_desc_m_kblock,
+                make_multi_index(block_m_cluster_id * M_BlockTileSize +
+                                     thread_m_cluster_id * MThreadSliceSize,
+                                 thread_k_cluster_id));
+
+        auto threadwise_count_load_m_kblock =
+            ThreadwiseTensorSliceTransfer_v2<int32_t,
+                                             int32_t,
+                                             CountGridDesc_M_KBlock,
+                                             decltype(thread_buffer_desc_m_1),
+                                             ThreadBufferLengths_M_1,
+                                             Sequence<0, 1>,
+                                             1,
+                                             1,
+                                             1,
+                                             true>(
+                count_grid_desc_m_kblock,
+                make_multi_index(block_m_cluster_id * M_BlockTileSize +
+                                     thread_m_cluster_id * MThreadSliceSize,
+                                 thread_k_cluster_id));
+
+        auto threadwise_x_load = ThreadwiseTensorSliceTransfer_v2<XDataType,
+                                                                  ComputeDataType,
+                                                                  XYGammaBetaGridDesc_M_K,
+                                                                  decltype(thread_buffer_desc_m_k),
+                                                                  ThreadBufferLengths_M_K,
+                                                                  ThreadBufferDimAccessOrder,
+                                                                  XSrcVectorDim,
+                                                                  XSrcVectorSize,
+                                                                  1,
+                                                                  true>(
+            x_grid_desc_m_k,
+            make_multi_index(block_m_cluster_id * M_BlockTileSize +
+                                 thread_m_cluster_id * MThreadSliceSize,
+                             block_k_cluster_id * K_BlockTileSize * num_k_block_tile_iteration +
+                                 thread_k_cluster_id * XSrcVectorSize));
+
+        auto threadwise_gamma_load =
+            ThreadwiseTensorSliceTransfer_v2<GammaDataType,
+                                             ComputeDataType,
+                                             XYGammaBetaGridDesc_M_K,
+                                             decltype(thread_buffer_desc_m_k),
+                                             ThreadBufferLengths_M_K,
+                                             ThreadBufferDimAccessOrder,
+                                             GammaSrcVectorDim,
+                                             GammaSrcVectorSize,
+                                             1,
+                                             true>(
+                gamma_grid_desc_m_k,
+                make_multi_index(block_m_cluster_id * M_BlockTileSize +
+                                     thread_m_cluster_id * MThreadSliceSize,
+                                 block_k_cluster_id * K_BlockTileSize * num_k_block_tile_iteration +
+                                     thread_k_cluster_id * GammaSrcVectorSize));
+
+        auto threadwise_beta_load =
+            ThreadwiseTensorSliceTransfer_v2<BetaDataType,
+                                             ComputeDataType,
+                                             XYGammaBetaGridDesc_M_K,
+                                             decltype(thread_buffer_desc_m_k),
+                                             ThreadBufferLengths_M_K,
+                                             ThreadBufferDimAccessOrder,
+                                             BetaSrcVectorDim,
+                                             BetaSrcVectorSize,
+                                             1,
+                                             true>(
+                beta_grid_desc_m_k,
+                make_multi_index(block_m_cluster_id * M_BlockTileSize +
+                                     thread_m_cluster_id * MThreadSliceSize,
+                                 block_k_cluster_id * K_BlockTileSize * num_k_block_tile_iteration +
+                                     thread_k_cluster_id * BetaSrcVectorSize));
+
+        auto threadwise_y_store =
+            ThreadwiseTensorSliceTransfer_v1r3<ComputeDataType,
+                                               YDataType,
+                                               decltype(thread_buffer_desc_m_k),
+                                               XYGammaBetaGridDesc_M_K,
+                                               YElementwiseOperation,
+                                               ThreadBufferLengths_M_K,
+                                               ThreadBufferDimAccessOrder,
+                                               YDstVectorDim,
+                                               YDstVectorSize,
+                                               InMemoryDataOperationEnum::Set,
+                                               1,
+                                               true>(
+                y_grid_desc_m_k,
+                make_multi_index(block_m_cluster_id * M_BlockTileSize +
+                                     thread_m_cluster_id * MThreadSliceSize,
+                                 block_k_cluster_id * K_BlockTileSize * num_k_block_tile_iteration +
+                                     thread_k_cluster_id * YDstVectorSize),
+                y_elementwise_op);
+
+        // step1: Merge mean and variance; each thread strides over K blocks by KThreadClusterSize
+        constexpr auto mean_var_count_thread_copy_step_I0_k =
+            make_multi_index(I0, KThreadClusterSize);
+
+        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
+            mean_thread_buf(I)          = type_convert<ComputeDataType>(0.0f);
+            var_thread_buf(I)           = type_convert<ComputeDataType>(0.0f);
+            welford_count_thread_buf(I) = 0;
+        });
+
+        for(index_t k = 0; k < num_k_mean_var_count_iteration; ++k)
+        {
+            threadwise_mean_var_load_m_kblock.Run(mean_var_grid_desc_m_kblock,
+                                                  mean_global_val_buf,
+                                                  thread_buffer_desc_m_1,
+                                                  make_tuple(I0, I0),
+                                                  in_mean_thread_buf);
+
+            threadwise_mean_var_load_m_kblock.Run(mean_var_grid_desc_m_kblock,
+                                                  var_global_val_buf,
+                                                  thread_buffer_desc_m_1,
+                                                  make_tuple(I0, I0),
+                                                  in_var_thread_buf);
+
+            threadwise_count_load_m_kblock.Run(count_grid_desc_m_kblock,
+                                               welford_count_global_val_buf,
+                                               thread_buffer_desc_m_1,
+                                               make_tuple(I0, I0),
+                                               in_welford_count_thread_buf);
+
+            ThreadwiseWelford::Run(in_mean_thread_buf,
+                                   in_var_thread_buf,
+                                   in_welford_count_thread_buf,
+                                   mean_thread_buf,
+                                   var_thread_buf,
+                                   welford_count_thread_buf);
+
+            threadwise_mean_var_load_m_kblock.MoveSrcSliceWindow(
+                mean_var_grid_desc_m_kblock, mean_var_count_thread_copy_step_I0_k);
+            threadwise_count_load_m_kblock.MoveSrcSliceWindow(count_grid_desc_m_kblock,
+                                                              mean_var_count_thread_copy_step_I0_k);
+        }
+        // Combine the per-thread merges across the block via BlockwiseWelford, one row I at a time.
+        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
+            if constexpr(I > 0)
+                block_sync_lds(); // NOTE(review): presumably guards LDS reuse between rows - confirm
+
+            BlockwiseWelford::Run(
+                mean_thread_buf(I), var_thread_buf(I), welford_count_thread_buf(I));
+        });
+
+        // step2: normalization, y = (x - mean) / sqrt(var + epsilon) * gamma + beta, tile by tile
+        constexpr auto thread_copy_fwd_step_m_k = make_multi_index(0, K_BlockTileStepSize);
+
+        for(index_t k = 0; k < num_k_block_tile_iteration; ++k)
+        {
+            static_for<0, ThreadBufferNumber, 1>{}([&](auto i) {
+                threadwise_x_load.Run(x_grid_desc_m_k,
+                                      x_global_val_buf,
+                                      thread_buffer_desc_m_k,
+                                      make_tuple(I0, I0),
+                                      x_thread_buf(i));
+                threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_fwd_step_m_k);
+            });
+
+            static_for<0, ThreadBufferNumber, 1>{}([&](auto i) {
+                threadwise_gamma_load.Run(gamma_grid_desc_m_k,
+                                          gamma_global_val_buf,
+                                          thread_buffer_desc_m_k,
+                                          make_tuple(I0, I0),
+                                          gamma_thread_buf(i));
+
+                threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k,
+                                                         thread_copy_fwd_step_m_k);
+            });
+
+            static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
+                auto divisor = 1 / ck::math::sqrt(var_thread_buf(iM) + epsilon);
+                static_for<0, ThreadBufferNumber, 1>{}([&](auto iK0) {
+                    static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) {
+                        constexpr auto offset_m_k =
+                            thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK1));
+
+                        // normalize: (x - mean) / sqrt(var + epsilon), stored in place
+                        y_thread_buf(iK0)(Number<offset_m_k>{}) =
+                            (x_thread_buf(iK0)(Number<offset_m_k>{}) - mean_thread_buf(iM)) *
+                            divisor;
+
+                        // gamma: scale by the matching gamma element
+                        y_thread_buf(iK0)(Number<offset_m_k>{}) =
+                            y_thread_buf(iK0)(Number<offset_m_k>{}) *
+                            gamma_thread_buf(iK0)(Number<offset_m_k>{});
+                    });
+                });
+            });
+
+            static_for<0, ThreadBufferNumber, 1>{}([&](auto i) {
+                threadwise_beta_load.Run(beta_grid_desc_m_k,
+                                         beta_global_val_buf,
+                                         thread_buffer_desc_m_k,
+                                         make_tuple(I0, I0),
+                                         beta_thread_buf(i));
+                threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k,
+                                                        thread_copy_fwd_step_m_k);
+            });
+
+            static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
+                static_for<0, ThreadBufferNumber, 1>{}([&](auto iK0) {
+                    static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) {
+                        constexpr auto offset_m_k =
+                            thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK1));
+
+                        // beta: shift by the matching beta element (held in gamma's buffer)
+                        y_thread_buf(iK0)(Number<offset_m_k>{}) =
+                            y_thread_buf(iK0)(Number<offset_m_k>{}) +
+                            beta_thread_buf(iK0)(Number<offset_m_k>{});
+                    });
+                });
+            });
+
+            static_for<0, ThreadBufferNumber, 1>{}([&](auto i) {
+                threadwise_y_store.Run(thread_buffer_desc_m_k,
+                                       make_tuple(I0, I0),
+                                       y_thread_buf(i),
+                                       y_grid_desc_m_k,
+                                       y_global_val_buf);
+                threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, thread_copy_fwd_step_m_k);
+            });
+        } // end for (normalization)
+    }
+};
+
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp b/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_welford_variance.hpp
similarity index 100%
rename from include/ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp
rename to include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_welford_variance.hpp
-- 
GitLab


From 642d5e9155a16c96b01eee7b8ef0e9d558fc2e16 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= <bartlomiejkocot98@gmail.com>
Date: Mon, 15 May 2023 16:46:52 +0200
Subject: [PATCH 38/71] Add contraction profiler and tests (#701)

* Add contraction profiler and tests

* Build and style fixes

* Allow any elementwise operator to be used for ref_contraction

* Introduce profile_contraction_scale and profile_contraction_bilinear

* Make ref_contraction generic and extend interface tests

* Stylistic minor fixes

* Extend test_contraction_interface
---
 .../contraction_bilinear_xdl_fp32.cpp         | 168 +--------
 .../contraction_bilinear_xdl_fp64.cpp         | 168 +--------
 .../contraction_scale_xdl_fp32.cpp            | 169 +--------
 .../contraction_scale_xdl_fp64.cpp            | 169 +--------
 .../cpu/reference_contraction.hpp             | 146 ++++++++
 profiler/README.md                            |  30 ++
 .../profiler/profile_contraction_impl.hpp     | 345 ++++++++++++++++++
 .../profiler/profile_contraction_utils.hpp    |  51 +++
 profiler/src/CMakeLists.txt                   |   4 +
 profiler/src/profile_contraction_bilinear.cpp | 165 +++++++++
 profiler/src/profile_contraction_scale.cpp    | 162 ++++++++
 test/CMakeLists.txt                           |   1 +
 test/contraction/CMakeLists.txt               |   4 +
 test/contraction/test_contraction.cpp         | 138 +++++++
 .../test_contraction_interface.cpp            | 195 ++++++++++
 15 files changed, 1311 insertions(+), 604 deletions(-)
 create mode 100644 library/include/ck/library/reference_tensor_operation/cpu/reference_contraction.hpp
 create mode 100644 profiler/include/profiler/profile_contraction_impl.hpp
 create mode 100644 profiler/include/profiler/profile_contraction_utils.hpp
 create mode 100644 profiler/src/profile_contraction_bilinear.cpp
 create mode 100644 profiler/src/profile_contraction_scale.cpp
 create mode 100644 test/contraction/CMakeLists.txt
 create mode 100644 test/contraction/test_contraction.cpp
 create mode 100644 test/contraction/test_contraction_interface.cpp

diff --git a/example/26_contraction/contraction_bilinear_xdl_fp32.cpp b/example/26_contraction/contraction_bilinear_xdl_fp32.cpp
index ea105e4ff..6004db6e0 100644
--- a/example/26_contraction/contraction_bilinear_xdl_fp32.cpp
+++ b/example/26_contraction/contraction_bilinear_xdl_fp32.cpp
@@ -16,6 +16,7 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/numeric.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_contraction.hpp"
 
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -74,141 +75,6 @@ using DeviceOpInstanceMNNN = ck::tensor_operation::device::
 
 using DeviceOpInstance = DeviceOpInstanceKKNN;
 
-// hardcoded for NumDimM == NumDimN == NumDimK == 2
-template <ck::index_t NumDimM,
-          ck::index_t NumDimN,
-          ck::index_t NumDimK,
-          typename ADataType,
-          typename BDataType,
-          typename EDataType,
-          typename AccDataType,
-          typename AElementwiseOperation,
-          typename BElementwiseOperation,
-          typename CDEElementwiseOperation,
-          ck::enable_if_t<NumDimM == 2 && NumDimN == 2 && NumDimK == 2, bool> = false>
-struct ReferenceContraction_M2_N2_K2 : public ck::tensor_operation::device::BaseOperator
-{
-    // Argument
-    struct Argument : public ck::tensor_operation::device::BaseArgument
-    {
-        Argument(const Tensor<ADataType>& a_ms_ks,
-                 const Tensor<BDataType>& b_ns_ks,
-                 Tensor<EDataType>& e_ms_ns,
-                 AElementwiseOperation a_element_op,
-                 BElementwiseOperation b_element_op,
-                 CDEElementwiseOperation cde_element_op)
-            : a_ms_ks_{a_ms_ks},
-              b_ns_ks_{b_ns_ks},
-              e_ms_ns_{e_ms_ns},
-              a_element_op_{a_element_op},
-              b_element_op_{b_element_op},
-              cde_element_op_{cde_element_op}
-        {
-        }
-
-        const Tensor<ADataType>& a_ms_ks_;
-        const Tensor<BDataType>& b_ns_ks_;
-        Tensor<EDataType>& e_ms_ns_;
-
-        AElementwiseOperation a_element_op_;
-        BElementwiseOperation b_element_op_;
-        CDEElementwiseOperation cde_element_op_;
-    };
-
-    // Invoker
-    struct Invoker : public ck::tensor_operation::device::BaseInvoker
-    {
-        using Argument = ReferenceContraction_M2_N2_K2::Argument;
-
-        float Run(const Argument& arg)
-        {
-            auto f_ms_ns = [&](auto m0, auto m1, auto n0, auto n1) {
-                const int K0 = arg.a_ms_ks_.mDesc.GetLengths()[2];
-                const int K1 = arg.a_ms_ks_.mDesc.GetLengths()[3];
-
-                AccDataType v_acc = 0;
-
-                for(int k0 = 0; k0 < K0; ++k0)
-                {
-                    for(int k1 = 0; k1 < K1; ++k1)
-                    {
-                        AccDataType v_a;
-                        AccDataType v_b;
-
-                        arg.a_element_op_(
-                            v_a, ck::type_convert<const AccDataType>(arg.a_ms_ks_(m0, m1, k0, k1)));
-                        arg.b_element_op_(
-                            v_b, ck::type_convert<const AccDataType>(arg.b_ns_ks_(n0, n1, k0, k1)));
-
-                        v_acc += v_a * v_b;
-                    }
-                }
-
-                AccDataType v_c;
-
-                arg.cde_element_op_(v_c, v_acc);
-
-                arg.e_ms_ns_(m0, m1, n0, n1) = v_c;
-            };
-
-            make_ParallelTensorFunctor(f_ms_ns,
-                                       arg.e_ms_ns_.mDesc.GetLengths()[0],
-                                       arg.e_ms_ns_.mDesc.GetLengths()[1],
-                                       arg.e_ms_ns_.mDesc.GetLengths()[2],
-                                       arg.e_ms_ns_.mDesc.GetLengths()[3])(
-                std::thread::hardware_concurrency());
-
-            return 0;
-        }
-
-        float Run(const ck::tensor_operation::device::BaseArgument* p_arg,
-                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
-        {
-            return Run(*dynamic_cast<const Argument*>(p_arg));
-        }
-    };
-
-    static constexpr bool IsValidCompilationParameter()
-    {
-        // TODO: properly implement this check
-        return true;
-    }
-
-    bool IsSupportedArgument(const ck::tensor_operation::device::BaseArgument*) override
-    {
-        return true;
-    }
-
-    static auto MakeArgument(const Tensor<ADataType>& a_ms_ks,
-                             const Tensor<BDataType>& b_ns_ks,
-                             Tensor<EDataType>& e_ms_ns,
-                             AElementwiseOperation a_element_op,
-                             BElementwiseOperation b_element_op,
-                             CDEElementwiseOperation cde_element_op)
-    {
-        return Argument{a_ms_ks, b_ns_ks, e_ms_ns, a_element_op, b_element_op, cde_element_op};
-    }
-
-    static auto MakeInvoker() { return Invoker{}; }
-
-    virtual std::unique_ptr<ck::tensor_operation::device::BaseInvoker> MakeInvokerPointer()
-    {
-        return std::make_unique<Invoker>(Invoker{});
-    }
-
-    std::string GetTypeString() const override
-    {
-        auto str = std::stringstream();
-
-        // clang-format off
-        str << "ReferenceContraction_M2_N2_K2"
-            << std::endl;
-        // clang-format on
-
-        return str.str();
-    }
-};
-
 int main(int argc, char* argv[])
 {
     bool do_verification = true;
@@ -385,22 +251,22 @@ int main(int argc, char* argv[])
     {
         Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
 
-        using ReferenceOpInstance = ReferenceContraction_M2_N2_K2<NumDimM,
-                                                                  NumDimN,
-                                                                  NumDimK,
-                                                                  ADataType,
-                                                                  BDataType,
-                                                                  CShuffleDataType,
-                                                                  AccDataType,
-                                                                  AElementOp,
-                                                                  BElementOp,
-                                                                  PassThrough>;
-
-        auto ref_gemm    = ReferenceOpInstance{};
-        auto ref_invoker = ref_gemm.MakeInvoker();
-
-        auto ref_argument = ref_gemm.MakeArgument(
-            a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op, PassThrough{});
+        using ReferenceOpInstance =
+            ck::tensor_operation::host::ReferenceContraction_M2_N2_K2<NumDimM,
+                                                                      NumDimN,
+                                                                      NumDimK,
+                                                                      ADataType,
+                                                                      BDataType,
+                                                                      CShuffleDataType,
+                                                                      AccDataType,
+                                                                      AElementOp,
+                                                                      BElementOp>;
+
+        auto ref_op      = ReferenceOpInstance{};
+        auto ref_invoker = ref_op.MakeInvoker();
+
+        auto ref_argument =
+            ref_op.MakeArgument(a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op);
 
         ref_invoker.Run(ref_argument);
 
diff --git a/example/26_contraction/contraction_bilinear_xdl_fp64.cpp b/example/26_contraction/contraction_bilinear_xdl_fp64.cpp
index 9a000377b..9576ce3f2 100644
--- a/example/26_contraction/contraction_bilinear_xdl_fp64.cpp
+++ b/example/26_contraction/contraction_bilinear_xdl_fp64.cpp
@@ -16,6 +16,7 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/numeric.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_contraction.hpp"
 
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -74,141 +75,6 @@ using DeviceOpInstanceMNNN = ck::tensor_operation::device::
 
 using DeviceOpInstance = DeviceOpInstanceKKNN;
 
-// hardcoded for NumDimM == NumDimN == NumDimK == 2
-template <ck::index_t NumDimM,
-          ck::index_t NumDimN,
-          ck::index_t NumDimK,
-          typename ADataType,
-          typename BDataType,
-          typename EDataType,
-          typename AccDataType,
-          typename AElementwiseOperation,
-          typename BElementwiseOperation,
-          typename CDEElementwiseOperation,
-          ck::enable_if_t<NumDimM == 2 && NumDimN == 2 && NumDimK == 2, bool> = false>
-struct ReferenceContraction_M2_N2_K2 : public ck::tensor_operation::device::BaseOperator
-{
-    // Argument
-    struct Argument : public ck::tensor_operation::device::BaseArgument
-    {
-        Argument(const Tensor<ADataType>& a_ms_ks,
-                 const Tensor<BDataType>& b_ns_ks,
-                 Tensor<EDataType>& e_ms_ns,
-                 AElementwiseOperation a_element_op,
-                 BElementwiseOperation b_element_op,
-                 CDEElementwiseOperation cde_element_op)
-            : a_ms_ks_{a_ms_ks},
-              b_ns_ks_{b_ns_ks},
-              e_ms_ns_{e_ms_ns},
-              a_element_op_{a_element_op},
-              b_element_op_{b_element_op},
-              cde_element_op_{cde_element_op}
-        {
-        }
-
-        const Tensor<ADataType>& a_ms_ks_;
-        const Tensor<BDataType>& b_ns_ks_;
-        Tensor<EDataType>& e_ms_ns_;
-
-        AElementwiseOperation a_element_op_;
-        BElementwiseOperation b_element_op_;
-        CDEElementwiseOperation cde_element_op_;
-    };
-
-    // Invoker
-    struct Invoker : public ck::tensor_operation::device::BaseInvoker
-    {
-        using Argument = ReferenceContraction_M2_N2_K2::Argument;
-
-        float Run(const Argument& arg)
-        {
-            auto f_ms_ns = [&](auto m0, auto m1, auto n0, auto n1) {
-                const int K0 = arg.a_ms_ks_.mDesc.GetLengths()[2];
-                const int K1 = arg.a_ms_ks_.mDesc.GetLengths()[3];
-
-                AccDataType v_acc = 0;
-
-                for(int k0 = 0; k0 < K0; ++k0)
-                {
-                    for(int k1 = 0; k1 < K1; ++k1)
-                    {
-                        AccDataType v_a;
-                        AccDataType v_b;
-
-                        arg.a_element_op_(
-                            v_a, ck::type_convert<const AccDataType>(arg.a_ms_ks_(m0, m1, k0, k1)));
-                        arg.b_element_op_(
-                            v_b, ck::type_convert<const AccDataType>(arg.b_ns_ks_(n0, n1, k0, k1)));
-
-                        v_acc += v_a * v_b;
-                    }
-                }
-
-                AccDataType v_c;
-
-                arg.cde_element_op_(v_c, v_acc);
-
-                arg.e_ms_ns_(m0, m1, n0, n1) = v_c;
-            };
-
-            make_ParallelTensorFunctor(f_ms_ns,
-                                       arg.e_ms_ns_.mDesc.GetLengths()[0],
-                                       arg.e_ms_ns_.mDesc.GetLengths()[1],
-                                       arg.e_ms_ns_.mDesc.GetLengths()[2],
-                                       arg.e_ms_ns_.mDesc.GetLengths()[3])(
-                std::thread::hardware_concurrency());
-
-            return 0;
-        }
-
-        float Run(const ck::tensor_operation::device::BaseArgument* p_arg,
-                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
-        {
-            return Run(*dynamic_cast<const Argument*>(p_arg));
-        }
-    };
-
-    static constexpr bool IsValidCompilationParameter()
-    {
-        // TODO: properly implement this check
-        return true;
-    }
-
-    bool IsSupportedArgument(const ck::tensor_operation::device::BaseArgument*) override
-    {
-        return true;
-    }
-
-    static auto MakeArgument(const Tensor<ADataType>& a_ms_ks,
-                             const Tensor<BDataType>& b_ns_ks,
-                             Tensor<EDataType>& e_ms_ns,
-                             AElementwiseOperation a_element_op,
-                             BElementwiseOperation b_element_op,
-                             CDEElementwiseOperation cde_element_op)
-    {
-        return Argument{a_ms_ks, b_ns_ks, e_ms_ns, a_element_op, b_element_op, cde_element_op};
-    }
-
-    static auto MakeInvoker() { return Invoker{}; }
-
-    virtual std::unique_ptr<ck::tensor_operation::device::BaseInvoker> MakeInvokerPointer()
-    {
-        return std::make_unique<Invoker>(Invoker{});
-    }
-
-    std::string GetTypeString() const override
-    {
-        auto str = std::stringstream();
-
-        // clang-format off
-        str << "ReferenceContraction_M2_N2_K2"
-            << std::endl;
-        // clang-format on
-
-        return str.str();
-    }
-};
-
 int main(int argc, char* argv[])
 {
     bool do_verification = true;
@@ -385,22 +251,22 @@ int main(int argc, char* argv[])
     {
         Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
 
-        using ReferenceOpInstance = ReferenceContraction_M2_N2_K2<NumDimM,
-                                                                  NumDimN,
-                                                                  NumDimK,
-                                                                  ADataType,
-                                                                  BDataType,
-                                                                  CShuffleDataType,
-                                                                  AccDataType,
-                                                                  AElementOp,
-                                                                  BElementOp,
-                                                                  PassThrough>;
-
-        auto ref_gemm    = ReferenceOpInstance{};
-        auto ref_invoker = ref_gemm.MakeInvoker();
-
-        auto ref_argument = ref_gemm.MakeArgument(
-            a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op, PassThrough{});
+        using ReferenceOpInstance =
+            ck::tensor_operation::host::ReferenceContraction_M2_N2_K2<NumDimM,
+                                                                      NumDimN,
+                                                                      NumDimK,
+                                                                      ADataType,
+                                                                      BDataType,
+                                                                      CShuffleDataType,
+                                                                      AccDataType,
+                                                                      AElementOp,
+                                                                      BElementOp>;
+
+        auto ref_op      = ReferenceOpInstance{};
+        auto ref_invoker = ref_op.MakeInvoker();
+
+        auto ref_argument =
+            ref_op.MakeArgument(a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op);
 
         ref_invoker.Run(ref_argument);
 
diff --git a/example/26_contraction/contraction_scale_xdl_fp32.cpp b/example/26_contraction/contraction_scale_xdl_fp32.cpp
index 26f176b05..3aa2a7ba9 100644
--- a/example/26_contraction/contraction_scale_xdl_fp32.cpp
+++ b/example/26_contraction/contraction_scale_xdl_fp32.cpp
@@ -16,6 +16,7 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/numeric.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_contraction.hpp"
 
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -73,141 +74,6 @@ using DeviceOpInstanceMNN = ck::tensor_operation::device::
 
 using DeviceOpInstance = DeviceOpInstanceKKN;
 
-// hardcoded for NumDimM == NumDimN == NumDimK == 2
-template <ck::index_t NumDimM,
-          ck::index_t NumDimN,
-          ck::index_t NumDimK,
-          typename ADataType,
-          typename BDataType,
-          typename EDataType,
-          typename AccDataType,
-          typename AElementwiseOperation,
-          typename BElementwiseOperation,
-          typename CDEElementwiseOperation,
-          ck::enable_if_t<NumDimM == 2 && NumDimN == 2 && NumDimK == 2, bool> = false>
-struct ReferenceContraction_M2_N2_K2 : public ck::tensor_operation::device::BaseOperator
-{
-    // Argument
-    struct Argument : public ck::tensor_operation::device::BaseArgument
-    {
-        Argument(const Tensor<ADataType>& a_ms_ks,
-                 const Tensor<BDataType>& b_ns_ks,
-                 Tensor<EDataType>& e_ms_ns,
-                 AElementwiseOperation a_element_op,
-                 BElementwiseOperation b_element_op,
-                 CDEElementwiseOperation cde_element_op)
-            : a_ms_ks_{a_ms_ks},
-              b_ns_ks_{b_ns_ks},
-              e_ms_ns_{e_ms_ns},
-              a_element_op_{a_element_op},
-              b_element_op_{b_element_op},
-              cde_element_op_{cde_element_op}
-        {
-        }
-
-        const Tensor<ADataType>& a_ms_ks_;
-        const Tensor<BDataType>& b_ns_ks_;
-        Tensor<EDataType>& e_ms_ns_;
-
-        AElementwiseOperation a_element_op_;
-        BElementwiseOperation b_element_op_;
-        CDEElementwiseOperation cde_element_op_;
-    };
-
-    // Invoker
-    struct Invoker : public ck::tensor_operation::device::BaseInvoker
-    {
-        using Argument = ReferenceContraction_M2_N2_K2::Argument;
-
-        float Run(const Argument& arg)
-        {
-            auto f_ms_ns = [&](auto m0, auto m1, auto n0, auto n1) {
-                const int K0 = arg.a_ms_ks_.mDesc.GetLengths()[2];
-                const int K1 = arg.a_ms_ks_.mDesc.GetLengths()[3];
-
-                AccDataType v_acc = 0;
-
-                for(int k0 = 0; k0 < K0; ++k0)
-                {
-                    for(int k1 = 0; k1 < K1; ++k1)
-                    {
-                        AccDataType v_a;
-                        AccDataType v_b;
-
-                        arg.a_element_op_(
-                            v_a, ck::type_convert<const AccDataType>(arg.a_ms_ks_(m0, m1, k0, k1)));
-                        arg.b_element_op_(
-                            v_b, ck::type_convert<const AccDataType>(arg.b_ns_ks_(n0, n1, k0, k1)));
-
-                        v_acc += v_a * v_b;
-                    }
-                }
-
-                AccDataType v_c;
-
-                arg.cde_element_op_(v_c, v_acc);
-
-                arg.e_ms_ns_(m0, m1, n0, n1) = v_c;
-            };
-
-            make_ParallelTensorFunctor(f_ms_ns,
-                                       arg.e_ms_ns_.mDesc.GetLengths()[0],
-                                       arg.e_ms_ns_.mDesc.GetLengths()[1],
-                                       arg.e_ms_ns_.mDesc.GetLengths()[2],
-                                       arg.e_ms_ns_.mDesc.GetLengths()[3])(
-                std::thread::hardware_concurrency());
-
-            return 0;
-        }
-
-        float Run(const ck::tensor_operation::device::BaseArgument* p_arg,
-                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
-        {
-            return Run(*dynamic_cast<const Argument*>(p_arg));
-        }
-    };
-
-    static constexpr bool IsValidCompilationParameter()
-    {
-        // TODO: properly implement this check
-        return true;
-    }
-
-    bool IsSupportedArgument(const ck::tensor_operation::device::BaseArgument*) override
-    {
-        return true;
-    }
-
-    static auto MakeArgument(const Tensor<ADataType>& a_ms_ks,
-                             const Tensor<BDataType>& b_ns_ks,
-                             Tensor<EDataType>& e_ms_ns,
-                             AElementwiseOperation a_element_op,
-                             BElementwiseOperation b_element_op,
-                             CDEElementwiseOperation cde_element_op)
-    {
-        return Argument{a_ms_ks, b_ns_ks, e_ms_ns, a_element_op, b_element_op, cde_element_op};
-    }
-
-    static auto MakeInvoker() { return Invoker{}; }
-
-    virtual std::unique_ptr<ck::tensor_operation::device::BaseInvoker> MakeInvokerPointer()
-    {
-        return std::make_unique<Invoker>(Invoker{});
-    }
-
-    std::string GetTypeString() const override
-    {
-        auto str = std::stringstream();
-
-        // clang-format off
-        str << "ReferenceContraction_M2_N2_K2"
-            << std::endl;
-        // clang-format on
-
-        return str.str();
-    }
-};
-
 int main(int argc, char* argv[])
 {
     bool do_verification = true;
@@ -368,22 +234,23 @@ int main(int argc, char* argv[])
     {
         Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
 
-        using ReferenceOpInstance = ReferenceContraction_M2_N2_K2<NumDimM,
-                                                                  NumDimN,
-                                                                  NumDimK,
-                                                                  ADataType,
-                                                                  BDataType,
-                                                                  CShuffleDataType,
-                                                                  AccDataType,
-                                                                  AElementOp,
-                                                                  BElementOp,
-                                                                  PassThrough>;
-
-        auto ref_gemm    = ReferenceOpInstance{};
-        auto ref_invoker = ref_gemm.MakeInvoker();
-
-        auto ref_argument = ref_gemm.MakeArgument(
-            a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op, PassThrough{});
+        using ReferenceOpInstance =
+            ck::tensor_operation::host::ReferenceContraction_M2_N2_K2<NumDimM,
+                                                                      NumDimN,
+                                                                      NumDimK,
+                                                                      ADataType,
+                                                                      BDataType,
+                                                                      CShuffleDataType,
+                                                                      AccDataType,
+                                                                      AElementOp,
+                                                                      BElementOp>;
+
+        auto ref_op      = ReferenceOpInstance{};
+        auto ref_invoker = ref_op.MakeInvoker();
+
+        Tensor<float> empty_tensor(std::vector<ck::index_t>{}, std::vector<ck::index_t>{});
+        auto ref_argument =
+            ref_op.MakeArgument(a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op);
 
         ref_invoker.Run(ref_argument);
 
diff --git a/example/26_contraction/contraction_scale_xdl_fp64.cpp b/example/26_contraction/contraction_scale_xdl_fp64.cpp
index 38ed60266..cccf6505c 100644
--- a/example/26_contraction/contraction_scale_xdl_fp64.cpp
+++ b/example/26_contraction/contraction_scale_xdl_fp64.cpp
@@ -16,6 +16,7 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/numeric.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_contraction.hpp"
 
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -73,141 +74,6 @@ using DeviceOpInstanceMNN = ck::tensor_operation::device::
 
 using DeviceOpInstance = DeviceOpInstanceKKN;
 
-// hardcoded for NumDimM == NumDimN == NumDimK == 2
-template <ck::index_t NumDimM,
-          ck::index_t NumDimN,
-          ck::index_t NumDimK,
-          typename ADataType,
-          typename BDataType,
-          typename EDataType,
-          typename AccDataType,
-          typename AElementwiseOperation,
-          typename BElementwiseOperation,
-          typename CDEElementwiseOperation,
-          ck::enable_if_t<NumDimM == 2 && NumDimN == 2 && NumDimK == 2, bool> = false>
-struct ReferenceContraction_M2_N2_K2 : public ck::tensor_operation::device::BaseOperator
-{
-    // Argument
-    struct Argument : public ck::tensor_operation::device::BaseArgument
-    {
-        Argument(const Tensor<ADataType>& a_ms_ks,
-                 const Tensor<BDataType>& b_ns_ks,
-                 Tensor<EDataType>& e_ms_ns,
-                 AElementwiseOperation a_element_op,
-                 BElementwiseOperation b_element_op,
-                 CDEElementwiseOperation cde_element_op)
-            : a_ms_ks_{a_ms_ks},
-              b_ns_ks_{b_ns_ks},
-              e_ms_ns_{e_ms_ns},
-              a_element_op_{a_element_op},
-              b_element_op_{b_element_op},
-              cde_element_op_{cde_element_op}
-        {
-        }
-
-        const Tensor<ADataType>& a_ms_ks_;
-        const Tensor<BDataType>& b_ns_ks_;
-        Tensor<EDataType>& e_ms_ns_;
-
-        AElementwiseOperation a_element_op_;
-        BElementwiseOperation b_element_op_;
-        CDEElementwiseOperation cde_element_op_;
-    };
-
-    // Invoker
-    struct Invoker : public ck::tensor_operation::device::BaseInvoker
-    {
-        using Argument = ReferenceContraction_M2_N2_K2::Argument;
-
-        float Run(const Argument& arg)
-        {
-            auto f_ms_ns = [&](auto m0, auto m1, auto n0, auto n1) {
-                const int K0 = arg.a_ms_ks_.mDesc.GetLengths()[2];
-                const int K1 = arg.a_ms_ks_.mDesc.GetLengths()[3];
-
-                AccDataType v_acc = 0;
-
-                for(int k0 = 0; k0 < K0; ++k0)
-                {
-                    for(int k1 = 0; k1 < K1; ++k1)
-                    {
-                        AccDataType v_a;
-                        AccDataType v_b;
-
-                        arg.a_element_op_(
-                            v_a, ck::type_convert<const AccDataType>(arg.a_ms_ks_(m0, m1, k0, k1)));
-                        arg.b_element_op_(
-                            v_b, ck::type_convert<const AccDataType>(arg.b_ns_ks_(n0, n1, k0, k1)));
-
-                        v_acc += v_a * v_b;
-                    }
-                }
-
-                AccDataType v_c;
-
-                arg.cde_element_op_(v_c, v_acc);
-
-                arg.e_ms_ns_(m0, m1, n0, n1) = v_c;
-            };
-
-            make_ParallelTensorFunctor(f_ms_ns,
-                                       arg.e_ms_ns_.mDesc.GetLengths()[0],
-                                       arg.e_ms_ns_.mDesc.GetLengths()[1],
-                                       arg.e_ms_ns_.mDesc.GetLengths()[2],
-                                       arg.e_ms_ns_.mDesc.GetLengths()[3])(
-                std::thread::hardware_concurrency());
-
-            return 0;
-        }
-
-        float Run(const ck::tensor_operation::device::BaseArgument* p_arg,
-                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
-        {
-            return Run(*dynamic_cast<const Argument*>(p_arg));
-        }
-    };
-
-    static constexpr bool IsValidCompilationParameter()
-    {
-        // TODO: properly implement this check
-        return true;
-    }
-
-    bool IsSupportedArgument(const ck::tensor_operation::device::BaseArgument*) override
-    {
-        return true;
-    }
-
-    static auto MakeArgument(const Tensor<ADataType>& a_ms_ks,
-                             const Tensor<BDataType>& b_ns_ks,
-                             Tensor<EDataType>& e_ms_ns,
-                             AElementwiseOperation a_element_op,
-                             BElementwiseOperation b_element_op,
-                             CDEElementwiseOperation cde_element_op)
-    {
-        return Argument{a_ms_ks, b_ns_ks, e_ms_ns, a_element_op, b_element_op, cde_element_op};
-    }
-
-    static auto MakeInvoker() { return Invoker{}; }
-
-    virtual std::unique_ptr<ck::tensor_operation::device::BaseInvoker> MakeInvokerPointer()
-    {
-        return std::make_unique<Invoker>(Invoker{});
-    }
-
-    std::string GetTypeString() const override
-    {
-        auto str = std::stringstream();
-
-        // clang-format off
-        str << "ReferenceContraction_M2_N2_K2"
-            << std::endl;
-        // clang-format on
-
-        return str.str();
-    }
-};
-
 int main(int argc, char* argv[])
 {
     bool do_verification = true;
@@ -368,22 +234,23 @@ int main(int argc, char* argv[])
     {
         Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
 
-        using ReferenceOpInstance = ReferenceContraction_M2_N2_K2<NumDimM,
-                                                                  NumDimN,
-                                                                  NumDimK,
-                                                                  ADataType,
-                                                                  BDataType,
-                                                                  CShuffleDataType,
-                                                                  AccDataType,
-                                                                  AElementOp,
-                                                                  BElementOp,
-                                                                  PassThrough>;
-
-        auto ref_gemm    = ReferenceOpInstance{};
-        auto ref_invoker = ref_gemm.MakeInvoker();
-
-        auto ref_argument = ref_gemm.MakeArgument(
-            a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op, PassThrough{});
+        using ReferenceOpInstance =
+            ck::tensor_operation::host::ReferenceContraction_M2_N2_K2<NumDimM,
+                                                                      NumDimN,
+                                                                      NumDimK,
+                                                                      ADataType,
+                                                                      BDataType,
+                                                                      CShuffleDataType,
+                                                                      AccDataType,
+                                                                      AElementOp,
+                                                                      BElementOp>;
+
+        auto ref_op      = ReferenceOpInstance{};
+        auto ref_invoker = ref_op.MakeInvoker();
+
+        Tensor<float> empty_tensor(std::vector<ck::index_t>{}, std::vector<ck::index_t>{});
+        auto ref_argument =
+            ref_op.MakeArgument(a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op);
 
         ref_invoker.Run(ref_argument);
 
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_contraction.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_contraction.hpp
new file mode 100644
index 000000000..92a8c82a6
--- /dev/null
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_contraction.hpp
@@ -0,0 +1,146 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace host {
+
+// Host (CPU) reference contraction; hardcoded for NumDimM == NumDimN == NumDimK == 2
+template <ck::index_t NumDimM,
+          ck::index_t NumDimN,
+          ck::index_t NumDimK,
+          typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename AccDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          ck::enable_if_t<NumDimM == 2 && NumDimN == 2 && NumDimK == 2, bool> = false>
+struct ReferenceContraction_M2_N2_K2 : public ck::tensor_operation::device::BaseOperator
+{
+    // Argument: A and B are held by const reference, C by mutable reference; ops are copied
+    struct Argument : public ck::tensor_operation::device::BaseArgument
+    {
+        Argument(const Tensor<ADataType>& a_ms_ks,
+                 const Tensor<BDataType>& b_ns_ks,
+                 Tensor<CDataType>& c_ms_ns,
+                 AElementwiseOperation a_element_op,
+                 BElementwiseOperation b_element_op)
+            : a_ms_ks_{a_ms_ks},
+              b_ns_ks_{b_ns_ks},
+              c_ms_ns_{c_ms_ns},
+              a_element_op_{a_element_op},
+              b_element_op_{b_element_op}
+        {
+        }
+
+        const Tensor<ADataType>& a_ms_ks_;
+        const Tensor<BDataType>& b_ns_ks_;
+        Tensor<CDataType>& c_ms_ns_;
+
+        AElementwiseOperation a_element_op_;
+        BElementwiseOperation b_element_op_;
+    };
+
+    // Invoker: C(m0,m1,n0,n1) = sum_{k0,k1} a_op(A(m0,m1,k0,k1)) * b_op(B(n0,n1,k0,k1))
+    struct Invoker : public ck::tensor_operation::device::BaseInvoker
+    {
+        using Argument = ReferenceContraction_M2_N2_K2::Argument;
+
+        float Run(const Argument& arg)
+        {
+            auto f_ms_ns = [&](auto m0, auto m1, auto n0, auto n1) {
+                const ck::index_t K0 = arg.a_ms_ks_.mDesc.GetLengths()[2];
+                const ck::index_t K1 = arg.a_ms_ks_.mDesc.GetLengths()[3];
+
+                AccDataType v_acc = 0;
+
+                for(ck::index_t k0 = 0; k0 < K0; ++k0)
+                {
+                    for(ck::index_t k1 = 0; k1 < K1; ++k1)
+                    {
+                        AccDataType v_a;
+                        AccDataType v_b;
+
+                        arg.a_element_op_(
+                            v_a, ck::type_convert<const AccDataType>(arg.a_ms_ks_(m0, m1, k0, k1)));
+                        arg.b_element_op_(
+                            v_b, ck::type_convert<const AccDataType>(arg.b_ns_ks_(n0, n1, k0, k1)));
+
+                        v_acc += v_a * v_b;
+                    }
+                }
+
+                arg.c_ms_ns_(m0, m1, n0, n1) = v_acc;
+            };
+
+            make_ParallelTensorFunctor(f_ms_ns,
+                                       arg.c_ms_ns_.mDesc.GetLengths()[0],
+                                       arg.c_ms_ns_.mDesc.GetLengths()[1],
+                                       arg.c_ms_ns_.mDesc.GetLengths()[2],
+                                       arg.c_ms_ns_.mDesc.GetLengths()[3])(
+                std::thread::hardware_concurrency());
+
+            return 0; // nothing is timed here; 0 is always returned
+        }
+
+        float Run(const ck::tensor_operation::device::BaseArgument* p_arg,
+                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg)); // no null check on the cast result
+        }
+    };
+
+    static constexpr bool IsValidCompilationParameter()
+    {
+        // TODO: properly implement this check
+        return true;
+    }
+
+    bool IsSupportedArgument(const ck::tensor_operation::device::BaseArgument*) override
+    {
+        return true; // accepts any argument unconditionally
+    }
+
+    static auto MakeArgument(const Tensor<ADataType>& a_ms_ks,
+                             const Tensor<BDataType>& b_ns_ks,
+                             Tensor<CDataType>& c_ms_ns,
+                             AElementwiseOperation a_element_op,
+                             BElementwiseOperation b_element_op)
+    {
+        return Argument{a_ms_ks, b_ns_ks, c_ms_ns, a_element_op, b_element_op};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+
+    virtual std::unique_ptr<ck::tensor_operation::device::BaseInvoker> MakeInvokerPointer()
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        // clang-format off
+        str << "ReferenceContraction_M2_N2_K2"
+            << std::endl;
+        // clang-format on
+
+        return str.str();
+    }
+};
+
+} // namespace host
+} // namespace tensor_operation
+} // namespace ck
diff --git a/profiler/README.md b/profiler/README.md
index bfd6a3a53..400a64a39 100644
--- a/profiler/README.md
+++ b/profiler/README.md
@@ -46,3 +46,33 @@ out_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
 ....
 Best Perf: 1.42509 ms, 102.988 TFlops, 234.086 GB/s
 ```
+
+## Profile contraction kernels
+```bash
+#arg1: tensor operation (contraction_bilinear=CONTRACTION+Bilinear)
+#arg2: data type (0: fp32; 1: f64)
+#arg3: matrix layout (0: A[m0, m1, k0, k1] * B[k0, k1, n0, n1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1];
+#                     1: A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1];
+#                     2: A[k0, k1, m0, m1] * B[k0, k1, n0, n1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1];
+#                     3: A[k0, k1, m0, m1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1])
+#arg4: verification (0: no; 1: yes)
+#arg5: initialization (0: no init; 1: integer value; 2: decimal value)
+#arg6: print tensor value (0: no; 1: yes)
+#arg7: time kernel (0: no, 1: yes)
+#arg8 and arg9: alpha and beta
+#arg10 to 15: M0, M1, N0, N1, K0, K1
+#arg16 to 31: Strides for A, B, E and D (skip for default)
+
+################                   op  datatype  layout  verify  init  log  time  alpha  beta  M0  M1  N0  N1  K0  K1
+./bin/ckProfiler contraction_bilinear         0       1       0     0    0     1    1.0   1.0 128 128 128 128 128 128
+```
+
+Result (MI100)
+```bash
+a_m_k: dim 4, lengths {128, 128, 128, 128}, strides {2097152, 16384, 128, 1}
+b_k_n: dim 4, lengths {128, 128, 128, 128}, strides {128, 1, 2097152, 16384}
+d_m_n: dim 4, lengths {128, 128, 128, 128}, strides {2097152, 16384, 128, 1}
+e_m_n: dim 4, lengths {128, 128, 128, 128}, strides {2097152, 16384, 128, 1}
+....
+Best Perf: 211.405 ms, 41.6077 TFlops, 15.2372 GB/s
+```
diff --git a/profiler/include/profiler/profile_contraction_impl.hpp b/profiler/include/profiler/profile_contraction_impl.hpp
new file mode 100644
index 000000000..660cc3f9e
--- /dev/null
+++ b/profiler/include/profiler/profile_contraction_impl.hpp
@@ -0,0 +1,345 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iomanip>
+#include <iostream>
+#include <typeinfo>
+#include <limits>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction_scale.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_contraction.hpp"
+
+#include "ck/host_utility/io.hpp"
+
+namespace ck {
+namespace profiler {
+
+using Bilinear = ck::tensor_operation::element_wise::Bilinear;
+using Scale    = ck::tensor_operation::element_wise::Scale;
+
+// Profiles all registered contraction instances; returns 1 if no verification failed, else 0.
+template <typename ALayout,
+          typename BLayout,
+          typename CDELayout,
+          typename DataType,
+          typename DTupleDataType,
+          typename CDElementOp>
+int profile_contraction_impl(ck::index_t do_verification,
+                             ck::index_t init_method,
+                             bool do_log,
+                             bool time_kernel,
+                             CDElementOp cde_element_op,
+                             const std::vector<ck::index_t>& M,
+                             const std::vector<ck::index_t>& N,
+                             const std::vector<ck::index_t>& K,
+                             const std::vector<ck::index_t>& StridesA,
+                             const std::vector<ck::index_t>& StridesB,
+                             const std::vector<ck::index_t>& StridesE,
+                             const std::vector<ck::index_t>& StridesD)
+{
+    bool pass = true;
+
+    auto f_host_tensor_descriptor = [](const std::vector<ck::index_t>& dims01,
+                                       const std::vector<ck::index_t>& dims23,
+                                       const std::vector<ck::index_t>& strides) {
+        std::vector<std::size_t> dims_szt(dims01.begin(), dims01.end());
+        dims_szt.insert(dims_szt.end(), dims23.begin(), dims23.end());
+        return HostTensorDescriptor(dims_szt, strides);
+    };
+
+    Tensor<DataType> a_m_k(f_host_tensor_descriptor(M, K, StridesA));
+    Tensor<DataType> b_k_n(f_host_tensor_descriptor(K, N, StridesB));
+    Tensor<DataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StridesE));
+    Tensor<DataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StridesE));
+    Tensor<DataType> d_m_n(f_host_tensor_descriptor(M, N, StridesD));
+    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
+    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+    std::cout << "d_m_n: " << d_m_n.mDesc << std::endl;
+    std::cout << "e_m_n: " << e_m_n_device_result.mDesc << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<DataType>{-5, 5});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<DataType>{-5, 5});
+        d_m_n.GenerateTensorValue(GeneratorTensor_2<DataType>{-5, 5});
+        break;
+    default:
+        a_m_k.GenerateTensorValue(GeneratorTensor_3<DataType>{0.0, 1.0});
+        b_k_n.GenerateTensorValue(GeneratorTensor_3<DataType>{-0.5, 0.5});
+        d_m_n.GenerateTensorValue(GeneratorTensor_3<DataType>{-0.5, 0.5});
+    }
+
+    using AElementOp = ck::tensor_operation::element_wise::PassThrough;
+    using BElementOp = ck::tensor_operation::element_wise::PassThrough;
+
+    DeviceMem a_device_buf(sizeof(DataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(DataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(DataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
+    DeviceMem d_device_buf(sizeof(DataType) * d_m_n.mDesc.GetElementSpaceSize());
+
+    a_device_buf.ToDevice(a_m_k.mData.data());
+    b_device_buf.ToDevice(b_k_n.mData.data());
+    e_device_buf.SetZero();
+    d_device_buf.ToDevice(d_m_n.mData.data());
+    // NOTE(review): b_k_n uses lengths {K, N} but b_ns_ks_lengths below is {N, K} - confirm
+    const std::vector<index_t> a_ms_ks_lengths = {M[0], M[1], K[0], K[1]};
+    const std::vector<index_t> b_ns_ks_lengths = {N[0], N[1], K[0], K[1]};
+    const std::vector<index_t> e_ms_ns_lengths = {M[0], M[1], N[0], N[1]};
+    const std::vector<index_t> d_m_n_lengths   = {M[0], M[1], N[0], N[1]};
+
+    const auto a_element_op = AElementOp{};
+    const auto b_element_op = BElementOp{};
+
+    constexpr ck::index_t NumDim = 2;
+    using DeviceOp               = ck::tensor_operation::device::DeviceContractionMultipleD<NumDim,
+                                                                              NumDim,
+                                                                              NumDim,
+                                                                              DataType,
+                                                                              DataType,
+                                                                              DTupleDataType,
+                                                                              DataType,
+                                                                              AElementOp,
+                                                                              BElementOp,
+                                                                              CDElementOp>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    // Run reference op: host contraction C = A * B, then E = cde_element_op(C[, D])
+    if(do_verification)
+    {
+        using ReferenceGemmInstance =
+            ck::tensor_operation::host::ReferenceContraction_M2_N2_K2<NumDim,
+                                                                      NumDim,
+                                                                      NumDim,
+                                                                      DataType,
+                                                                      DataType,
+                                                                      DataType,
+                                                                      DataType,
+                                                                      AElementOp,
+                                                                      BElementOp>;
+
+        auto ref_op      = ReferenceGemmInstance{};
+        auto ref_invoker = ref_op.MakeInvoker();
+
+        Tensor<DataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StridesE));
+
+        auto ref_argument =
+            ref_op.MakeArgument(a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op);
+
+        ref_invoker.Run(ref_argument);
+
+        for(size_t m0 = 0; m0 < e_m_n_host_result.mDesc.GetLengths()[0]; ++m0)
+        {
+            for(size_t m1 = 0; m1 < e_m_n_host_result.mDesc.GetLengths()[1]; ++m1)
+            {
+                for(size_t n0 = 0; n0 < e_m_n_host_result.mDesc.GetLengths()[2]; ++n0)
+                {
+                    for(size_t n1 = 0; n1 < e_m_n_host_result.mDesc.GetLengths()[3]; ++n1)
+                    {
+                        if constexpr(is_same<CDElementOp, Bilinear>::value)
+                        {
+                            cde_element_op(e_m_n_host_result(m0, m1, n0, n1),
+                                           c_m_n_host_result(m0, m1, n0, n1),
+                                           d_m_n(m0, m1, n0, n1));
+                        }
+                        else if constexpr(is_same<CDElementOp, Scale>::value)
+                        {
+                            cde_element_op(e_m_n_host_result(m0, m1, n0, n1),
+                                           c_m_n_host_result(m0, m1, n0, n1));
+                        }
+                        else
+                        {
+                            static_assert(sizeof(CDElementOp) == 0,
+                                          "Unsupported CDElementOp in contraction profiler.");
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    std::string best_op_name;
+    float best_avg_time   = 0;
+    float best_tflops     = 0;
+    float best_gb_per_sec = 0;
+
+    // profile device op instances
+    for(auto& op_ptr : op_ptrs)
+    {
+        std::unique_ptr<tensor_operation::device::BaseArgument> argument_ptr;
+        if constexpr(is_same<CDElementOp, Bilinear>::value)
+        {
+            argument_ptr = op_ptr->MakeArgumentPointer(
+                static_cast<DataType*>(a_device_buf.GetDeviceBuffer()),
+                static_cast<DataType*>(b_device_buf.GetDeviceBuffer()),
+                std::array<const void*, 1>{d_device_buf.GetDeviceBuffer()},
+                static_cast<DataType*>(e_device_buf.GetDeviceBuffer()),
+                a_ms_ks_lengths,
+                StridesA,
+                b_ns_ks_lengths,
+                StridesB,
+                std::array<std::vector<ck::index_t>, 1>{d_m_n_lengths},
+                std::array<std::vector<ck::index_t>, 1>{StridesD},
+                e_ms_ns_lengths,
+                StridesE,
+                a_element_op,
+                b_element_op,
+                cde_element_op);
+        }
+        else if constexpr(is_same<CDElementOp, Scale>::value)
+        {
+            argument_ptr =
+                op_ptr->MakeArgumentPointer(static_cast<DataType*>(a_device_buf.GetDeviceBuffer()),
+                                            static_cast<DataType*>(b_device_buf.GetDeviceBuffer()),
+                                            std::array<const void*, 0>{},
+                                            static_cast<DataType*>(e_device_buf.GetDeviceBuffer()),
+                                            a_ms_ks_lengths,
+                                            StridesA,
+                                            b_ns_ks_lengths,
+                                            StridesB,
+                                            std::array<std::vector<ck::index_t>, 0>{},
+                                            std::array<std::vector<ck::index_t>, 0>{},
+                                            e_ms_ns_lengths,
+                                            StridesE,
+                                            a_element_op,
+                                            b_element_op,
+                                            cde_element_op);
+        }
+        else
+        {
+            static_assert(sizeof(CDElementOp) == 0,
+                          "Unsupported CDElementOp in contraction profiler.");
+        }
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        auto nelems_m = M[0] * M[1];
+        auto nelems_n = N[0] * N[1];
+        auto nelems_k = K[0] * K[1];
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            // re-init C to zero before profiling next kernel
+            e_device_buf.SetZero();
+
+            std::string op_name = op_ptr->GetTypeString();
+
+            float avg_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+
+            std::size_t flop = std::size_t(2) * nelems_m * nelems_n * nelems_k;
+
+            std::size_t num_btype = sizeof(DataType) * nelems_m * nelems_k +
+                                    sizeof(DataType) * nelems_k * nelems_n +
+                                    sizeof(DataType) * nelems_m * nelems_n;
+
+            float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
+
+            float gb_per_sec = num_btype / 1.E6 / avg_time;
+
+            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
+                      << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+            if(tflops > best_tflops)
+            {
+                best_op_name    = op_name;
+                best_tflops     = tflops;
+                best_avg_time   = avg_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+
+            if(do_verification)
+            {
+                e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+
+                float threshold =
+                    static_cast<DataType>(nelems_k) * std::numeric_limits<DataType>::epsilon();
+                pass = pass & ck::utils::check_err(e_m_n_device_result,
+                                                   e_m_n_host_result,
+                                                   "Error: incorrect results!",
+                                                   threshold,
+                                                   threshold);
+
+                if(do_log)
+                {
+                    LogRangeAsType<float>(std::cout << "a : ", a_m_k.mData, ",") << std::endl;
+                    LogRangeAsType<float>(std::cout << "b: ", b_k_n.mData, ",") << std::endl;
+                    LogRangeAsType<float>(std::cout << "c_host  : ", e_m_n_host_result.mData, ",")
+                        << std::endl;
+                    LogRangeAsType<float>(std::cout << "c_device: ", e_m_n_device_result.mData, ",")
+                        << std::endl;
+                }
+            }
+        }
+        else
+        {
+            std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
+        }
+    }
+
+    if constexpr(is_same<DataType, float>::value)
+    {
+        std::cout << "Best Perf for datatype = f32";
+    }
+    else if constexpr(is_same<DataType, double>::value)
+    {
+        std::cout << "Best Perf for datatype = f64";
+    }
+
+    if constexpr(is_same<ALayout, tensor_layout::gemm::RowMajor>::value)
+    {
+        std::cout << " ALayout =  RowMajor";
+    }
+    else if constexpr(is_same<ALayout, tensor_layout::gemm::ColumnMajor>::value)
+    {
+        std::cout << " ALayout =  ColumnMajor";
+    }
+
+    if constexpr(is_same<BLayout, tensor_layout::gemm::RowMajor>::value)
+    {
+        std::cout << " BLayout =  RowMajor";
+    }
+    else if constexpr(is_same<BLayout, tensor_layout::gemm::ColumnMajor>::value)
+    {
+        std::cout << " BLayout =  ColumnMajor";
+    }
+
+    if constexpr(is_same<CDELayout, tensor_layout::gemm::RowMajor>::value)
+    {
+        std::cout << " CDELayout =  RowMajor";
+    }
+    else if constexpr(is_same<CDELayout, tensor_layout::gemm::ColumnMajor>::value)
+    {
+        std::cout << " CDELayout =  ColumnMajor";
+    }
+
+    std::cout << " M = " << M << " N = " << N << " K = " << K << " StridesA = " << StridesA
+              << " StridesB = " << StridesB << " StridesE = " << StridesE << " : " << best_avg_time
+              << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec << " GB/s, "
+              << best_op_name << std::endl;
+
+    return pass;
+}
+
+} // namespace profiler
+} // namespace ck
diff --git a/profiler/include/profiler/profile_contraction_utils.hpp b/profiler/include/profiler/profile_contraction_utils.hpp
new file mode 100644
index 000000000..076bbd455
--- /dev/null
+++ b/profiler/include/profiler/profile_contraction_utils.hpp
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <vector>
+
+#include "ck/ck.hpp"
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using Bilinear = ck::tensor_operation::element_wise::Bilinear;
+using Scale    = ck::tensor_operation::element_wise::Scale;
+
+enum struct ContractionMatrixLayout
+{
+    MK_KN_MN_MN, // 0: A[m0, m1, k0, k1] * B[k0, k1, n0, n1]
+    MK_NK_MN_MN, // 1: A[m0, m1, k0, k1] * B[n0, n1, k0, k1]
+    KM_KN_MN_MN, // 2: A[k0, k1, m0, m1] * B[k0, k1, n0, n1]
+    KM_NK_MN_MN, // 3: A[k0, k1, m0, m1] * B[n0, n1, k0, k1]
+};
+
+enum struct ContractionDataType
+{
+    F32_F32_F32_F32, // 0: fp32 (profiled with DataType = float)
+    F64_F64_F64_F64, // 1: f64 (profiled with DataType = double)
+};
+
+inline void collect_index_params(char* argv[],
+                                 std::vector<ck::index_t>& params,
+                                 const ck::index_t from,
+                                 const ck::index_t num)
+{
+    for(ck::index_t offset = 0; offset < num; ++offset) // appends argv[from .. from + num - 1]
+        params.push_back(std::stoi(argv[from + offset]));
+}
+
+// Default strides for row-major: {Dim1 * Dim2 * Dim3, Dim2 * Dim3, Dim3, 1}
+// Default strides for column-major: {Dim1, 1, Dim0 * Dim1 * Dim3, Dim0 * Dim1}
+inline void
+assign_default_strides(Row, std::vector<ck::index_t>& strides, const std::vector<ck::index_t>& dims)
+{
+    strides = {dims[1] * dims[2] * dims[3], dims[2] * dims[3], dims[3], 1};
+}
+
+inline void
+assign_default_strides(Col, std::vector<ck::index_t>& strides, const std::vector<ck::index_t>& dims)
+{
+    strides = {dims[1], 1, dims[0] * dims[1] * dims[3], dims[0] * dims[1]};
+}
diff --git a/profiler/src/CMakeLists.txt b/profiler/src/CMakeLists.txt
index c21fff7de..0a50eedb7 100644
--- a/profiler/src/CMakeLists.txt
+++ b/profiler/src/CMakeLists.txt
@@ -30,6 +30,8 @@ set(PROFILER_SOURCES
     profile_batchnorm_bwd.cpp
     profile_batchnorm_infer.cpp
     profile_grouped_gemm_fastgelu.cpp
+    profile_contraction_bilinear.cpp
+    profile_contraction_scale.cpp
 )
 
 set(PROFILER_EXECUTABLE ckProfiler)
@@ -70,4 +72,6 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_softmax_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_reduce_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batchnorm_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fastgelu_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_scale_instance)
 rocm_install(TARGETS ${PROFILER_EXECUTABLE} COMPONENT profiler)
diff --git a/profiler/src/profile_contraction_bilinear.cpp b/profiler/src/profile_contraction_bilinear.cpp
new file mode 100644
index 000000000..6ed184120
--- /dev/null
+++ b/profiler/src/profile_contraction_bilinear.cpp
@@ -0,0 +1,165 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <vector>
+
+#include "profiler/profile_contraction_impl.hpp"
+#include "profiler/profile_contraction_utils.hpp"
+#include "profiler_operation_registry.hpp"
+
+#define OP_NAME "contraction_bilinear"
+#define OP_DESC "CONTRACTION+Bilinear"
+
+// Prints the command-line usage of the contraction_bilinear profiler to stdout.
+static void print_helper_msg()
+{
+    std::cout << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
+              << "arg2: data type (0: fp32; 1: f64)\n"
+              << "arg3: matrix layout (0: A[m0, m1, k0, k1] * B[k0, k1, n0, n1] + "
+                 "D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n"
+              << "                     1: A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + "
+                 "D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n"
+              << "                     2: A[k0, k1, m0, m1] * B[k0, k1, n0, n1] + "
+                 "D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n"
+              << "                     3: A[k0, k1, m0, m1] * B[n0, n1, k0, k1] + "
+                 "D[m0, m1, n0, n1] = E[m0, m1, n0, n1])\n"
+              << "arg4: verification (0: no; 1: yes)\n"
+              << "arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"
+              << "arg6: print tensor value (0: no; 1: yes)\n"
+              << "arg7: time kernel (0: no, 1: yes)\n"
+              << "arg8 and arg9: alpha and beta\n"
+              << "arg10 to 15: M0, M1, N0, N1, K0, K1\n"
+              << "arg16 to 31: Strides for A, B, E and D (skip for default)\n"
+              << std::endl;
+}
+
+// CLI entry point for "contraction_bilinear": parses argv, then dispatches on data type/layout.
+int profile_contraction_bilinear(int argc, char* argv[])
+{
+    const bool default_strides = argc == 16;
+
+    if(argc != 32 && argc != 16)
+    {
+        print_helper_msg();
+        exit(1);
+    }
+
+    const auto data_type          = static_cast<ContractionDataType>(std::stoi(argv[2]));
+    const auto layout             = static_cast<ContractionMatrixLayout>(std::stoi(argv[3]));
+    const bool do_verification    = std::stoi(argv[4]);
+    const ck::index_t init_method = std::stoi(argv[5]);
+    const bool do_log             = std::stoi(argv[6]);
+    const bool time_kernel        = std::stoi(argv[7]);
+    const float alpha             = std::stof(argv[8]);
+    const float beta              = std::stof(argv[9]);
+
+    std::vector<ck::index_t> M;
+    std::vector<ck::index_t> N;
+    std::vector<ck::index_t> K;
+    const ck::index_t dims_arg_num = 10;
+    collect_index_params(argv, M, dims_arg_num, 2);
+    collect_index_params(argv, N, dims_arg_num + 2, 2);
+    collect_index_params(argv, K, dims_arg_num + 4, 2);
+
+    std::vector<ck::index_t> StridesA;
+    std::vector<ck::index_t> StridesB;
+    std::vector<ck::index_t> StridesE;
+    std::vector<ck::index_t> StridesD;
+    if(!default_strides)
+    {
+        collect_index_params(argv, StridesA, dims_arg_num + 6, 4);
+        collect_index_params(argv, StridesB, dims_arg_num + 10, 4);
+        collect_index_params(argv, StridesE, dims_arg_num + 14, 4);
+        collect_index_params(argv, StridesD, dims_arg_num + 18, 4);
+    }
+
+    using F32 = float;
+    using F64 = double;
+
+    auto profile = [&](auto a_layout, auto b_layout, auto cde_layout, auto type) {
+        using ALayout   = decltype(a_layout);
+        using BLayout   = decltype(b_layout);
+        using CDELayout = decltype(cde_layout);
+
+        using DataType = decltype(type);
+
+        if(default_strides)
+        {
+            assign_default_strides(a_layout, StridesA, {M[0], M[1], K[0], K[1]});
+            assign_default_strides(b_layout, StridesB, {K[0], K[1], N[0], N[1]});
+            assign_default_strides(cde_layout, StridesE, {M[0], M[1], N[0], N[1]});
+            assign_default_strides(cde_layout, StridesD, {M[0], M[1], N[0], N[1]});
+        }
+        bool pass = ck::profiler::profile_contraction_impl<ALayout,
+                                                           BLayout,
+                                                           CDELayout,
+                                                           DataType,
+                                                           ck::Tuple<DataType>,
+                                                           Bilinear>(do_verification,
+                                                                     init_method,
+                                                                     do_log,
+                                                                     time_kernel,
+                                                                     Bilinear{alpha, beta},
+                                                                     M,
+                                                                     N,
+                                                                     K,
+                                                                     StridesA,
+                                                                     StridesB,
+                                                                     StridesE,
+                                                                     StridesD);
+
+        return pass ? 0 : 1; // 0 on success, consistent with the error path returning 1
+    };
+
+    if(data_type == ContractionDataType::F32_F32_F32_F32 &&
+       layout == ContractionMatrixLayout::MK_KN_MN_MN)
+    {
+        return profile(Row{}, Row{}, Row{}, F32{});
+    }
+    else if(data_type == ContractionDataType::F32_F32_F32_F32 &&
+            layout == ContractionMatrixLayout::MK_NK_MN_MN)
+    {
+        return profile(Row{}, Col{}, Row{}, F32{});
+    }
+    else if(data_type == ContractionDataType::F32_F32_F32_F32 &&
+            layout == ContractionMatrixLayout::KM_KN_MN_MN)
+    {
+        return profile(Col{}, Row{}, Row{}, F32{});
+    }
+    else if(data_type == ContractionDataType::F32_F32_F32_F32 &&
+            layout == ContractionMatrixLayout::KM_NK_MN_MN)
+    {
+        return profile(Col{}, Col{}, Row{}, F32{});
+    }
+    else if(data_type == ContractionDataType::F64_F64_F64_F64 &&
+            layout == ContractionMatrixLayout::MK_KN_MN_MN)
+    {
+        return profile(Row{}, Row{}, Row{}, F64{});
+    }
+    else if(data_type == ContractionDataType::F64_F64_F64_F64 &&
+            layout == ContractionMatrixLayout::MK_NK_MN_MN)
+    {
+        return profile(Row{}, Col{}, Row{}, F64{});
+    }
+    else if(data_type == ContractionDataType::F64_F64_F64_F64 &&
+            layout == ContractionMatrixLayout::KM_KN_MN_MN)
+    {
+        return profile(Col{}, Row{}, Row{}, F64{});
+    }
+    else if(data_type == ContractionDataType::F64_F64_F64_F64 &&
+            layout == ContractionMatrixLayout::KM_NK_MN_MN)
+    {
+        return profile(Col{}, Col{}, Row{}, F64{});
+    }
+    else
+    {
+        std::cout << "this data_type & layout is not implemented" << std::endl;
+        return 1;
+    }
+}
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_contraction_bilinear);
diff --git a/profiler/src/profile_contraction_scale.cpp b/profiler/src/profile_contraction_scale.cpp
new file mode 100644
index 000000000..6784b916f
--- /dev/null
+++ b/profiler/src/profile_contraction_scale.cpp
@@ -0,0 +1,162 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <vector>
+
+#include "profiler/profile_contraction_impl.hpp"
+#include "profiler/profile_contraction_utils.hpp"
+#include "profiler_operation_registry.hpp"
+
+#define OP_NAME "contraction_scale"
+#define OP_DESC "CONTRACTION+Scale"
+
+static void print_helper_msg() // Prints the expected command-line arguments to stdout.
+{
+    std::cout << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
+              << "arg2: data type (0: fp32; 1: f64)\n"
+              << "arg3: matrix layout (0: A[m0, m1, k0, k1] * B[k0, k1, n0, n1] + "
+                 "D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n"
+              << "                     1: A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + "
+                 "D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n"
+              << "                     2: A[k0, k1, m0, m1] * B[k0, k1, n0, n1] + "
+                 "D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n"
+              << "                     3: A[k0, k1, m0, m1] * B[n0, n1, k0, k1] + "
+                 "D[m0, m1, n0, n1] = E[m0, m1, n0, n1])\n"
+              << "arg4: verification (0: no; 1: yes)\n"
+              << "arg5: initialization (0: no init; 1: integer value; 2: decimal "
+              << "value)\n"
+              << "arg6: print tensor value (0: no; 1: yes)\n"
+              << "arg7: time kernel (0: no, 1: yes)\n"
+              << "arg8: alpha\n"
+              << "arg9 to 14: M0, M1, N0, N1, K0, K1\n"
+              // Listed in the order profile_contraction_scale reads them: A, B, E, then D.
+              << "arg15 to 30: Strides for A, B, E and D (skip for default)\n" << std::endl;
+}
+
+int profile_contraction_scale(int argc, char* argv[]) // returns 0 on success, 1 on failure
+{
+    const bool default_strides = argc == 15; // no stride arguments were supplied
+
+    if(argc != 31 && argc != 15) // only "no strides" or "all 16 strides" is accepted
+    {
+        print_helper_msg();
+        exit(1);
+    }
+
+    const auto data_type          = static_cast<ContractionDataType>(std::stoi(argv[2]));
+    const auto layout             = static_cast<ContractionMatrixLayout>(std::stoi(argv[3]));
+    const bool do_verification    = std::stoi(argv[4]);
+    const ck::index_t init_method = std::stoi(argv[5]);
+    const bool do_log             = std::stoi(argv[6]);
+    const bool time_kernel        = std::stoi(argv[7]);
+    const float alpha             = std::stof(argv[8]);
+
+    std::vector<ck::index_t> M;
+    std::vector<ck::index_t> N;
+    std::vector<ck::index_t> K;
+    const ck::index_t dims_arg_num = 9; // argv index of the first length (M0)
+    collect_index_params(argv, M, dims_arg_num, 2);
+    collect_index_params(argv, N, dims_arg_num + 2, 2);
+    collect_index_params(argv, K, dims_arg_num + 4, 2);
+
+    std::vector<ck::index_t> StridesA;
+    std::vector<ck::index_t> StridesB;
+    std::vector<ck::index_t> StridesE;
+    std::vector<ck::index_t> StridesD;
+    if(!default_strides)
+    {
+        collect_index_params(argv, StridesA, dims_arg_num + 6, 4);
+        collect_index_params(argv, StridesB, dims_arg_num + 10, 4);
+        collect_index_params(argv, StridesE, dims_arg_num + 14, 4); // E is read before D
+        collect_index_params(argv, StridesD, dims_arg_num + 18, 4);
+    }
+
+    using F32 = float;
+    using F64 = double;
+
+    // Runs the profiler for one (layouts, data type) combination; yields an exit code.
+    auto profile = [&](auto a_layout, auto b_layout, auto cde_layout, auto type) {
+        using ALayout   = decltype(a_layout);
+        using BLayout   = decltype(b_layout);
+        using CDELayout = decltype(cde_layout);
+
+        using DataType = decltype(type);
+
+        if(default_strides) // strides depend on the layout, so they are filled in here
+        {
+            assign_default_strides(a_layout, StridesA, {M[0], M[1], K[0], K[1]});
+            assign_default_strides(b_layout, StridesB, {K[0], K[1], N[0], N[1]});
+            assign_default_strides(cde_layout, StridesE, {M[0], M[1], N[0], N[1]});
+            assign_default_strides(cde_layout, StridesD, {M[0], M[1], N[0], N[1]});
+        }
+
+        bool pass = ck::profiler::
+            profile_contraction_impl<ALayout, BLayout, CDELayout, DataType, ck::Tuple<>, Scale>(
+                do_verification,
+                init_method,
+                do_log,
+                time_kernel,
+                Scale{alpha},
+                M,
+                N,
+                K,
+                StridesA,
+                StridesB,
+                StridesE,
+                StridesD);
+
+        return pass ? 0 : 1; // 0 = success, so a pass is distinguishable from the error paths
+    };
+
+    if(data_type == ContractionDataType::F32_F32_F32_F32 &&
+       layout == ContractionMatrixLayout::MK_KN_MN_MN)
+    {
+        return profile(Row{}, Row{}, Row{}, F32{});
+    }
+    else if(data_type == ContractionDataType::F32_F32_F32_F32 &&
+            layout == ContractionMatrixLayout::MK_NK_MN_MN)
+    {
+        return profile(Row{}, Col{}, Row{}, F32{});
+    }
+    else if(data_type == ContractionDataType::F32_F32_F32_F32 &&
+            layout == ContractionMatrixLayout::KM_KN_MN_MN)
+    {
+        return profile(Col{}, Row{}, Row{}, F32{});
+    }
+    else if(data_type == ContractionDataType::F32_F32_F32_F32 &&
+            layout == ContractionMatrixLayout::KM_NK_MN_MN)
+    {
+        return profile(Col{}, Col{}, Row{}, F32{});
+    }
+    else if(data_type == ContractionDataType::F64_F64_F64_F64 &&
+            layout == ContractionMatrixLayout::MK_KN_MN_MN)
+    {
+        return profile(Row{}, Row{}, Row{}, F64{});
+    }
+    else if(data_type == ContractionDataType::F64_F64_F64_F64 &&
+            layout == ContractionMatrixLayout::MK_NK_MN_MN)
+    {
+        return profile(Row{}, Col{}, Row{}, F64{});
+    }
+    else if(data_type == ContractionDataType::F64_F64_F64_F64 &&
+            layout == ContractionMatrixLayout::KM_KN_MN_MN)
+    {
+        return profile(Col{}, Row{}, Row{}, F64{});
+    }
+    else if(data_type == ContractionDataType::F64_F64_F64_F64 &&
+            layout == ContractionMatrixLayout::KM_NK_MN_MN)
+    {
+        return profile(Col{}, Col{}, Row{}, F64{});
+    }
+    else
+    {
+        std::cout << "this data_type & layout is not implemented" << std::endl;
+
+        return 1;
+    }
+}
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_contraction_scale);
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 6f43e5235..4f212d53a 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -56,6 +56,7 @@ add_subdirectory(normalization)
 add_subdirectory(data_type)
 add_subdirectory(elementwise_normalization)
 add_subdirectory(batchnorm)
+add_subdirectory(contraction)
 if(GPU_TARGETS MATCHES "gfx1100")
     add_subdirectory(wmma_op)
 endif()
diff --git a/test/contraction/CMakeLists.txt b/test/contraction/CMakeLists.txt
new file mode 100644
index 000000000..481717000
--- /dev/null
+++ b/test/contraction/CMakeLists.txt
@@ -0,0 +1,4 @@
+add_gtest_executable(test_contraction test_contraction.cpp)
+add_gtest_executable(test_contraction_interface test_contraction_interface.cpp)
+target_link_libraries(test_contraction PRIVATE utility device_contraction_bilinear_instance device_contraction_scale_instance)
+target_link_libraries(test_contraction_interface PRIVATE utility device_contraction_bilinear_instance device_contraction_scale_instance)
diff --git a/test/contraction/test_contraction.cpp b/test/contraction/test_contraction.cpp
new file mode 100644
index 000000000..c86b84923
--- /dev/null
+++ b/test/contraction/test_contraction.cpp
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+#include <iostream>
+#include <memory>
+#include <initializer_list>
+#include <vector>
+#include <tuple>
+#include <gtest/gtest.h>
+
+#include "profiler/profile_contraction_impl.hpp"
+
+using F32 = float;
+using F64 = double;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using Bilinear = ck::tensor_operation::element_wise::Bilinear;
+using Scale    = ck::tensor_operation::element_wise::Scale;
+
+struct MemoryParams // One test problem: lengths and strides forwarded to the profiler.
+{
+    std::vector<ck::index_t> M; // forwarded as the M lengths
+    std::vector<ck::index_t> N; // forwarded as the N lengths
+    std::vector<ck::index_t> K; // forwarded as the K lengths
+    std::vector<ck::index_t> StridesA;
+    std::vector<ck::index_t> StridesB;
+    std::vector<ck::index_t> StridesC; // passed in the slot the scale profiler calls StridesE
+    std::vector<ck::index_t> StridesD;
+};
+
+template <typename Tuple> // Tuple = <ALayout, BLayout, CDLayout, DataType, DTupleDataType, Op>
+class TestContraction : public ::testing::Test
+{
+    protected:
+    using ALayout        = std::tuple_element_t<0, Tuple>;
+    using BLayout        = std::tuple_element_t<1, Tuple>;
+    using CDLayout       = std::tuple_element_t<2, Tuple>;
+    using DataType       = std::tuple_element_t<3, Tuple>;
+    using DTupleDataType = std::tuple_element_t<4, Tuple>;
+    using CDElementOp    = std::tuple_element_t<5, Tuple>;
+
+    std::vector<MemoryParams> list_of_memory_params = {{{32, 32}, // case 1: M
+                                                        {32, 32}, // N
+                                                        {32, 32}, // K
+                                                        {32768, 1024, 32, 1}, // StridesA
+                                                        {32768, 1024, 32, 1}, // StridesB
+                                                        {32768, 1024, 32, 1}, // StridesC
+                                                        {32768, 1024, 32, 1}}, // StridesD
+                                                       {{16, 16}, // case 2: M
+                                                        {32, 32}, // N
+                                                        {16, 16}, // K
+                                                        {4096, 256, 16, 1}, // StridesA
+                                                        {16, 1, 8192, 256}, // StridesB
+                                                        {16384, 1024, 32, 1}, // StridesC
+                                                        {16384, 1024, 32, 1}}}; // StridesD
+
+    std::vector<ck::index_t> init_methods = {0, 1, 2}; // every value is tried by Run()
+    std::unique_ptr<CDElementOp> p_cd_element_op; // must be set by the test before Run()
+    void Run() // profiles every (memory params, init method) pair with verification enabled
+    {
+        for(auto& memory_params : list_of_memory_params)
+        {
+            for(const ck::index_t init_method : init_methods)
+            {
+                bool pass =
+                    ck::profiler::profile_contraction_impl<ALayout,
+                                                           BLayout,
+                                                           CDLayout,
+                                                           DataType,
+                                                           DTupleDataType,
+                                                           CDElementOp>(true /*do_verification*/,
+                                                                        init_method,
+                                                                        false /*do_logs*/,
+                                                                        false /*time_kernel*/,
+                                                                        *p_cd_element_op,
+                                                                        memory_params.M,
+                                                                        memory_params.N,
+                                                                        memory_params.K,
+                                                                        memory_params.StridesA,
+                                                                        memory_params.StridesB,
+                                                                        memory_params.StridesC,
+                                                                        memory_params.StridesD);
+                EXPECT_TRUE(pass); // non-fatal: remaining combinations still run
+            }
+        }
+    }
+};
+
+template <typename Tuple>
+class TestContractionScale : public TestContraction<Tuple> // typed over ScaleKernelTypes
+{
+};
+
+template <typename Tuple>
+class TestContractionBilinear : public TestContraction<Tuple> // typed over BilinearKernelTypes
+{
+};
+
+using BilinearKernelTypes = // <ALayout, BLayout, CDLayout, DataType, DTupleDataType, Op>
+    ::testing::Types<std::tuple<Row, Row, Row, F32, ck::Tuple<F32>, Bilinear>,
+                     std::tuple<Row, Col, Row, F32, ck::Tuple<F32>, Bilinear>,
+                     std::tuple<Col, Row, Row, F32, ck::Tuple<F32>, Bilinear>,
+                     std::tuple<Col, Col, Row, F32, ck::Tuple<F32>, Bilinear>,
+                     std::tuple<Row, Row, Row, F64, ck::Tuple<F64>, Bilinear>, // D matches F64
+                     std::tuple<Row, Col, Row, F64, ck::Tuple<F64>, Bilinear>,
+                     std::tuple<Col, Row, Row, F64, ck::Tuple<F64>, Bilinear>,
+                     std::tuple<Col, Col, Row, F64, ck::Tuple<F64>, Bilinear>>;
+
+using ScaleKernelTypes = ::testing::Types<std::tuple<Row, Row, Row, F32, ck::Tuple<>, Scale>,
+                                          std::tuple<Row, Col, Row, F32, ck::Tuple<>, Scale>,
+                                          std::tuple<Col, Row, Row, F32, ck::Tuple<>, Scale>,
+                                          std::tuple<Col, Col, Row, F32, ck::Tuple<>, Scale>,
+                                          std::tuple<Row, Row, Row, F64, ck::Tuple<>, Scale>,
+                                          std::tuple<Row, Col, Row, F64, ck::Tuple<>, Scale>,
+                                          std::tuple<Col, Row, Row, F64, ck::Tuple<>, Scale>,
+                                          std::tuple<Col, Col, Row, F64, ck::Tuple<>, Scale>>; // D tuple is empty for every Scale case
+
+TYPED_TEST_SUITE(TestContractionBilinear, BilinearKernelTypes);
+TYPED_TEST_SUITE(TestContractionScale, ScaleKernelTypes);
+
+TYPED_TEST(TestContractionBilinear, bilinear) // runs the full sweep with two coefficient sets
+{
+    this->p_cd_element_op = std::make_unique<Bilinear>(1.f, 1.f); // presumably (alpha, beta) - TODO confirm
+    this->Run();
+    this->p_cd_element_op = std::make_unique<Bilinear>(-0.5f, 0.5f); // mixed-sign coefficients
+    this->Run();
+}
+
+TYPED_TEST(TestContractionScale, scale) // runs the full sweep with two scale factors
+{
+    this->p_cd_element_op = std::make_unique<Scale>(1.f); // scale factor 1
+    this->Run();
+    this->p_cd_element_op = std::make_unique<Scale>(0.5f); // scale factor 0.5
+    this->Run();
+}
diff --git a/test/contraction/test_contraction_interface.cpp b/test/contraction/test_contraction_interface.cpp
new file mode 100644
index 000000000..c9e720c59
--- /dev/null
+++ b/test/contraction/test_contraction_interface.cpp
@@ -0,0 +1,195 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <stdexcept>
+#include <vector>
+
+#include "gtest/gtest.h"
+
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp"
+
+#include "ck/library/utility/device_memory.hpp"
+
+using Pass     = ck::tensor_operation::element_wise::PassThrough;
+using Bilinear = ck::tensor_operation::element_wise::Bilinear;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using F32 = float;
+using F64 = double;
+
+template <ck::index_t ABlockTransferSrcVectorDim, // forwarded to the A SrcVectorDim column below
+          ck::index_t BBlockTransferSrcVectorDim, // forwarded to the B SrcVectorDim column below
+          ck::index_t CDEBlockTransferScalarPerVector> // forwarded to the last column below
+class ContractionInstanceWrapper
+{
+    public:
+    static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+    static constexpr ck::index_t NumDim = 2; // used for NumDimM, NumDimN and NumDimK alike
+    // clang-format off
+    using ContractionDeviceInstance = ck::tensor_operation::device::
+        //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle|         DsData| EData|            A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer|             ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|             BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|                  CBlockTransfer|
+        //#####################################|        |        |        |  Type|  Type|    Type| DataType|           Type|  Type|  Elementwise| Elementwise|  Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|               SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|               SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl|                 ScalarPerVector|
+        //#####################################|        |        |        |      |      |        |         |               |      |    Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |                           |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |                           |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|                   _NWaveNPerXdl|
+        //#####################################|        |        |        |      |      |        |         |               |      |             |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |                           |               |               |          |                |               |               |                           |               |               |          |            |            |                             |                                |
+        DeviceContractionMultipleD_Xdl_CShuffle<  NumDim,  NumDim,  NumDim,   F32,   F32,     F32,      F32, ck::Tuple<F32>,   F32,         Pass,        Pass,     Bilinear,       GemmSpec,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>, ABlockTransferSrcVectorDim,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>, BBlockTransferSrcVectorDim,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>, CDEBlockTransferScalarPerVector>;
+    // clang-format on
+
+    bool isSupported(std::vector<ck::index_t>& ADims, // true if the instance accepts this problem
+                     std::vector<ck::index_t>& BDims,
+                     std::vector<ck::index_t>& DDims,
+                     std::vector<ck::index_t>& EDims,
+                     std::vector<ck::index_t>& AStrides,
+                     std::vector<ck::index_t>& BStrides,
+                     std::vector<ck::index_t>& DStrides,
+                     std::vector<ck::index_t>& EStrides) const
+    {
+        auto contraction = ContractionDeviceInstance{};
+
+        auto argument = contraction.MakeArgument(nullptr, // all data pointers are null: no kernel is launched here
+                                                 nullptr,
+                                                 std::array<const void*, 1>{nullptr},
+                                                 nullptr,
+                                                 ADims,
+                                                 AStrides,
+                                                 BDims,
+                                                 BStrides,
+                                                 std::array<std::vector<ck::index_t>, 1>{DDims},
+                                                 std::array<std::vector<ck::index_t>, 1>{DStrides},
+                                                 EDims,
+                                                 EStrides,
+                                                 Pass{},
+                                                 Pass{},
+                                                 Bilinear{1.f, 1.f});
+        return contraction.IsSupportedArgument(argument);
+    }
+};
+
+template <typename DataTypeA,
+          typename DataTypeB,
+          typename DataTypeC, // wrapped in ck::Tuple below; presumably the D-tensor type - TODO confirm
+          typename DataTypeD, // passed after the tuple; presumably the E (output) type - TODO confirm
+          ck::index_t NumDim> // used for all three dimension-count parameters
+class ContractionDeviceOpWrapper
+{
+
+    protected:
+    using DeviceOp = ck::tensor_operation::device::DeviceContractionMultipleD<NumDim,
+                                                                              NumDim,
+                                                                              NumDim,
+                                                                              DataTypeA,
+                                                                              DataTypeB,
+                                                                              ck::Tuple<DataTypeC>,
+                                                                              DataTypeD,
+                                                                              Pass,
+                                                                              Pass,
+                                                                              Bilinear>;
+
+    public:
+    bool IsSupportedInstance(std::vector<ck::index_t>& Dims, // true if any instance accepts the args
+                             std::vector<ck::index_t>& Strides) const
+    {
+
+        bool supported     = false; // stays false when the factory returns no instances
+        const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+            DeviceOp>::GetInstances();
+
+        for(auto& op_ptr : op_ptrs)
+        {
+            auto argument_ptr =
+                op_ptr->MakeArgumentPointer(nullptr, // null data pointers: only the support check is exercised
+                                            nullptr,
+                                            std::array<const void*, 1>{nullptr},
+                                            nullptr,
+                                            Dims, // the same Dims/Strides are reused for every tensor
+                                            Strides,
+                                            Dims,
+                                            Strides,
+                                            std::array<std::vector<ck::index_t>, 1>{Dims},
+                                            std::array<std::vector<ck::index_t>, 1>{Strides},
+                                            Dims,
+                                            Strides,
+                                            Pass{},
+                                            Pass{},
+                                            Bilinear{1.f, 1.f});
+
+            supported = supported || op_ptr->IsSupportedArgument(argument_ptr.get()); // short-circuits once true
+        }
+        return supported;
+    }
+};
+
+TEST(TestContractionInterface, IncorrectNumDims) // only the NumDim == 2 wrapper is expected to be supported
+{
+    std::vector<std::vector<ck::index_t>> Dims    = {{4, 4}, {4, 4, 4, 4}, {4, 4, 4, 4, 4, 4}}; // 2, 4 and 6 entries
+    std::vector<std::vector<ck::index_t>> Strides = {{1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1, 1, 1}};
+    ContractionDeviceOpWrapper<F32, F32, F32, F32, 1> wrapper_1d;
+    ContractionDeviceOpWrapper<F32, F32, F32, F32, 2> wrapper_2d;
+    ContractionDeviceOpWrapper<F32, F32, F32, F32, 3> wrapper_3d;
+    EXPECT_FALSE(wrapper_1d.IsSupportedInstance(Dims[0], Strides[0]));
+    EXPECT_TRUE(wrapper_2d.IsSupportedInstance(Dims[1], Strides[1]));
+    EXPECT_FALSE(wrapper_3d.IsSupportedInstance(Dims[2], Strides[2]));
+}
+
+TEST(TestContractionInterface, IncorrectDataTypes) // mixed F32/F64 combinations must be unsupported
+{
+    std::vector<ck::index_t> Dims    = {4, 4, 4, 4};
+    std::vector<ck::index_t> Strides = {64, 16, 4, 1};
+    ContractionDeviceOpWrapper<F32, F32, F64, F64, 2> wrapper_1; // F32 first two types, F64 last two
+    ContractionDeviceOpWrapper<F64, F64, F32, F32, 2> wrapper_2; // F64 first two types, F32 last two
+    EXPECT_FALSE(wrapper_1.IsSupportedInstance(Dims, Strides));
+    EXPECT_FALSE(wrapper_2.IsSupportedInstance(Dims, Strides));
+}
+
+TEST(TestContractionSupportedArgs, ABMemoryAccess) // A/B stride patterns vs. the SrcVectorDim template value
+{
+    std::vector<ck::index_t> Dims           = {4, 4, 4, 4};
+    std::vector<ck::index_t> Strides        = {64, 16, 4, 1}; // unit stride on the last index
+    std::vector<ck::index_t> StridesM1      = {4, 1, 64, 16}; // unit stride on index 1
+    std::vector<ck::index_t> StridesK1      = {64, 16, 4, 1}; // unit stride on index 3
+    std::vector<ck::index_t> InvalidStrides = {4, 4, 4, 4}; // no unit stride at all
+    // Memory access to A
+    ContractionInstanceWrapper<1, 2, 4> wrapperA1; // ABlockTransferSrcVectorDim = 1
+    ContractionInstanceWrapper<2, 2, 4> wrapperA2; // ABlockTransferSrcVectorDim = 2
+    EXPECT_FALSE(
+        wrapperA1.isSupported(Dims, Dims, Dims, Dims, InvalidStrides, Strides, Strides, Strides));
+    EXPECT_FALSE(
+        wrapperA2.isSupported(Dims, Dims, Dims, Dims, InvalidStrides, Strides, Strides, Strides));
+    EXPECT_TRUE(
+        wrapperA1.isSupported(Dims, Dims, Dims, Dims, StridesM1, Strides, Strides, Strides));
+    EXPECT_TRUE(
+        wrapperA2.isSupported(Dims, Dims, Dims, Dims, StridesK1, Strides, Strides, Strides));
+    // Memory access to B
+    ContractionInstanceWrapper<2, 1, 4> wrapperB1; // BBlockTransferSrcVectorDim = 1
+    ContractionInstanceWrapper<2, 2, 4> wrapperB2; // BBlockTransferSrcVectorDim = 2
+    EXPECT_FALSE(
+        wrapperB1.isSupported(Dims, Dims, Dims, Dims, Strides, InvalidStrides, Strides, Strides));
+    EXPECT_FALSE(
+        wrapperB2.isSupported(Dims, Dims, Dims, Dims, Strides, InvalidStrides, Strides, Strides));
+    EXPECT_TRUE(
+        wrapperB1.isSupported(Dims, Dims, Dims, Dims, Strides, StridesM1, Strides, Strides)); // StridesM1 reused for B
+    EXPECT_TRUE(
+        wrapperB2.isSupported(Dims, Dims, Dims, Dims, Strides, StridesK1, Strides, Strides));
+}
+
+TEST(TestContractionSupportedArgs, DEMemoryAccess) // D/E strides whose last index is not unit-stride are rejected
+{
+    std::vector<ck::index_t> Dims           = {4, 4, 4, 4};
+    std::vector<ck::index_t> Strides        = {64, 16, 4, 1}; // unit stride on the last index
+    std::vector<ck::index_t> InvalidStrides = {64, 16, 1, 4}; // unit stride on index 2 instead
+    ContractionInstanceWrapper<2, 2, 4> wrapper;
+    // Memory access to D
+    EXPECT_FALSE(
+        wrapper.isSupported(Dims, Dims, Dims, Dims, Strides, Strides, InvalidStrides, Strides));
+    EXPECT_TRUE(wrapper.isSupported(Dims, Dims, Dims, Dims, Strides, Strides, Strides, Strides));
+    // Memory access to E
+    EXPECT_FALSE(
+        wrapper.isSupported(Dims, Dims, Dims, Dims, Strides, Strides, Strides, InvalidStrides));
+    EXPECT_TRUE(wrapper.isSupported(Dims, Dims, Dims, Dims, Strides, Strides, Strides, Strides)); // same all-valid call as above
+}
-- 
GitLab


From 3cff34042385ee7dadf9252fc667b1285d4cee38 Mon Sep 17 00:00:00 2001
From: Sam Wu <sam.wu2@amd.com>
Date: Thu, 18 May 2023 11:08:38 -0600
Subject: [PATCH 39/71] Documentation Updates (#710)

* update documentation dependencies

add version number to docs

rename doc config directories

enable more doc formats on rtd

add license section in docs
---
 .github/dependabot.yml                    |   2 +-
 .gitignore                                |   2 +-
 .readthedocs.yaml                         |   4 +-
 README.md                                 |  16 +-
 docs/.sphinx/_toc.yml.in                  |   1 -
 docs/conf.py                              |  15 +-
 docs/{.doxygen => doxygen}/Doxyfile       |   0
 docs/license.rst                          |   6 +
 docs/sphinx/_toc.yml.in                   |  10 ++
 docs/{.sphinx => sphinx}/requirements.in  |   2 +-
 docs/{.sphinx => sphinx}/requirements.txt | 171 ++++------------------
 11 files changed, 79 insertions(+), 150 deletions(-)
 delete mode 100644 docs/.sphinx/_toc.yml.in
 rename docs/{.doxygen => doxygen}/Doxyfile (100%)
 create mode 100644 docs/license.rst
 create mode 100644 docs/sphinx/_toc.yml.in
 rename docs/{.sphinx => sphinx}/requirements.in (54%)
 rename docs/{.sphinx => sphinx}/requirements.txt (51%)

diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index 9cdf2d670..276690bd4 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -6,7 +6,7 @@
 version: 2
 updates:
   - package-ecosystem: "pip" # See documentation for possible values
-    directory: "/docs/.sphinx" # Location of package manifests
+    directory: "/docs/sphinx" # Location of package manifests
     open-pull-requests-limit: 10
     schedule:
       interval: "daily"
diff --git a/.gitignore b/.gitignore
index 362fb9e2e..7af066c82 100644
--- a/.gitignore
+++ b/.gitignore
@@ -49,10 +49,10 @@ build*
 install.dir*
 
 # documentation artifacts
-build/
 _build/
 _images/
 _static/
 _templates/
 _toc.yml
 docBin/
+_doxygen/
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index b73953683..5f50df252 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -11,8 +11,8 @@ build:
 sphinx:
    configuration: docs/conf.py
 
-formats: [htmlzip]
+formats: [htmlzip, pdf, epub]
 
 python:
    install:
-   - requirements: docs/.sphinx/requirements.txt
+   - requirements: docs/sphinx/requirements.txt
diff --git a/README.md b/README.md
index 04199f11b..a45f61a37 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,7 @@
 # Composable Kernel
 
 ## Methodology
+
 Composable Kernel (CK) library aims to provide a programming model for writing performance critical kernels for machine learning workloads across multiple architectures including GPUs, CPUs, etc, through general purpose kernel languages, like HIP C++.
 
 CK utilizes two concepts to achieve performance portability and code maintainability:
@@ -10,6 +11,7 @@ CK utilizes two concepts to achieve performance portability and code maintainabi
 ![ALT](/docs/data/ck_component.png "CK Components")
 
 ## Code Structure
+
 Current CK library are structured into 4 layers:
 * "Templated Tile Operators" layer
 * "Templated Kernel and Invoker" layer
@@ -24,30 +26,35 @@ Run the steps below to build documentation locally.
 
 ```
 cd docs
-pip3 install -r .sphinx/requirements.txt
+pip3 install -r sphinx/requirements.txt
 python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
 ```
 
 ## Contributors
+
 The list of developers and contributors is here: [Contributors](/CONTRIBUTORS.md)
 
 ## Citation
+
 If you use CK, please use following citations:
 * CK paper will be freely available on arXiv soon: [Realizing Tensor Operators Using Coordinate Transformations and Tile Based Programming](???)
 * [CITATION.cff](/CITATION.cff)
 
 ## License
+
 CK is released under the MIT license. [License File](/LICENSE)
 
 
 # Build CK
 
 ## Build docker image
+
 ```bash
 DOCKER_BUILDKIT=1 docker build -t ck:latest -f Dockerfile .
 ```
 
 ## Launch docker
+
 ```bash
 docker run                                     \
 -it                                            \
@@ -60,10 +67,12 @@ ck:latest                                      \
 ```
 
 ## Build CK
+
 ```bash
 mkdir build && cd build
 
 # Need to specify target ID, example below is for gfx908 and gfx90a
+
 cmake                                                                                             \
 -D CMAKE_PREFIX_PATH=/opt/rocm                                                                    \
 -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                                                         \
@@ -74,6 +83,7 @@ cmake
 ```
 
 ### Build examples and tests
+
 ```bash
  make -j examples tests
  make test
@@ -83,21 +93,25 @@ Instructions for running each individual examples are under [example](/example)
 
 
 ## Build ckProfiler
+
 ```bash
  make -j ckProfiler
 ```
 Instructions for running ckProfiler are under [profiler](/profiler)
 
 ## Install CK
+
 ```bash
 make install
 ```
 
 ## Using CK as pre-built kernel library
+
 Instructions for using CK as a pre-built kernel library are under [client_example](/client_example)
 
 ## Caveat
 ### Kernel Timing and Verification
+
 CK's own kernel timer will warn up kernel once, and then run it multiple times
 to get average kernel time. For some kernels that use atomic add, this will cause
 output buffer to be accumulated multiple times, causing verification failure.
diff --git a/docs/.sphinx/_toc.yml.in b/docs/.sphinx/_toc.yml.in
deleted file mode 100644
index ff2124887..000000000
--- a/docs/.sphinx/_toc.yml.in
+++ /dev/null
@@ -1 +0,0 @@
-root: index
diff --git a/docs/conf.py b/docs/conf.py
index 3ec81ee9d..0de590da1 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -4,10 +4,21 @@
 # list see the documentation:
 # https://www.sphinx-doc.org/en/master/usage/configuration.html
 
+import subprocess
+
 from rocm_docs import ROCmDocs
 
-docs_core = ROCmDocs("Composable Kernel Documentation")
-docs_core.run_doxygen()
+
+name = "Composable Kernel"
+get_version = r'sed -n -e "s/^rocm_setup_version(.* \([0-9\.]\{1,\}\).*/\1/p" ../CMakeLists.txt'
+version = subprocess.getoutput(get_version)
+if len(version) > 0:
+    name = f"{name} {version}"
+
+external_toc_path = "./sphinx/_toc.yml"
+
+docs_core = ROCmDocs(f"{name} Documentation")
+docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/docBin/xml")
 docs_core.setup()
 
 mathjax3_config = {
diff --git a/docs/.doxygen/Doxyfile b/docs/doxygen/Doxyfile
similarity index 100%
rename from docs/.doxygen/Doxyfile
rename to docs/doxygen/Doxyfile
diff --git a/docs/license.rst b/docs/license.rst
new file mode 100644
index 000000000..ddb544496
--- /dev/null
+++ b/docs/license.rst
@@ -0,0 +1,6 @@
+=======
+License
+=======
+
+.. include:: ../LICENSE
+   :literal:
diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in
new file mode 100644
index 000000000..83dd1e7b1
--- /dev/null
+++ b/docs/sphinx/_toc.yml.in
@@ -0,0 +1,10 @@
+# Anywhere {branch} is used, the branch name will be substituted.
+# These comments will also be removed.
+defaults:
+  numbered: False
+  maxdepth: 6
+root: index
+subtrees:
+  - caption: About
+    entries:
+      - file: license
diff --git a/docs/.sphinx/requirements.in b/docs/sphinx/requirements.in
similarity index 54%
rename from docs/.sphinx/requirements.in
rename to docs/sphinx/requirements.in
index 1905de6e6..4bdf41b95 100644
--- a/docs/.sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
@@ -1,2 +1,2 @@
-rocm-docs-core==0.2.0
+rocm-docs-core==0.10.3
 sphinxcontrib-bibtex==2.5.0
diff --git a/docs/.sphinx/requirements.txt b/docs/sphinx/requirements.txt
similarity index 51%
rename from docs/.sphinx/requirements.txt
rename to docs/sphinx/requirements.txt
index d1698b285..097acba22 100644
--- a/docs/.sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -1,25 +1,17 @@
 #
-# This file is autogenerated by pip-compile with Python 3.10
+# This file is autogenerated by pip-compile with Python 3.8
 # by the following command:
 #
-#    pip-compile .sphinx/requirements.in
+#    pip-compile requirements.in
 #
 accessible-pygments==0.0.3
     # via pydata-sphinx-theme
 alabaster==0.7.13
     # via sphinx
-asttokens==2.2.1
-    # via stack-data
-attrs==22.2.0
-    # via
-    #   jsonschema
-    #   jupyter-cache
 babel==2.12.1
     # via
     #   pydata-sphinx-theme
     #   sphinx
-backcall==0.2.0
-    # via ipython
 beautifulsoup4==4.11.2
     # via pydata-sphinx-theme
 breathe==4.34.0
@@ -27,19 +19,15 @@ breathe==4.34.0
 certifi==2022.12.7
     # via requests
 cffi==1.15.1
-    # via pynacl
+    # via
+    #   cryptography
+    #   pynacl
 charset-normalizer==3.1.0
     # via requests
 click==8.1.3
-    # via
-    #   jupyter-cache
-    #   sphinx-external-toc
-comm==0.1.2
-    # via ipykernel
-debugpy==1.6.6
-    # via ipykernel
-decorator==5.1.1
-    # via ipython
+    # via sphinx-external-toc
+cryptography==40.0.2
+    # via pyjwt
 deprecated==1.2.13
     # via pygithub
 docutils==0.16
@@ -48,52 +36,26 @@ docutils==0.16
     #   myst-parser
     #   pybtex-docutils
     #   pydata-sphinx-theme
-    #   rocm-docs-core
     #   sphinx
     #   sphinxcontrib-bibtex
-executing==1.2.0
-    # via stack-data
-fastjsonschema==2.16.3
-    # via nbformat
 gitdb==4.0.10
     # via gitpython
 gitpython==3.1.31
     # via rocm-docs-core
-greenlet==2.0.2
-    # via sqlalchemy
 idna==3.4
     # via requests
 imagesize==1.4.1
     # via sphinx
 importlib-metadata==6.0.0
     # via
-    #   jupyter-cache
-    #   myst-nb
-ipykernel==6.21.3
-    # via myst-nb
-ipython==8.11.0
-    # via
-    #   ipykernel
-    #   myst-nb
-jedi==0.18.2
-    # via ipython
+    #   sphinx
+    #   sphinxcontrib-bibtex
+importlib-resources==5.12.0
+    # via rocm-docs-core
 jinja2==3.1.2
     # via
     #   myst-parser
     #   sphinx
-jsonschema==4.17.3
-    # via nbformat
-jupyter-cache==0.5.0
-    # via myst-nb
-jupyter-client==8.0.3
-    # via
-    #   ipykernel
-    #   nbclient
-jupyter-core==5.3.0
-    # via
-    #   ipykernel
-    #   jupyter-client
-    #   nbformat
 latexcodec==2.0.1
     # via pybtex
 linkify-it-py==1.0.3
@@ -104,54 +66,16 @@ markdown-it-py==2.2.0
     #   myst-parser
 markupsafe==2.1.2
     # via jinja2
-matplotlib-inline==0.1.6
-    # via
-    #   ipykernel
-    #   ipython
 mdit-py-plugins==0.3.5
     # via myst-parser
 mdurl==0.1.2
     # via markdown-it-py
-myst-nb==0.17.1
+myst-parser[linkify]==1.0.0
     # via rocm-docs-core
-myst-parser[linkify]==0.18.1
-    # via
-    #   myst-nb
-    #   rocm-docs-core
-nbclient==0.5.13
-    # via
-    #   jupyter-cache
-    #   myst-nb
-nbformat==5.7.3
-    # via
-    #   jupyter-cache
-    #   myst-nb
-    #   nbclient
-nest-asyncio==1.5.6
-    # via
-    #   ipykernel
-    #   nbclient
 packaging==23.0
     # via
-    #   ipykernel
     #   pydata-sphinx-theme
     #   sphinx
-parso==0.8.3
-    # via jedi
-pexpect==4.8.0
-    # via ipython
-pickleshare==0.7.5
-    # via ipython
-platformdirs==3.1.1
-    # via jupyter-core
-prompt-toolkit==3.0.38
-    # via ipython
-psutil==5.9.4
-    # via ipykernel
-ptyprocess==0.7.0
-    # via pexpect
-pure-eval==0.2.2
-    # via stack-data
 pybtex==0.24.0
     # via
     #   pybtex-docutils
@@ -160,57 +84,47 @@ pybtex-docutils==1.0.2
     # via sphinxcontrib-bibtex
 pycparser==2.21
     # via cffi
-pydata-sphinx-theme==0.13.1
-    # via sphinx-book-theme
-pygithub==1.57
+pydata-sphinx-theme==0.13.3
+    # via
+    #   rocm-docs-core
+    #   sphinx-book-theme
+pygithub==1.58.2
     # via rocm-docs-core
 pygments==2.14.0
     # via
     #   accessible-pygments
-    #   ipython
     #   pydata-sphinx-theme
     #   sphinx
-pyjwt==2.6.0
+pyjwt[crypto]==2.6.0
     # via pygithub
 pynacl==1.5.0
     # via pygithub
-pyrsistent==0.19.3
-    # via jsonschema
-python-dateutil==2.8.2
-    # via jupyter-client
+pytz==2023.3
+    # via babel
 pyyaml==6.0
     # via
-    #   jupyter-cache
-    #   myst-nb
     #   myst-parser
     #   pybtex
     #   sphinx-external-toc
-pyzmq==25.0.1
-    # via
-    #   ipykernel
-    #   jupyter-client
 requests==2.28.2
     # via
     #   pygithub
     #   sphinx
-rocm-docs-core==0.2.0
-    # via -r .sphinx/requirements.in
+rocm-docs-core==0.10.3
+    # via -r requirements.in
 six==1.16.0
     # via
-    #   asttokens
     #   latexcodec
     #   pybtex
-    #   python-dateutil
 smmap==5.0.0
     # via gitdb
 snowballstemmer==2.2.0
     # via sphinx
 soupsieve==2.4
     # via beautifulsoup4
-sphinx==4.3.1
+sphinx==5.3.0
     # via
     #   breathe
-    #   myst-nb
     #   myst-parser
     #   pydata-sphinx-theme
     #   rocm-docs-core
@@ -220,7 +134,7 @@ sphinx==4.3.1
     #   sphinx-external-toc
     #   sphinx-notfound-page
     #   sphinxcontrib-bibtex
-sphinx-book-theme==1.0.0rc2
+sphinx-book-theme==1.0.1
     # via rocm-docs-core
 sphinx-copybutton==0.5.1
     # via rocm-docs-core
@@ -233,7 +147,7 @@ sphinx-notfound-page==0.8.3
 sphinxcontrib-applehelp==1.0.4
     # via sphinx
 sphinxcontrib-bibtex==2.5.0
-    # via -r .sphinx/requirements.in
+    # via -r requirements.in
 sphinxcontrib-devhelp==1.0.2
     # via sphinx
 sphinxcontrib-htmlhelp==2.0.1
@@ -244,40 +158,15 @@ sphinxcontrib-qthelp==1.0.3
     # via sphinx
 sphinxcontrib-serializinghtml==1.1.5
     # via sphinx
-sqlalchemy==1.4.46
-    # via jupyter-cache
-stack-data==0.6.2
-    # via ipython
-tabulate==0.9.0
-    # via jupyter-cache
-tornado==6.2
-    # via
-    #   ipykernel
-    #   jupyter-client
-traitlets==5.9.0
-    # via
-    #   comm
-    #   ipykernel
-    #   ipython
-    #   jupyter-client
-    #   jupyter-core
-    #   matplotlib-inline
-    #   nbclient
-    #   nbformat
 typing-extensions==4.5.0
-    # via
-    #   myst-nb
-    #   myst-parser
+    # via pydata-sphinx-theme
 uc-micro-py==1.0.1
     # via linkify-it-py
 urllib3==1.26.15
     # via requests
-wcwidth==0.2.6
-    # via prompt-toolkit
 wrapt==1.15.0
     # via deprecated
 zipp==3.15.0
-    # via importlib-metadata
-
-# The following packages are considered to be unsafe in a requirements file:
-# setuptools
+    # via
+    #   importlib-metadata
+    #   importlib-resources
-- 
GitLab


From d821d1e54f6ce8131070a1253dfc4dd6662d3d85 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Tue, 23 May 2023 09:23:16 -0700
Subject: [PATCH 40/71] Enable gemm_dl and other kernels on Navi3x. (#714)

* enable dl kernels on navi3

* do not build xdl tests and examples on Navi

* run tests before building everything on jenkins

* disable gemm_bilinear on gfx1030

* add gpu targets to installer on Navi

* put tests in the same order as before

* reduce the number of navi targets in CI

* build CI installer for gfx940 as well

* only build for MI300 during QA runs
---
 Jenkinsfile                                   | 32 +++++++++++---
 example/02_gemm_bilinear/CMakeLists.txt       |  4 +-
 example/03_gemm_bias_relu/CMakeLists.txt      |  4 +-
 .../04_gemm_add_add_fastgelu/CMakeLists.txt   | 32 +++++++-------
 example/09_convnd_fwd/CMakeLists.txt          | 15 +++----
 .../CMakeLists.txt                            | 31 +++++++-------
 example/14_gemm_quantization/CMakeLists.txt   |  6 ++-
 .../CMakeLists.txt                            | 42 ++++++++++---------
 example/17_convnd_bwd_data/CMakeLists.txt     |  7 ++--
 example/18_batched_gemm_reduce/CMakeLists.txt |  4 +-
 .../20_grouped_conv_bwd_weight/CMakeLists.txt | 11 ++---
 .../run_grouped_conv_bwd_weight_example.inc   |  4 +-
 example/21_gemm_layernorm/CMakeLists.txt      | 10 +++--
 .../CMakeLists.txt                            | 36 ++++++++--------
 example/31_batched_gemm_gemm/CMakeLists.txt   | 20 +++++----
 example/35_splitK_gemm/CMakeLists.txt         | 21 +++++-----
 .../CMakeLists.txt                            | 13 +++---
 .../40_conv2d_fwd_quantization/CMakeLists.txt | 12 +++---
 .../41_grouped_conv_conv_fwd/CMakeLists.txt   | 18 ++++----
 .../CMakeLists.txt                            |  4 +-
 ..._conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp |  8 ++--
 ...ice_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp |  7 +++-
 .../device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp |  4 +-
 .../gpu/device/impl/device_gemm_dl.hpp        |  4 +-
 .../device/impl/device_gemm_multiple_d_dl.hpp |  8 ++--
 ...uped_conv_bwd_weight_gnwc_gkxc_gnwk_dl.hpp |  4 +-
 .../device_grouped_gemm_multiple_d_dl.hpp     |  5 ++-
 test/batched_gemm/CMakeLists.txt              | 26 ++++++------
 test/batched_gemm_gemm/CMakeLists.txt         | 10 +++--
 test/batched_gemm_reduce/CMakeLists.txt       |  8 ++--
 test/batched_gemm_softmax_gemm/CMakeLists.txt | 10 +++--
 .../CMakeLists.txt                            | 28 +++++++------
 test/contraction/CMakeLists.txt               |  6 ++-
 test/convnd_bwd_data/CMakeLists.txt           |  6 ++-
 test/convnd_fwd/CMakeLists.txt                |  6 ++-
 test/gemm_layernorm/CMakeLists.txt            | 13 +++---
 test/gemm_split_k/CMakeLists.txt              |  8 ++--
 test/grouped_convnd_bwd_weight/CMakeLists.txt |  6 ++-
 test/grouped_gemm/CMakeLists.txt              |  8 ++--
 39 files changed, 291 insertions(+), 210 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 83559c223..fbff349fc 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -493,10 +493,11 @@ def Build_CK(Map conf=[:]){
                 {
                     cmake_build(conf)
                     dir("build"){
+                        //run tests and examples on all nodes, using half of the available cores
+                        sh 'make -j\$(( \$(nproc) / 2 )) check'
                         if (navi_node == 0 ){
-                           //run tests and examples on all nodes except Navi
-                           sh 'make -j check'
-                           //we only need the ckProfiler to run the performance tests, so we pack and stash it
+                            //we only need the ckProfiler to run the performance tests, so we pack and stash it
+                            //do not stash profiler on Navi nodes
                            sh 'tar -zcvf ckProfiler.tar.gz bin/ckProfiler'
                            stash "ckProfiler.tar.gz"
                         }
@@ -686,12 +687,31 @@ pipeline {
         {
             parallel
             {
+                stage("Build CK and run Tests on MI100/MI200/MI300")
+                {
+                    when {
+                        beforeAgent true
+                        expression { params.RUN_FULL_QA.toBoolean() }
+                    }
+                    agent{ label rocmnode("gfx908 || gfx90a") }
+                    environment{
+                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a;gfx940" """
+                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a;gfx940" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ 
+                    }
+                    steps{
+                        Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
+                    }
+                }
                 stage("Build CK and run Tests on MI100/MI200")
                 {
+                    when {
+                        beforeAgent true
+                        expression { !params.RUN_FULL_QA.toBoolean() }
+                    }
                     agent{ label rocmnode("gfx908 || gfx90a") }
                     environment{
                         setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a" """
-                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908,gfx90a" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ 
+                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ 
                     }
                     steps{
                         Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
@@ -705,8 +725,8 @@ pipeline {
                     }
                     agent{ label rocmnode("navi21") }
                     environment{
-                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install """ 
-                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx1030;gfx1100;gfx1101;gfx1102" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
+                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1030" """ 
+                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx1030" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
 
                     }
                     steps{
diff --git a/example/02_gemm_bilinear/CMakeLists.txt b/example/02_gemm_bilinear/CMakeLists.txt
index 16a821102..eecec2437 100644
--- a/example/02_gemm_bilinear/CMakeLists.txt
+++ b/example/02_gemm_bilinear/CMakeLists.txt
@@ -1,4 +1,6 @@
-add_example_executable(example_gemm_bilinear_xdl_fp16 gemm_bilinear_xdl_fp16.cpp)
 if(GPU_TARGETS MATCHES "gfx1100" OR GPU_TARGETS MATCHES "gfx1101" OR GPU_TARGETS MATCHES "gfx1102")
     add_example_executable(example_gemm_bilinear_wmma_fp16 gemm_bilinear_wmma_fp16.cpp)
 endif()
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+    add_example_executable(example_gemm_bilinear_xdl_fp16 gemm_bilinear_xdl_fp16.cpp)
+endif()
diff --git a/example/03_gemm_bias_relu/CMakeLists.txt b/example/03_gemm_bias_relu/CMakeLists.txt
index 35c54abac..8834a910f 100644
--- a/example/03_gemm_bias_relu/CMakeLists.txt
+++ b/example/03_gemm_bias_relu/CMakeLists.txt
@@ -1 +1,3 @@
-add_example_executable(example_gemm_bias_relu_xdl_fp16 gemm_bias_relu_xdl_fp16.cpp)
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+    add_example_executable(example_gemm_bias_relu_xdl_fp16 gemm_bias_relu_xdl_fp16.cpp)
+endif()
\ No newline at end of file
diff --git a/example/04_gemm_add_add_fastgelu/CMakeLists.txt b/example/04_gemm_add_add_fastgelu/CMakeLists.txt
index c75c5ba51..a706830b6 100644
--- a/example/04_gemm_add_add_fastgelu/CMakeLists.txt
+++ b/example/04_gemm_add_add_fastgelu/CMakeLists.txt
@@ -1,17 +1,19 @@
-add_custom_target(example_gemm_add_add_fastgelu_xdl)
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+    add_custom_target(example_gemm_add_add_fastgelu_xdl)
 
-add_example_executable(example_gemm_add_add_fastgelu_xdl_bf16 gemm_add_add_fastgelu_xdl_bf16.cpp)
-add_example_executable(example_gemm_add_add_fastgelu_xdl_fp16 gemm_add_add_fastgelu_xdl_fp16.cpp)
-add_example_executable(example_gemm_add_add_fastgelu_xdl_fp32 gemm_add_add_fastgelu_xdl_fp32.cpp)
-if(USE_BITINT_EXTENSION_INT4)
-  add_example_executable(example_gemm_add_add_fastgelu_xdl_int4 gemm_add_add_fastgelu_xdl_int4.cpp)
-endif(USE_BITINT_EXTENSION_INT4)
-add_example_executable(example_gemm_add_add_fastgelu_xdl_int8 gemm_add_add_fastgelu_xdl_int8.cpp)
+    add_example_executable(example_gemm_add_add_fastgelu_xdl_bf16 gemm_add_add_fastgelu_xdl_bf16.cpp)
+    add_example_executable(example_gemm_add_add_fastgelu_xdl_fp16 gemm_add_add_fastgelu_xdl_fp16.cpp)
+    add_example_executable(example_gemm_add_add_fastgelu_xdl_fp32 gemm_add_add_fastgelu_xdl_fp32.cpp)
+    if(USE_BITINT_EXTENSION_INT4)
+       add_example_executable(example_gemm_add_add_fastgelu_xdl_int4 gemm_add_add_fastgelu_xdl_int4.cpp)
+    endif(USE_BITINT_EXTENSION_INT4)
+    add_example_executable(example_gemm_add_add_fastgelu_xdl_int8 gemm_add_add_fastgelu_xdl_int8.cpp)
 
-add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_bf16)
-add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_fp16)
-add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_fp32)
-if(USE_BITINT_EXTENSION_INT4)
-  add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_int4)
-endif(USE_BITINT_EXTENSION_INT4)
-add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_int8)
+    add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_bf16)
+    add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_fp16)
+    add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_fp32)
+    if(USE_BITINT_EXTENSION_INT4)
+        add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_int4)
+    endif(USE_BITINT_EXTENSION_INT4)
+    add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_int8)
+endif()
\ No newline at end of file
diff --git a/example/09_convnd_fwd/CMakeLists.txt b/example/09_convnd_fwd/CMakeLists.txt
index e0a53005b..1bcf2d148 100644
--- a/example/09_convnd_fwd/CMakeLists.txt
+++ b/example/09_convnd_fwd/CMakeLists.txt
@@ -1,10 +1,11 @@
-add_example_executable(example_convnd_fwd_xdl_fp32 convnd_fwd_xdl_fp32.cpp)
-add_example_executable(example_convnd_fwd_xdl_fp16 convnd_fwd_xdl_fp16.cpp)
-add_example_executable(example_convnd_fwd_xdl_bf16 convnd_fwd_xdl_bf16.cpp)
-add_example_executable(example_convnd_fwd_xdl_int8 convnd_fwd_xdl_int8.cpp)
-# FIXME: re-enable this exampe as test when SWDEV-335738 is fixed
-add_example_executable_no_testing(example_convnd_fwd_xdl_fp64 convnd_fwd_xdl_fp64.cpp)
-
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+    add_example_executable(example_convnd_fwd_xdl_fp32 convnd_fwd_xdl_fp32.cpp)
+    add_example_executable(example_convnd_fwd_xdl_fp16 convnd_fwd_xdl_fp16.cpp)
+    add_example_executable(example_convnd_fwd_xdl_bf16 convnd_fwd_xdl_bf16.cpp)
+    add_example_executable(example_convnd_fwd_xdl_int8 convnd_fwd_xdl_int8.cpp)
+    # FIXME: re-enable this example as test when SWDEV-335738 is fixed
+    add_example_executable_no_testing(example_convnd_fwd_xdl_fp64 convnd_fwd_xdl_fp64.cpp)
+endif()
 add_example_executable(example_convnd_fwd_dl_fp16 convnd_fwd_dl_fp16.cpp)
 add_example_executable(example_convnd_fwd_dl_fp32 convnd_fwd_dl_fp32.cpp)
 add_example_executable(example_convnd_fwd_dl_int8 convnd_fwd_dl_int8.cpp)
diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/CMakeLists.txt b/example/10_convnd_fwd_multiple_d_multiple_reduce/CMakeLists.txt
index 98941b4db..de26462f6 100644
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/CMakeLists.txt
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/CMakeLists.txt
@@ -1,16 +1,15 @@
-add_custom_target(example_convnd_fwd_reduce_xdl)
-
-add_example_executable(example_convnd_fwd_max_xdl_int8 convnd_fwd_max_xdl_int8.cpp)
-add_example_executable_no_testing(example_convnd_fwd_max_xdl_bf16 convnd_fwd_max_xdl_bf16.cpp)
-add_example_executable_no_testing(example_convnd_fwd_max_xdl_fp16 convnd_fwd_max_xdl_fp16.cpp)
-add_example_executable(example_convnd_fwd_max_xdl_fp32 convnd_fwd_max_xdl_fp32.cpp)
-
-add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_int8)
-add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_bf16)
-add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_fp16)
-add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_fp32)
-
-if(USE_BITINT_EXTENSION_INT4)
-  add_example_executable(example_convnd_fwd_max_xdl_int4 convnd_fwd_max_xdl_int4.cpp)
-  add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_int4)
-endif(USE_BITINT_EXTENSION_INT4)
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+   add_custom_target(example_convnd_fwd_reduce_xdl)
+   add_example_executable(example_convnd_fwd_max_xdl_int8 convnd_fwd_max_xdl_int8.cpp)
+   add_example_executable_no_testing(example_convnd_fwd_max_xdl_bf16 convnd_fwd_max_xdl_bf16.cpp)
+   add_example_executable_no_testing(example_convnd_fwd_max_xdl_fp16 convnd_fwd_max_xdl_fp16.cpp)
+   add_example_executable(example_convnd_fwd_max_xdl_fp32 convnd_fwd_max_xdl_fp32.cpp)
+   add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_int8)
+   add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_bf16)
+   add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_fp16)
+   add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_fp32)
+   if(USE_BITINT_EXTENSION_INT4)
+      add_example_executable(example_convnd_fwd_max_xdl_int4 convnd_fwd_max_xdl_int4.cpp)
+      add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_int4)
+   endif(USE_BITINT_EXTENSION_INT4)
+endif()
\ No newline at end of file
diff --git a/example/14_gemm_quantization/CMakeLists.txt b/example/14_gemm_quantization/CMakeLists.txt
index 8ea11df9c..584333e7b 100644
--- a/example/14_gemm_quantization/CMakeLists.txt
+++ b/example/14_gemm_quantization/CMakeLists.txt
@@ -2,5 +2,7 @@
 add_example_executable(example_gemm_dl_quantization_int8 gemm_dl_quantization_int8.cpp)
 
 # xdlops
-add_example_executable(example_gemm_xdl_bias_relu_quantization_int8 gemm_xdl_bias_relu_quantization_int8.cpp)
-add_example_executable(example_gemm_xdl_quantization_int8 gemm_xdl_quantization_int8.cpp)
\ No newline at end of file
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+   add_example_executable(example_gemm_xdl_bias_relu_quantization_int8 gemm_xdl_bias_relu_quantization_int8.cpp)
+   add_example_executable(example_gemm_xdl_quantization_int8 gemm_xdl_quantization_int8.cpp)
+endif()
\ No newline at end of file
diff --git a/example/16_gemm_multi_d_multi_reduces/CMakeLists.txt b/example/16_gemm_multi_d_multi_reduces/CMakeLists.txt
index 226656a73..0f1ca777c 100644
--- a/example/16_gemm_multi_d_multi_reduces/CMakeLists.txt
+++ b/example/16_gemm_multi_d_multi_reduces/CMakeLists.txt
@@ -1,40 +1,42 @@
-add_custom_target(example_gemm_reduce_xdl)
-add_custom_target(example_gemm_reduce_xdl_max)
-add_custom_target(example_gemm_reduce_xdl_mean_meansquare)
-add_custom_target(example_gemm_add_add_mean_meansquare_xdl)
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+   add_custom_target(example_gemm_reduce_xdl)
+   add_custom_target(example_gemm_reduce_xdl_max)
+   add_custom_target(example_gemm_reduce_xdl_mean_meansquare)
+   add_custom_target(example_gemm_add_add_mean_meansquare_xdl)
 
-add_example_executable(example_gemm_max_xdl_fp16 gemm_max_xdl_fp16.cpp)
-add_example_executable(example_gemm_max_xdl_int8 gemm_max_xdl_int8.cpp)
-add_example_executable(example_gemm_max_xdl_fp32 gemm_max_xdl_fp32.cpp)
-add_example_executable(example_gemm_max_xdl_bf16 gemm_max_xdl_bf16.cpp)
+   add_example_executable(example_gemm_max_xdl_fp16 gemm_max_xdl_fp16.cpp)
+   add_example_executable(example_gemm_max_xdl_int8 gemm_max_xdl_int8.cpp)
+   add_example_executable(example_gemm_max_xdl_fp32 gemm_max_xdl_fp32.cpp)
+   add_example_executable(example_gemm_max_xdl_bf16 gemm_max_xdl_bf16.cpp)
 
-add_example_executable(example_gemm_add_add_mean_meansquare_xdl_fp16 gemm_add_add_mean_meansquare_xdl_fp16.cpp)
+   add_example_executable(example_gemm_add_add_mean_meansquare_xdl_fp16 gemm_add_add_mean_meansquare_xdl_fp16.cpp)
 
-add_example_executable(example_gemm_mean_meansquare_xdl_fp16 gemm_mean_meansquare_xdl_fp16.cpp)
-add_example_executable(example_gemm_mean_meansquare_xdl_fp32 gemm_mean_meansquare_xdl_fp32.cpp)
-add_example_executable(example_gemm_mean_meansquare_xdl_bf16 gemm_mean_meansquare_xdl_bf16.cpp)
-add_example_executable(example_gemm_add_addsquare_xdl_int8 gemm_add_addsquare_xdl_int8.cpp)
+   add_example_executable(example_gemm_mean_meansquare_xdl_fp16 gemm_mean_meansquare_xdl_fp16.cpp)
+   add_example_executable(example_gemm_mean_meansquare_xdl_fp32 gemm_mean_meansquare_xdl_fp32.cpp)
+   add_example_executable(example_gemm_mean_meansquare_xdl_bf16 gemm_mean_meansquare_xdl_bf16.cpp)
+   add_example_executable(example_gemm_add_addsquare_xdl_int8 gemm_add_addsquare_xdl_int8.cpp)
 
-add_dependencies(example_gemm_reduce_xdl_max
+   add_dependencies(example_gemm_reduce_xdl_max
                  example_gemm_max_xdl_bf16
                  example_gemm_max_xdl_fp16
                  example_gemm_max_xdl_fp32
                  example_gemm_max_xdl_int8)
 
-add_dependencies(example_gemm_reduce_xdl_mean_meansquare
+   add_dependencies(example_gemm_reduce_xdl_mean_meansquare
                  example_gemm_mean_meansquare_xdl_fp16
                  example_gemm_mean_meansquare_xdl_fp32
                  example_gemm_mean_meansquare_xdl_bf16
                  example_gemm_add_addsquare_xdl_int8)
 
-add_dependencies(example_gemm_add_add_mean_meansquare_xdl example_gemm_add_add_mean_meansquare_xdl_fp16)
+   add_dependencies(example_gemm_add_add_mean_meansquare_xdl example_gemm_add_add_mean_meansquare_xdl_fp16)
 
-add_dependencies(example_gemm_reduce_xdl
+   add_dependencies(example_gemm_reduce_xdl
                  example_gemm_reduce_xdl_mean_meansquare
                  example_gemm_reduce_xdl_max
                  example_gemm_add_add_mean_meansquare_xdl)
 
-if(USE_BITINT_EXTENSION_INT4)
-  add_example_executable(example_gemm_max_xdl_int4 gemm_max_xdl_int4.cpp)
-  add_dependencies(example_gemm_reduce_xdl_max example_gemm_max_xdl_int4)
+   if(USE_BITINT_EXTENSION_INT4)
+      add_example_executable(example_gemm_max_xdl_int4 gemm_max_xdl_int4.cpp)
+      add_dependencies(example_gemm_reduce_xdl_max example_gemm_max_xdl_int4)
+   endif()
 endif()
diff --git a/example/17_convnd_bwd_data/CMakeLists.txt b/example/17_convnd_bwd_data/CMakeLists.txt
index fa4e65d96..ed95946c4 100644
--- a/example/17_convnd_bwd_data/CMakeLists.txt
+++ b/example/17_convnd_bwd_data/CMakeLists.txt
@@ -1,5 +1,6 @@
-add_example_executable(example_convnd_bwd_data_xdl_fp16 convnd_bwd_data_xdl_fp16.cpp)
-target_link_libraries(example_convnd_bwd_data_xdl_fp16 PRIVATE utility)
-
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+   add_example_executable(example_convnd_bwd_data_xdl_fp16 convnd_bwd_data_xdl_fp16.cpp)
+   target_link_libraries(example_convnd_bwd_data_xdl_fp16 PRIVATE utility)
+endif()
 add_example_executable(example_convnd_bwd_data_dl_fp16 convnd_bwd_data_dl_fp16.cpp)
 target_link_libraries(example_convnd_bwd_data_dl_fp16 PRIVATE utility)
diff --git a/example/18_batched_gemm_reduce/CMakeLists.txt b/example/18_batched_gemm_reduce/CMakeLists.txt
index 99fc0043d..0c3648dbf 100644
--- a/example/18_batched_gemm_reduce/CMakeLists.txt
+++ b/example/18_batched_gemm_reduce/CMakeLists.txt
@@ -1,2 +1,4 @@
-add_example_executable(example_batched_gemm_reduce_xdl_fp16 batched_gemm_reduce_xdl_fp16.cpp)
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+   add_example_executable(example_batched_gemm_reduce_xdl_fp16 batched_gemm_reduce_xdl_fp16.cpp)
+endif()
 
diff --git a/example/20_grouped_conv_bwd_weight/CMakeLists.txt b/example/20_grouped_conv_bwd_weight/CMakeLists.txt
index cbe4f5f48..0ee39ac84 100644
--- a/example/20_grouped_conv_bwd_weight/CMakeLists.txt
+++ b/example/20_grouped_conv_bwd_weight/CMakeLists.txt
@@ -1,11 +1,12 @@
-add_custom_target(example_grouped_conv_bwd_weight)
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+   add_custom_target(example_grouped_conv_bwd_weight)
 
-add_example_executable(example_grouped_conv_bwd_weight_xdl_fp16 grouped_conv_bwd_weight_xdl_fp16.cpp)
-add_example_executable(example_grouped_conv_bwd_weight_xdl_bf16 grouped_conv_bwd_weight_xdl_bf16.cpp)
+   add_example_executable(example_grouped_conv_bwd_weight_xdl_fp16 grouped_conv_bwd_weight_xdl_fp16.cpp)
+   add_example_executable(example_grouped_conv_bwd_weight_xdl_bf16 grouped_conv_bwd_weight_xdl_bf16.cpp)
 
-
-add_dependencies(example_grouped_conv_bwd_weight example_grouped_conv_bwd_weight_xdl_fp16
+   add_dependencies(example_grouped_conv_bwd_weight example_grouped_conv_bwd_weight_xdl_fp16
                                                  example_grouped_conv_bwd_weight_xdl_bf16)
+endif()
 
 add_custom_target(example_grouped_conv_bwd_weight_dl)
 
diff --git a/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc b/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc
index 789181237..d46b37476 100644
--- a/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc
+++ b/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc
@@ -18,7 +18,9 @@ bool run_grouped_conv_bwd_weight(const ExecutionConfig& config,
     // Set split_k = 2 for xdl op, split_k = 1 for dl
     // Dl op doesn't support split_k > 1
     // TODO: Add Dl op split_k > 1 support
-    if(!(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx1030"))
+    if(!(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx1030" ||
+         ck::get_device_name() == "gfx1100" || ck::get_device_name() == "gfx1101" ||
+         ck::get_device_name() == "gfx1102"))
     {
         split_k = 2;
     }
diff --git a/example/21_gemm_layernorm/CMakeLists.txt b/example/21_gemm_layernorm/CMakeLists.txt
index 2eb7052e1..7f974221b 100644
--- a/example/21_gemm_layernorm/CMakeLists.txt
+++ b/example/21_gemm_layernorm/CMakeLists.txt
@@ -1,4 +1,6 @@
-add_example_executable(example_gemm_bias_relu_add_layernorm_xdl_welford_fp16 gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp)
-add_example_executable(example_gemm_bias_relu_add_layernorm_xdl_naive_fp16 gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp)
-add_example_executable(example_gemm_layernorm_xdl_naive_fp16 gemm_layernorm_xdl_naive_fp16.cpp)
-add_example_executable(example_gemm_xdl_layernorm_naive_single_kernel_fp16 gemm_xdl_layernorm_naive_single_kernel_fp16.cpp)
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+   add_example_executable(example_gemm_bias_relu_add_layernorm_xdl_welford_fp16 gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp)
+   add_example_executable(example_gemm_bias_relu_add_layernorm_xdl_naive_fp16 gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp)
+   add_example_executable(example_gemm_layernorm_xdl_naive_fp16 gemm_layernorm_xdl_naive_fp16.cpp)
+   add_example_executable(example_gemm_xdl_layernorm_naive_single_kernel_fp16 gemm_xdl_layernorm_naive_single_kernel_fp16.cpp)
+endif()
diff --git a/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt b/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
index 4b0ea4f15..047299685 100644
--- a/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
+++ b/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
@@ -1,25 +1,23 @@
-add_custom_target(example_grouped_conv_fwd_multiple_d)
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+   add_custom_target(example_grouped_conv_fwd_multiple_d)
 
-add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_fp16 grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp)
-add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_fp32 grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp)
-add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_bf16 grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp)
-add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_int8 grouped_conv_fwd_bias_relu_add_xdl_int8.cpp)
+   add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_fp16 grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp)
+   add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_fp32 grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp)
+   add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_bf16 grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp)
+   add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_int8 grouped_conv_fwd_bias_relu_add_xdl_int8.cpp)
 
-add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_fp16)
-add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_fp32)
-add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_bf16)
-add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int8)
-
-if(USE_BITINT_EXTENSION_INT4)
-  add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_int4 grouped_conv_fwd_bias_relu_add_xdl_int4.cpp)
-
-  add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int4)
-endif() # USE_BITINT_EXTENSION_INT4
+   add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_fp16)
+   add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_fp32)
+   add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_bf16)
+   add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int8)
 
+   if(USE_BITINT_EXTENSION_INT4)
+      add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_int4 grouped_conv_fwd_bias_relu_add_xdl_int4.cpp)
+      add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int4)
+   endif() # USE_BITINT_EXTENSION_INT4
+   add_example_executable(example_grouped_conv_fwd_xdl_fp16 grouped_conv_fwd_xdl_fp16.cpp)
+   add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_xdl_fp16)
+endif()
 if(GPU_TARGETS MATCHES "gfx1100" OR GPU_TARGETS MATCHES "gfx1101" OR GPU_TARGETS MATCHES "gfx1102")
   add_example_executable(example_grouped_conv_fwd_bias_relu_add_wmma_fp16 grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp)
 endif()
-
-add_example_executable(example_grouped_conv_fwd_xdl_fp16 grouped_conv_fwd_xdl_fp16.cpp)
-
-add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_xdl_fp16)
diff --git a/example/31_batched_gemm_gemm/CMakeLists.txt b/example/31_batched_gemm_gemm/CMakeLists.txt
index ad40c96b4..dd9aef94a 100644
--- a/example/31_batched_gemm_gemm/CMakeLists.txt
+++ b/example/31_batched_gemm_gemm/CMakeLists.txt
@@ -1,10 +1,12 @@
-add_example_executable(example_batched_gemm_gemm_xdl_fp32 batched_gemm_gemm_xdl_fp32.cpp)
-add_example_executable(example_batched_gemm_gemm_xdl_fp16 batched_gemm_gemm_xdl_fp16.cpp)
-add_example_executable(example_batched_gemm_gemm_xdl_bf16 batched_gemm_gemm_xdl_bf16.cpp)
-if(NOT GPU_TARGETS MATCHES "gfx940")
-	add_example_executable(example_batched_gemm_gemm_xdl_int8 batched_gemm_gemm_xdl_int8.cpp)
-endif()
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+   add_example_executable(example_batched_gemm_gemm_xdl_fp32 batched_gemm_gemm_xdl_fp32.cpp)
+   add_example_executable(example_batched_gemm_gemm_xdl_fp16 batched_gemm_gemm_xdl_fp16.cpp)
+   add_example_executable(example_batched_gemm_gemm_xdl_bf16 batched_gemm_gemm_xdl_bf16.cpp)
+   if(NOT GPU_TARGETS MATCHES "gfx940")
+      add_example_executable(example_batched_gemm_gemm_xdl_int8 batched_gemm_gemm_xdl_int8.cpp)
+   endif()
 
-if(USE_BITINT_EXTENSION_INT4)
-add_example_executable(example_batched_gemm_gemm_xdl_int4 batched_gemm_gemm_xdl_int4.cpp)
-endif(USE_BITINT_EXTENSION_INT4)
+   if(USE_BITINT_EXTENSION_INT4)
+      add_example_executable(example_batched_gemm_gemm_xdl_int4 batched_gemm_gemm_xdl_int4.cpp)
+   endif(USE_BITINT_EXTENSION_INT4)
+endif()
\ No newline at end of file
diff --git a/example/35_splitK_gemm/CMakeLists.txt b/example/35_splitK_gemm/CMakeLists.txt
index 794583954..f5a6ccb24 100644
--- a/example/35_splitK_gemm/CMakeLists.txt
+++ b/example/35_splitK_gemm/CMakeLists.txt
@@ -1,17 +1,18 @@
-add_custom_target(example_splitK_gemm_xdl)
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+   add_custom_target(example_splitK_gemm_xdl)
+   add_example_executable(example_splitK_gemm_xdl_fp32 splitK_gemm_xdl_fp32.cpp)
+   add_example_executable(example_splitK_gemm_xdl_fp16 splitK_gemm_xdl_fp16.cpp)
+   add_example_executable(example_splitK_gemm_xdl_bfp16 splitK_gemm_xdl_bfp16.cpp)
+   add_example_executable(example_splitK_gemm_xdl_int8 splitK_gemm_xdl_int8.cpp)
 
-add_example_executable(example_splitK_gemm_xdl_fp32 splitK_gemm_xdl_fp32.cpp)
-add_example_executable(example_splitK_gemm_xdl_fp16 splitK_gemm_xdl_fp16.cpp)
-add_example_executable(example_splitK_gemm_xdl_bfp16 splitK_gemm_xdl_bfp16.cpp)
-add_example_executable(example_splitK_gemm_xdl_int8 splitK_gemm_xdl_int8.cpp)
-
-add_dependencies(example_splitK_gemm_xdl
+   add_dependencies(example_splitK_gemm_xdl
                  example_splitK_gemm_xdl_fp32
                  example_splitK_gemm_xdl_fp16
                  example_splitK_gemm_xdl_bfp16
                  example_splitK_gemm_xdl_int8)
 
-if(USE_BITINT_EXTENSION_INT4)
-  add_example_executable(example_splitK_gemm_xdl_int4 splitK_gemm_xdl_int4.cpp)
-  add_dependencies(example_splitK_gemm_xdl example_splitK_gemm_xdl_int4)
+   if(USE_BITINT_EXTENSION_INT4)
+      add_example_executable(example_splitK_gemm_xdl_int4 splitK_gemm_xdl_int4.cpp)
+      add_dependencies(example_splitK_gemm_xdl example_splitK_gemm_xdl_int4)
+   endif()
 endif()
diff --git a/example/38_grouped_conv_bwd_data_multiple_d/CMakeLists.txt b/example/38_grouped_conv_bwd_data_multiple_d/CMakeLists.txt
index 9cf960c50..472d59c77 100644
--- a/example/38_grouped_conv_bwd_data_multiple_d/CMakeLists.txt
+++ b/example/38_grouped_conv_bwd_data_multiple_d/CMakeLists.txt
@@ -1,7 +1,8 @@
-add_custom_target(example_grouped_conv_bwd_data)
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+   add_custom_target(example_grouped_conv_bwd_data)
+   add_example_executable(example_grouped_conv_bwd_data_fp16 grouped_conv_bwd_data_fp16.cpp)
+   add_example_executable(example_grouped_conv_bwd_data_bias_relu_fp16 grouped_conv_bwd_data_bias_relu_fp16.cpp)
 
-add_example_executable(example_grouped_conv_bwd_data_fp16 grouped_conv_bwd_data_fp16.cpp)
-add_example_executable(example_grouped_conv_bwd_data_bias_relu_fp16 grouped_conv_bwd_data_bias_relu_fp16.cpp)
-
-add_dependencies(example_grouped_conv_bwd_data example_grouped_conv_bwd_data_fp16)
-add_dependencies(example_grouped_conv_bwd_data example_grouped_conv_bwd_data_bias_relu_fp16)
+   add_dependencies(example_grouped_conv_bwd_data example_grouped_conv_bwd_data_fp16)
+   add_dependencies(example_grouped_conv_bwd_data example_grouped_conv_bwd_data_bias_relu_fp16)
+endif()
\ No newline at end of file
diff --git a/example/40_conv2d_fwd_quantization/CMakeLists.txt b/example/40_conv2d_fwd_quantization/CMakeLists.txt
index 0a314cd74..c12ab7a34 100644
--- a/example/40_conv2d_fwd_quantization/CMakeLists.txt
+++ b/example/40_conv2d_fwd_quantization/CMakeLists.txt
@@ -1,21 +1,23 @@
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+   add_example_executable(example_conv2d_fwd_xdl_perlayer_quantization_int8 conv2d_fwd_xdl_perlayer_quantization_int8.cpp)
+   add_example_executable(example_conv2d_fwd_xdl_perchannel_quantization_int8 conv2d_fwd_xdl_perchannel_quantization_int8.cpp)
+   add_example_executable(example_conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8 conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp)
+   add_example_executable(example_conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8 conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp)
+endif()
 # Conv perlayer quantization
 add_example_executable(example_conv2d_fwd_dl_perlayer_quantization_int8 conv2d_fwd_dl_perlayer_quantization_int8.cpp)
-add_example_executable(example_conv2d_fwd_xdl_perlayer_quantization_int8 conv2d_fwd_xdl_perlayer_quantization_int8.cpp)
 
 # Conv perchannel quantization
 add_example_executable(example_conv2d_fwd_dl_perchannel_quantization_int8 conv2d_fwd_dl_perchannel_quantization_int8.cpp)
-add_example_executable(example_conv2d_fwd_xdl_perchannel_quantization_int8 conv2d_fwd_xdl_perchannel_quantization_int8.cpp)
 
 # Conv + bias + relu perlayer quantization
 add_example_executable(example_conv2d_fwd_dl_bias_relu_perlayer_quantization_int8 conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp)
-add_example_executable(example_conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8 conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp)
 
 # Conv + bias + relu perchannel quantization
 add_example_executable(example_conv2d_fwd_dl_bias_relu_perchannel_quantization_int8 conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp)
-add_example_executable(example_conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8 conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp)
 
 # Conv + bias + tanh perlayer quantization
 add_example_executable(example_conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8 conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp)
 
 # Conv + bias + tanh perchannel quantization
-add_example_executable(example_conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8 conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp)
+add_example_executable(example_conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8 conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp)
\ No newline at end of file
diff --git a/example/41_grouped_conv_conv_fwd/CMakeLists.txt b/example/41_grouped_conv_conv_fwd/CMakeLists.txt
index 4eb79371a..14f5c284a 100644
--- a/example/41_grouped_conv_conv_fwd/CMakeLists.txt
+++ b/example/41_grouped_conv_conv_fwd/CMakeLists.txt
@@ -1,9 +1,11 @@
-add_example_executable(example_grouped_conv_conv_fwd_xdl_fp32 grouped_conv_conv_fwd_xdl_fp32.cpp)
-add_example_executable(example_grouped_conv_conv_fwd_xdl_fp16 grouped_conv_conv_fwd_xdl_fp16.cpp)
-add_example_executable(example_grouped_conv_conv_fwd_xdl_bf16 grouped_conv_conv_fwd_xdl_bf16.cpp)
-if(NOT GPU_TARGETS MATCHES "gfx940")
-	add_example_executable(example_grouped_conv_conv_fwd_xdl_int8 grouped_conv_conv_fwd_xdl_int8.cpp)
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+   add_example_executable(example_grouped_conv_conv_fwd_xdl_fp32 grouped_conv_conv_fwd_xdl_fp32.cpp)
+   add_example_executable(example_grouped_conv_conv_fwd_xdl_fp16 grouped_conv_conv_fwd_xdl_fp16.cpp)
+   add_example_executable(example_grouped_conv_conv_fwd_xdl_bf16 grouped_conv_conv_fwd_xdl_bf16.cpp)
+   if(NOT GPU_TARGETS MATCHES "gfx940")
+      add_example_executable(example_grouped_conv_conv_fwd_xdl_int8 grouped_conv_conv_fwd_xdl_int8.cpp)
+   endif()
+   if(USE_BITINT_EXTENSION_INT4)
+      add_example_executable(example_grouped_conv_conv_fwd_xdl_int4 grouped_conv_conv_fwd_xdl_int4.cpp)
+   endif(USE_BITINT_EXTENSION_INT4)
 endif()
-if(USE_BITINT_EXTENSION_INT4)
-add_example_executable(example_grouped_conv_conv_fwd_xdl_int4 grouped_conv_conv_fwd_xdl_int4.cpp)
-endif(USE_BITINT_EXTENSION_INT4)
diff --git a/example/47_gemm_bias_softmax_gemm_permute/CMakeLists.txt b/example/47_gemm_bias_softmax_gemm_permute/CMakeLists.txt
index d1b3dd4be..b60789ea3 100644
--- a/example/47_gemm_bias_softmax_gemm_permute/CMakeLists.txt
+++ b/example/47_gemm_bias_softmax_gemm_permute/CMakeLists.txt
@@ -1 +1,3 @@
-add_example_executable(example_gemm_bias_softmax_gemm_permute gemm_bias_softmax_gemm_permute.cpp)
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+   add_example_executable(example_gemm_bias_softmax_gemm_permute gemm_bias_softmax_gemm_permute.cpp)
+endif()
diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
index 5880f5f60..618033335 100644
--- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
@@ -134,8 +134,9 @@ __global__ void
             const Block2CTileMap block_2_ctile_map,
             const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx1030__) || \
-    defined(__gfx90a__) || defined(__gfx908__) || defined(__gfx940__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx1030__) ||           \
+    defined(__gfx90a__) || defined(__gfx908__) || defined(__gfx940__) || defined(__gfx1100__) || \
+    defined(__gfx1101__) || defined(__gfx1102__))
     // offset base pointer for each work-group
     const index_t num_blocks_per_batch =
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
@@ -711,7 +712,8 @@ struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
         // check device
         if(!(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx1030" ||
              ck::get_device_name() == "gfx90a" || ck::get_device_name() == "gfx908" ||
-             ck::get_device_name() == "gfx940"))
+             ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx1100" ||
+             ck::get_device_name() == "gfx1101" || ck::get_device_name() == "gfx1102"))
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp
index c77772faa..80c864c83 100644
--- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp
@@ -106,7 +106,8 @@ __global__ void
             const Block2CTileMap block_2_ctile_map,
             const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx1030__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx1030__) || \
+    defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__))
     // offset base pointer for each work-group
     const index_t num_blocks_per_batch =
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
@@ -600,7 +601,9 @@ struct DeviceGroupedConvFwdDl_NHWC_KYXC_NHWK : public DeviceGroupedConvFwd<NDimS
         namespace ctc = tensor_layout::convolution;
 
         // check device
-        if(!(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx1030"))
+        if(!(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx1030" ||
+             ck::get_device_name() == "gfx1100" || ck::get_device_name() == "gfx1101" ||
+             ck::get_device_name() == "gfx1102"))
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp
index 2a2edc29b..aff25aa7c 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp
@@ -1393,7 +1393,9 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Dl
     static bool IsSupportedArgument(const Argument& arg)
     {
         // check device
-        if(!(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx1030"))
+        if(!(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx1030" ||
+             ck::get_device_name() == "gfx1100" || ck::get_device_name() == "gfx1101" ||
+             ck::get_device_name() == "gfx1102"))
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp
index af1989fc4..36366a763 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp
@@ -485,7 +485,9 @@ struct DeviceGemmDl : public DeviceGemm<ALayout,
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx1030")
+        if(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx1030" ||
+           ck::get_device_name() == "gfx1100" || ck::get_device_name() == "gfx1101" ||
+           ck::get_device_name() == "gfx1102")
         {
             return GridwiseGemm::CheckValidity(
                 arg.a_grid_desc_k0_m_k1_, arg.b_grid_desc_k0_n_k1_, arg.c_grid_desc_m_n_);
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp
index 4397b6f99..c91328ff7 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp
@@ -50,8 +50,9 @@ __global__ void
             const CGridDesc_M0_M10_M11_N0_N10_N11 e_grid_desc_m0_m10_m11_n0_n10_n11,
             const Block2CTileMap block_2_ctile_map)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \
-    defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx1030__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) ||             \
+    defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx1030__) || defined(__gfx1100__) || \
+    defined(__gfx1101__) || defined(__gfx1102__))
 
     constexpr index_t shared_block_size =
         GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(ABDataType);
@@ -553,7 +554,8 @@ struct DeviceGemmMultipleD_Dl : public DeviceGemmMultipleD<ALayout,
     {
         if(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx908" ||
            ck::get_device_name() == "gfx90a" || ck::get_device_name() == "gfx1030" ||
-           ck::get_device_name() == "gfx940")
+           ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx1100" ||
+           ck::get_device_name() == "gfx1101" || ck::get_device_name() == "gfx1102")
         {
             return GridwiseGemm::CheckValidity(
                 arg.a_grid_desc_k0_m_k1_, arg.b_grid_desc_k0_n_k1_, arg.e_grid_desc_m_n_);
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_dl.hpp
index 9529744f3..0473eaf76 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_dl.hpp
@@ -1027,7 +1027,9 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Dl
     static bool IsSupportedArgument(const Argument& arg)
     {
         // check device
-        if(!(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx1030"))
+        if(!(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx1030" ||
+             ck::get_device_name() == "gfx1100" || ck::get_device_name() == "gfx1101" ||
+             ck::get_device_name() == "gfx1102"))
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp
index d424f2992..e88bf8ed7 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp
@@ -39,8 +39,9 @@ __global__ void
                                           const BElementwiseOperation b_element_op,
                                           const CDEElementwiseOperation cde_element_op)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \
-    defined(__gfx90a__) || defined(__gfx1030__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) ||              \
+    defined(__gfx90a__) || defined(__gfx1030__) || defined(__gfx1100__) || defined(__gfx1101__) || \
+    defined(__gfx1102__) || defined(__gfx940__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     const index_t block_id = get_block_1d_id();
diff --git a/test/batched_gemm/CMakeLists.txt b/test/batched_gemm/CMakeLists.txt
index 0574f98e8..5552a6c98 100644
--- a/test/batched_gemm/CMakeLists.txt
+++ b/test/batched_gemm/CMakeLists.txt
@@ -1,15 +1,17 @@
-add_test_executable(test_batched_gemm_fp16 batched_gemm_fp16.cpp)
-target_link_libraries(test_batched_gemm_fp16 PRIVATE utility)
-target_link_libraries(test_batched_gemm_fp16 PRIVATE device_batched_gemm_instance)
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+   add_test_executable(test_batched_gemm_fp16 batched_gemm_fp16.cpp)
+   target_link_libraries(test_batched_gemm_fp16 PRIVATE utility)
+   target_link_libraries(test_batched_gemm_fp16 PRIVATE device_batched_gemm_instance)
 
-add_test_executable(test_batched_gemm_fp32 batched_gemm_fp32.cpp)
-target_link_libraries(test_batched_gemm_fp32 PRIVATE utility)
-target_link_libraries(test_batched_gemm_fp32 PRIVATE device_batched_gemm_instance)
+   add_test_executable(test_batched_gemm_fp32 batched_gemm_fp32.cpp)
+   target_link_libraries(test_batched_gemm_fp32 PRIVATE utility)
+   target_link_libraries(test_batched_gemm_fp32 PRIVATE device_batched_gemm_instance)
 
-add_test_executable(test_batched_gemm_bf16 batched_gemm_bf16.cpp)
-target_link_libraries(test_batched_gemm_bf16 PRIVATE utility)
-target_link_libraries(test_batched_gemm_bf16 PRIVATE device_batched_gemm_instance)
+   add_test_executable(test_batched_gemm_bf16 batched_gemm_bf16.cpp)
+   target_link_libraries(test_batched_gemm_bf16 PRIVATE utility)
+   target_link_libraries(test_batched_gemm_bf16 PRIVATE device_batched_gemm_instance)
 
-add_test_executable(test_batched_gemm_int8 batched_gemm_int8.cpp)
-target_link_libraries(test_batched_gemm_int8 PRIVATE utility)
-target_link_libraries(test_batched_gemm_int8 PRIVATE device_batched_gemm_instance)
+   add_test_executable(test_batched_gemm_int8 batched_gemm_int8.cpp)
+   target_link_libraries(test_batched_gemm_int8 PRIVATE utility)
+   target_link_libraries(test_batched_gemm_int8 PRIVATE device_batched_gemm_instance)
+endif()
\ No newline at end of file
diff --git a/test/batched_gemm_gemm/CMakeLists.txt b/test/batched_gemm_gemm/CMakeLists.txt
index 386809717..eff339175 100644
--- a/test/batched_gemm_gemm/CMakeLists.txt
+++ b/test/batched_gemm_gemm/CMakeLists.txt
@@ -1,5 +1,7 @@
-add_custom_target(test_batched_gemm_gemm)
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+   add_custom_target(test_batched_gemm_gemm)
 
-add_gtest_executable(test_batched_gemm_gemm_fp16 test_batched_gemm_gemm_fp16.cpp)
-target_link_libraries(test_batched_gemm_gemm_fp16 PRIVATE utility device_batched_gemm_gemm_instance)
-add_dependencies(test_batched_gemm_gemm test_batched_gemm_gemm_fp16)
\ No newline at end of file
+   add_gtest_executable(test_batched_gemm_gemm_fp16 test_batched_gemm_gemm_fp16.cpp)
+   target_link_libraries(test_batched_gemm_gemm_fp16 PRIVATE utility device_batched_gemm_gemm_instance)
+   add_dependencies(test_batched_gemm_gemm test_batched_gemm_gemm_fp16)
+endif()
\ No newline at end of file
diff --git a/test/batched_gemm_reduce/CMakeLists.txt b/test/batched_gemm_reduce/CMakeLists.txt
index 4dc0b0825..1a8d7112a 100644
--- a/test/batched_gemm_reduce/CMakeLists.txt
+++ b/test/batched_gemm_reduce/CMakeLists.txt
@@ -1,3 +1,5 @@
-add_test_executable(test_batched_gemm_reduce_fp16 batched_gemm_reduce_fp16.cpp)
-target_link_libraries(test_batched_gemm_reduce_fp16 PRIVATE utility)
-target_link_libraries(test_batched_gemm_reduce_fp16 PRIVATE device_batched_gemm_reduce_instance)
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+   add_test_executable(test_batched_gemm_reduce_fp16 batched_gemm_reduce_fp16.cpp)
+   target_link_libraries(test_batched_gemm_reduce_fp16 PRIVATE utility)
+   target_link_libraries(test_batched_gemm_reduce_fp16 PRIVATE device_batched_gemm_reduce_instance)
+endif()
diff --git a/test/batched_gemm_softmax_gemm/CMakeLists.txt b/test/batched_gemm_softmax_gemm/CMakeLists.txt
index 1ceecefb5..dee0bb56a 100644
--- a/test/batched_gemm_softmax_gemm/CMakeLists.txt
+++ b/test/batched_gemm_softmax_gemm/CMakeLists.txt
@@ -1,5 +1,7 @@
-add_custom_target(test_batched_gemm_softmax_gemm)
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+   add_custom_target(test_batched_gemm_softmax_gemm)
 
-add_gtest_executable(test_batched_gemm_softmax_gemm_fp16 test_batched_gemm_softmax_gemm_fp16.cpp)
-target_link_libraries(test_batched_gemm_softmax_gemm_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_instance)
-add_dependencies(test_batched_gemm_softmax_gemm test_batched_gemm_softmax_gemm_fp16)
\ No newline at end of file
+   add_gtest_executable(test_batched_gemm_softmax_gemm_fp16 test_batched_gemm_softmax_gemm_fp16.cpp)
+   target_link_libraries(test_batched_gemm_softmax_gemm_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_instance)
+   add_dependencies(test_batched_gemm_softmax_gemm test_batched_gemm_softmax_gemm_fp16)
+endif()
\ No newline at end of file
diff --git a/test/batched_gemm_softmax_gemm_permute/CMakeLists.txt b/test/batched_gemm_softmax_gemm_permute/CMakeLists.txt
index 79af2b0d3..cb68af06c 100644
--- a/test/batched_gemm_softmax_gemm_permute/CMakeLists.txt
+++ b/test/batched_gemm_softmax_gemm_permute/CMakeLists.txt
@@ -1,15 +1,17 @@
-add_custom_target(test_batched_gemm_softmax_gemm_permute)
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+   add_custom_target(test_batched_gemm_softmax_gemm_permute)
 
-add_gtest_executable(test_batched_gemm_softmax_gemm_permute_fp16 test_batched_gemm_softmax_gemm_permute_fp16.cpp)
-add_gtest_executable(test_batched_gemm_softmax_gemm_permute_bf16 test_batched_gemm_softmax_gemm_permute_bf16.cpp)
-target_link_libraries(test_batched_gemm_softmax_gemm_permute_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
-target_link_libraries(test_batched_gemm_softmax_gemm_permute_bf16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
-add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_fp16)
-add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_bf16)
+   add_gtest_executable(test_batched_gemm_softmax_gemm_permute_fp16 test_batched_gemm_softmax_gemm_permute_fp16.cpp)
+   add_gtest_executable(test_batched_gemm_softmax_gemm_permute_bf16 test_batched_gemm_softmax_gemm_permute_bf16.cpp)
+   target_link_libraries(test_batched_gemm_softmax_gemm_permute_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
+   target_link_libraries(test_batched_gemm_softmax_gemm_permute_bf16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
+   add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_fp16)
+   add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_bf16)
 
-add_gtest_executable(test_batched_gemm_bias_softmax_gemm_permute_fp16 test_batched_gemm_bias_softmax_gemm_permute_fp16.cpp)
-add_gtest_executable(test_batched_gemm_bias_softmax_gemm_permute_bf16 test_batched_gemm_bias_softmax_gemm_permute_bf16.cpp)
-target_link_libraries(test_batched_gemm_bias_softmax_gemm_permute_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
-target_link_libraries(test_batched_gemm_bias_softmax_gemm_permute_bf16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
-add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_bias_softmax_gemm_permute_fp16)
-add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_bias_softmax_gemm_permute_bf16)
\ No newline at end of file
+   add_gtest_executable(test_batched_gemm_bias_softmax_gemm_permute_fp16 test_batched_gemm_bias_softmax_gemm_permute_fp16.cpp)
+   add_gtest_executable(test_batched_gemm_bias_softmax_gemm_permute_bf16 test_batched_gemm_bias_softmax_gemm_permute_bf16.cpp)
+   target_link_libraries(test_batched_gemm_bias_softmax_gemm_permute_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
+   target_link_libraries(test_batched_gemm_bias_softmax_gemm_permute_bf16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
+   add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_bias_softmax_gemm_permute_fp16)
+   add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_bias_softmax_gemm_permute_bf16)
+endif()
\ No newline at end of file
diff --git a/test/contraction/CMakeLists.txt b/test/contraction/CMakeLists.txt
index 481717000..ec44151f5 100644
--- a/test/contraction/CMakeLists.txt
+++ b/test/contraction/CMakeLists.txt
@@ -1,4 +1,6 @@
 add_gtest_executable(test_contraction test_contraction.cpp)
-add_gtest_executable(test_contraction_interface test_contraction_interface.cpp)
 target_link_libraries(test_contraction PRIVATE utility device_contraction_bilinear_instance device_contraction_scale_instance)
-target_link_libraries(test_contraction_interface PRIVATE utility device_contraction_bilinear_instance device_contraction_scale_instance)
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+    add_gtest_executable(test_contraction_interface test_contraction_interface.cpp)
+    target_link_libraries(test_contraction_interface PRIVATE utility device_contraction_bilinear_instance device_contraction_scale_instance)
+endif()
diff --git a/test/convnd_bwd_data/CMakeLists.txt b/test/convnd_bwd_data/CMakeLists.txt
index 16ca4de87..4bfd21945 100644
--- a/test/convnd_bwd_data/CMakeLists.txt
+++ b/test/convnd_bwd_data/CMakeLists.txt
@@ -1,2 +1,4 @@
-add_gtest_executable(test_convnd_bwd_data convnd_bwd_data.cpp)
-target_link_libraries(test_convnd_bwd_data PRIVATE utility device_conv1d_bwd_data_instance device_conv2d_bwd_data_instance device_conv3d_bwd_data_instance)
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+   add_gtest_executable(test_convnd_bwd_data convnd_bwd_data.cpp)
+   target_link_libraries(test_convnd_bwd_data PRIVATE utility device_conv1d_bwd_data_instance device_conv2d_bwd_data_instance device_conv3d_bwd_data_instance)
+endif()
\ No newline at end of file
diff --git a/test/convnd_fwd/CMakeLists.txt b/test/convnd_fwd/CMakeLists.txt
index 97e170d85..058f99026 100644
--- a/test/convnd_fwd/CMakeLists.txt
+++ b/test/convnd_fwd/CMakeLists.txt
@@ -1,2 +1,4 @@
-add_gtest_executable(test_convnd_fwd convnd_fwd.cpp)
-target_link_libraries(test_convnd_fwd PRIVATE utility device_conv2d_fwd_instance)
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+   add_gtest_executable(test_convnd_fwd convnd_fwd.cpp)
+   target_link_libraries(test_convnd_fwd PRIVATE utility device_conv2d_fwd_instance)
+endif()
diff --git a/test/gemm_layernorm/CMakeLists.txt b/test/gemm_layernorm/CMakeLists.txt
index c4feb5c56..b2a5178ac 100644
--- a/test/gemm_layernorm/CMakeLists.txt
+++ b/test/gemm_layernorm/CMakeLists.txt
@@ -1,7 +1,6 @@
-add_custom_target(test_gemm_layernorm)
-
-add_gtest_executable(test_gemm_add_relu_add_layernorm_fp16 test_gemm_add_relu_add_layernorm_fp16.cpp)
-
-target_link_libraries(test_gemm_add_relu_add_layernorm_fp16 PRIVATE utility device_gemm_add_relu_add_layernorm_instance)
-
-add_dependencies(test_gemm_layernorm test_gemm_add_relu_add_layernorm_fp16)
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+   add_custom_target(test_gemm_layernorm)
+   add_gtest_executable(test_gemm_add_relu_add_layernorm_fp16 test_gemm_add_relu_add_layernorm_fp16.cpp)
+   target_link_libraries(test_gemm_add_relu_add_layernorm_fp16 PRIVATE utility device_gemm_add_relu_add_layernorm_instance)
+   add_dependencies(test_gemm_layernorm test_gemm_add_relu_add_layernorm_fp16)
+endif()
diff --git a/test/gemm_split_k/CMakeLists.txt b/test/gemm_split_k/CMakeLists.txt
index 793091e53..09bbf7938 100644
--- a/test/gemm_split_k/CMakeLists.txt
+++ b/test/gemm_split_k/CMakeLists.txt
@@ -1,3 +1,5 @@
-add_test_executable(test_gemm_split_k gemm_split_k.cpp)
-target_link_libraries(test_gemm_split_k PRIVATE utility)
-target_link_libraries(test_gemm_split_k PRIVATE device_gemm_splitk_instance)
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+   add_test_executable(test_gemm_split_k gemm_split_k.cpp)
+   target_link_libraries(test_gemm_split_k PRIVATE utility)
+   target_link_libraries(test_gemm_split_k PRIVATE device_gemm_splitk_instance)
+endif()
diff --git a/test/grouped_convnd_bwd_weight/CMakeLists.txt b/test/grouped_convnd_bwd_weight/CMakeLists.txt
index e2f0790c8..da554f677 100644
--- a/test/grouped_convnd_bwd_weight/CMakeLists.txt
+++ b/test/grouped_convnd_bwd_weight/CMakeLists.txt
@@ -1,2 +1,4 @@
-add_gtest_executable(test_grouped_convnd_bwd_weight grouped_convnd_bwd_weight.cpp) 
-target_link_libraries(test_grouped_convnd_bwd_weight PRIVATE utility device_grouped_conv1d_bwd_weight_instance device_grouped_conv2d_bwd_weight_instance device_grouped_conv3d_bwd_weight_instance)
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+   add_gtest_executable(test_grouped_convnd_bwd_weight grouped_convnd_bwd_weight.cpp)
+    target_link_libraries(test_grouped_convnd_bwd_weight PRIVATE utility device_grouped_conv1d_bwd_weight_instance device_grouped_conv2d_bwd_weight_instance device_grouped_conv3d_bwd_weight_instance)
+endif()
\ No newline at end of file
diff --git a/test/grouped_gemm/CMakeLists.txt b/test/grouped_gemm/CMakeLists.txt
index 31a78733d..a7619eac6 100644
--- a/test/grouped_gemm/CMakeLists.txt
+++ b/test/grouped_gemm/CMakeLists.txt
@@ -1,3 +1,5 @@
-add_test_executable(test_grouped_gemm_fp16 grouped_gemm_fp16.cpp)
-target_link_libraries(test_grouped_gemm_fp16 PRIVATE utility)
-target_link_libraries(test_grouped_gemm_fp16 PRIVATE device_grouped_gemm_instance)
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+   add_test_executable(test_grouped_gemm_fp16 grouped_gemm_fp16.cpp)
+   target_link_libraries(test_grouped_gemm_fp16 PRIVATE utility)
+   target_link_libraries(test_grouped_gemm_fp16 PRIVATE device_grouped_gemm_instance)
+endif()
-- 
GitLab


From 76ec0089fb254c221138ec8cb2962ba5056f7fa9 Mon Sep 17 00:00:00 2001
From: rocking <ChunYu.Lai@amd.com>
Date: Wed, 24 May 2023 22:05:04 +0800
Subject: [PATCH 41/71] Pool3d fwd (#697)

* Expand the base class of pool2d, prepare to share base class with pool3d

* Add pool3d device op

* Add pool3d f16 example

* Refactor the base class. implement generic pooling in the future

* clang format

* get original index in max pooling

* Add outputindex to base class

* Fix dimension

* Add pooling instance

* Use indexType instead

* Remove useless header

* Extract IndexDataType to template

* Extract pooling reference code

* clang format

* clang format

* Fix typo

* Add tensor stride

* Add missing header

* Add index stride and output stride

* Refine naming

* Add type to base class

* Rename file

* Use proper size

* Fix typo

* Refine naming

* Modify the argument into vector.

* Add max pool profiler

* Refine naming

* Support f32 pool

* Fix typo

* Add avg pool2d fwd in profiler

* clang format

* Rename AccDataType to ComputeDataType

* Fix init

* test pool

* Extract variable

* Add client example

* Check the pooling dim

* clang format

* Connect argv and arg_parser

* Add found check

* Remove useless header

* Refine naming

* Adjust the order of device_pool_fwd
---
 .../18_groupnorm/groupnorm_swish.cpp          |   7 +-
 client_example/19_pool_fwd/CMakeLists.txt     |   5 +
 client_example/19_pool_fwd/avg_pool3d_fwd.cpp | 199 ++++++++++
 client_example/19_pool_fwd/max_pool2d_fwd.cpp | 193 ++++++++++
 example/13_pool2d_fwd/pool2d_fwd_common.hpp   | 172 ++-------
 example/13_pool2d_fwd/pool2d_fwd_fp16.cpp     |   9 +-
 example/13_pool2d_fwd/pool2d_fwd_fp32.cpp     |   9 +-
 example/48_pool3d_fwd/CMakeLists.txt          |   2 +
 example/48_pool3d_fwd/pool3d_fwd_common.hpp   | 187 +++++++++
 example/48_pool3d_fwd/pool3d_fwd_fp16.cpp     |  83 ++++
 .../gpu/device/device_pool2d_fwd.hpp          |  40 --
 .../gpu/device/device_pool_fwd.hpp            |  44 +++
 .../impl/device_pool2d_fwd_nhwc_nhwc.hpp      | 111 +++---
 .../impl/device_pool3d_fwd_ndhwc_ndhwc.hpp    | 357 ++++++++++++++++++
 .../device/impl/device_reduce_threadwise.hpp  |   2 +
 .../grid/gridwise_2d_reduction_threadwise.hpp |  36 +-
 .../cpu/reference_pool_fwd.hpp                | 345 +++++++++++++++++
 .../gpu/pool2d_fwd.hpp                        | 111 ++++++
 .../gpu/pool3d_fwd.hpp                        | 111 ++++++
 .../device_reduce_instance_threadwise.hpp     |   1 +
 .../ck/library/utility/host_tensor.hpp        |   6 +
 .../gpu/pool_fwd/CMakeLists.txt               |  10 +
 ...evice_avg_pool2d_fwd_nhwc_f16_instance.cpp |  23 ++
 ...evice_avg_pool2d_fwd_nhwc_f32_instance.cpp |  23 ++
 ...vice_avg_pool3d_fwd_ndhwc_f16_instance.cpp |  23 ++
 ...vice_avg_pool3d_fwd_ndhwc_f32_instance.cpp |  23 ++
 ...evice_max_pool2d_fwd_nhwc_f16_instance.cpp |  30 ++
 ...evice_max_pool2d_fwd_nhwc_f32_instance.cpp |  30 ++
 ...vice_max_pool3d_fwd_ndhwc_f16_instance.cpp |  30 ++
 ...vice_max_pool3d_fwd_ndhwc_f32_instance.cpp |  30 ++
 .../gpu/pool_fwd/pool_fwd_instance_common.hpp |  55 +++
 .../profiler/profile_pool2d_fwd_impl.hpp      | 264 +++++++++++++
 .../profiler/profile_pool3d_fwd_impl.hpp      | 271 +++++++++++++
 profiler/src/CMakeLists.txt                   |   4 +
 profiler/src/profile_avg_pool2d_fwd.cpp       | 141 +++++++
 profiler/src/profile_groupnorm.cpp            |   2 +-
 profiler/src/profile_max_pool3d_fwd.cpp       | 168 +++++++++
 test/CMakeLists.txt                           |   1 +
 test/pool_fwd/CMakeLists.txt                  |  16 +
 test/pool_fwd/test_avg_pool2d_fwd.cpp         |  56 +++
 test/pool_fwd/test_avg_pool3d_fwd.cpp         |  56 +++
 test/pool_fwd/test_max_pool2d_fwd.cpp         |  75 ++++
 test/pool_fwd/test_max_pool3d_fwd.cpp         |  75 ++++
 test/pool_fwd/test_pool_fwd_common.hpp        |  31 ++
 44 files changed, 3226 insertions(+), 241 deletions(-)
 create mode 100644 client_example/19_pool_fwd/CMakeLists.txt
 create mode 100644 client_example/19_pool_fwd/avg_pool3d_fwd.cpp
 create mode 100644 client_example/19_pool_fwd/max_pool2d_fwd.cpp
 create mode 100644 example/48_pool3d_fwd/CMakeLists.txt
 create mode 100644 example/48_pool3d_fwd/pool3d_fwd_common.hpp
 create mode 100644 example/48_pool3d_fwd/pool3d_fwd_fp16.cpp
 delete mode 100644 include/ck/tensor_operation/gpu/device/device_pool2d_fwd.hpp
 create mode 100644 include/ck/tensor_operation/gpu/device/device_pool_fwd.hpp
 create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_pool3d_fwd_ndhwc_ndhwc.hpp
 create mode 100644 library/include/ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp
 create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/pool2d_fwd.hpp
 create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp
 create mode 100644 library/src/tensor_operation_instance/gpu/pool_fwd/CMakeLists.txt
 create mode 100644 library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f16_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f32_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool3d_fwd_ndhwc_f16_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool3d_fwd_ndhwc_f32_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool2d_fwd_nhwc_f16_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool2d_fwd_nhwc_f32_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool3d_fwd_ndhwc_f16_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool3d_fwd_ndhwc_f32_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/pool_fwd/pool_fwd_instance_common.hpp
 create mode 100644 profiler/include/profiler/profile_pool2d_fwd_impl.hpp
 create mode 100644 profiler/include/profiler/profile_pool3d_fwd_impl.hpp
 create mode 100644 profiler/src/profile_avg_pool2d_fwd.cpp
 create mode 100644 profiler/src/profile_max_pool3d_fwd.cpp
 create mode 100644 test/pool_fwd/CMakeLists.txt
 create mode 100644 test/pool_fwd/test_avg_pool2d_fwd.cpp
 create mode 100644 test/pool_fwd/test_avg_pool3d_fwd.cpp
 create mode 100644 test/pool_fwd/test_max_pool2d_fwd.cpp
 create mode 100644 test/pool_fwd/test_max_pool3d_fwd.cpp
 create mode 100644 test/pool_fwd/test_pool_fwd_common.hpp

diff --git a/client_example/18_groupnorm/groupnorm_swish.cpp b/client_example/18_groupnorm/groupnorm_swish.cpp
index a79630c23..84f62ceac 100644
--- a/client_example/18_groupnorm/groupnorm_swish.cpp
+++ b/client_example/18_groupnorm/groupnorm_swish.cpp
@@ -131,11 +131,12 @@ int main(int argc, char* argv[])
         }
     }
 
-    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
-              << best_op_name << std::endl;
-
     // run the best intance
+    if(found)
     {
+        std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
+                  << best_op_name << std::endl;
+
         auto& op_ptr = op_ptrs[best_op_id];
         std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
                   << std::endl;
diff --git a/client_example/19_pool_fwd/CMakeLists.txt b/client_example/19_pool_fwd/CMakeLists.txt
new file mode 100644
index 000000000..13f9f73c8
--- /dev/null
+++ b/client_example/19_pool_fwd/CMakeLists.txt
@@ -0,0 +1,5 @@
+add_executable(client_max_pool2d_fwd max_pool2d_fwd.cpp)
+target_link_libraries(client_max_pool2d_fwd PRIVATE composable_kernel::device_operations)
+
+add_executable(client_avg_pool3d_fwd avg_pool3d_fwd.cpp)
+target_link_libraries(client_avg_pool3d_fwd PRIVATE composable_kernel::device_operations)
\ No newline at end of file
diff --git a/client_example/19_pool_fwd/avg_pool3d_fwd.cpp b/client_example/19_pool_fwd/avg_pool3d_fwd.cpp
new file mode 100644
index 000000000..47bd7738f
--- /dev/null
+++ b/client_example/19_pool_fwd/avg_pool3d_fwd.cpp
@@ -0,0 +1,199 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iomanip>
+#include <vector>
+#include <iostream>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_pool_fwd.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp"
+
+using InDataType    = ck::half_t;
+using OutDataType   = ck::half_t;
+using IndexDataType = int32_t;
+
+constexpr ck::index_t InOutRank  = 5;
+constexpr ck::index_t WindowRank = 3;
+#if 0
+constexpr auto ReduceOpId  = ck::ReduceTensorOp::MAX;
+constexpr bool OutputIndex = false;
+#else
+constexpr auto ReduceOpId  = ck::ReduceTensorOp::AVG;
+constexpr bool OutputIndex = false;
+#endif
+
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+
+    void* GetDeviceBuffer() { return p_mem_; }
+
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* p_mem_;
+};
+
+int main(int argc, char* argv[])
+{
+    ck::index_t N               = 2;
+    ck::index_t C               = 32;
+    ck::index_t Z               = 2;
+    ck::index_t Y               = 2;
+    ck::index_t X               = 2;
+    ck::index_t Di              = 30;
+    ck::index_t Hi              = 30;
+    ck::index_t Wi              = 30;
+    ck::index_t window_stride_d = 2;
+    ck::index_t window_stride_h = 2;
+    ck::index_t window_stride_w = 2;
+    ck::index_t in_left_pad_d   = 1;
+    ck::index_t in_left_pad_h   = 1;
+    ck::index_t in_left_pad_w   = 1;
+    ck::index_t in_right_pad_d  = 1;
+    ck::index_t in_right_pad_h  = 1;
+    ck::index_t in_right_pad_w  = 1;
+
+    ck::index_t Do = (Di + in_left_pad_d + in_right_pad_d - Z) / window_stride_d + 1;
+    ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Y) / window_stride_h + 1;
+    ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - X) / window_stride_w + 1;
+
+    // Pool API only supports the order of NCDHW
+    std::vector<ck::index_t> in_length              = {N, C, Di, Hi, Wi};
+    std::vector<ck::index_t> out_length             = {N, C, Do, Ho, Wo};
+    std::vector<ck::index_t> window_spatial_lengths = {Z, Y, X};
+    std::vector<ck::index_t> window_strides   = {window_stride_d, window_stride_h, window_stride_w};
+    std::vector<ck::index_t> input_left_pads  = {in_left_pad_d, in_left_pad_h, in_left_pad_w};
+    std::vector<ck::index_t> input_right_pads = {in_right_pad_d, in_right_pad_h, in_right_pad_w};
+
+    std::size_t in_tensor_size  = N * C * Di * Hi * Wi;
+    std::size_t out_tensor_size = N * C * Do * Ho * Wo;
+
+    // tensor layout = NDHWC
+    std::vector<ck::index_t> in_tensor_stride  = {Di * C * Hi * Wi, 1, C * Hi * Wi, Wi * C, C};
+    std::vector<ck::index_t> out_tensor_stride = {Do * C * Ho * Wo, 1, C * Ho * Wo, Wo * C, C};
+
+    SimpleDeviceMem in_device_buf(sizeof(InDataType) * in_tensor_size);
+    SimpleDeviceMem out_device_buf(sizeof(OutDataType) * out_tensor_size);
+    SimpleDeviceMem out_indices_device_buf(sizeof(IndexDataType) * out_tensor_size);
+
+    using DeviceOp = ck::tensor_operation::device::DevicePoolFwd<InOutRank,
+                                                                 WindowRank,
+                                                                 InDataType,
+                                                                 OutDataType,
+                                                                 IndexDataType,
+                                                                 ReduceOpId,
+                                                                 OutputIndex>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    std::string best_op_name;
+    bool found            = false;
+    int best_op_id        = -1;
+    float best_ave_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+
+    for(int i = 0; i < op_ptrs.size(); ++i)
+    {
+        auto& op_ptr      = op_ptrs[i];
+        auto argument_ptr = op_ptr->MakeArgumentPointer(
+            static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
+            static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+            static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()),
+            in_length,
+            window_spatial_lengths,
+            out_length,
+            in_tensor_stride,
+            out_tensor_stride,
+            out_tensor_stride,
+            window_strides,
+            input_left_pads,
+            input_right_pads,
+            {2, 3, 4});
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        std::string op_name = op_ptr->GetTypeString();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+            std::size_t num_bytes =
+                in_tensor_size * sizeof(InDataType) + out_tensor_size * sizeof(OutDataType);
+
+            if constexpr(OutputIndex)
+                num_bytes += out_tensor_size * sizeof(IndexDataType);
+
+            float gb_per_sec = num_bytes / 1.E6 / ave_time;
+
+            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
+                      << op_name << std::endl;
+
+            if(ave_time < best_ave_time)
+            {
+                found           = true;
+                best_op_id      = i;
+                best_op_name    = op_name;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+
+    // run the best instance
+    if(found)
+    {
+        std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
+                  << best_op_name << std::endl;
+
+        auto& op_ptr = op_ptrs[best_op_id];
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+
+        auto argument_ptr = op_ptr->MakeArgumentPointer(
+            static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
+            static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+            static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()),
+            in_length,
+            window_spatial_lengths,
+            out_length,
+            in_tensor_stride,
+            out_tensor_stride,
+            out_tensor_stride,
+            window_strides,
+            input_left_pads,
+            input_right_pads,
+            {2, 3, 4});
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+
+        std::cout << "Done" << std::endl;
+    }
+
+    return 0;
+}
diff --git a/client_example/19_pool_fwd/max_pool2d_fwd.cpp b/client_example/19_pool_fwd/max_pool2d_fwd.cpp
new file mode 100644
index 000000000..12ee61920
--- /dev/null
+++ b/client_example/19_pool_fwd/max_pool2d_fwd.cpp
@@ -0,0 +1,193 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iomanip>
+#include <vector>
+#include <iostream>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_pool_fwd.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/pool2d_fwd.hpp"
+
+using InDataType    = ck::half_t;
+using OutDataType   = ck::half_t;
+using IndexDataType = int32_t;
+
+constexpr ck::index_t InOutRank  = 4;
+constexpr ck::index_t WindowRank = 2;
+#if 1
+constexpr auto ReduceOpId  = ck::ReduceTensorOp::MAX;
+constexpr bool OutputIndex = true;
+#else
+constexpr auto ReduceOpId  = ck::ReduceTensorOp::AVG;
+constexpr bool OutputIndex = false;
+#endif
+
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+
+    void* GetDeviceBuffer() { return p_mem_; }
+
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* p_mem_;
+};
+
+int main(int argc, char* argv[])
+{
+    ck::index_t N               = 2;
+    ck::index_t C               = 32;
+    ck::index_t Y               = 2;
+    ck::index_t X               = 2;
+    ck::index_t Hi              = 30;
+    ck::index_t Wi              = 30;
+    ck::index_t window_stride_h = 2;
+    ck::index_t window_stride_w = 2;
+    ck::index_t in_left_pad_h   = 1;
+    ck::index_t in_left_pad_w   = 1;
+    ck::index_t in_right_pad_h  = 1;
+    ck::index_t in_right_pad_w  = 1;
+
+    ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Y) / window_stride_h + 1;
+    ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - X) / window_stride_w + 1;
+
+    // Pool API only supports the order of NCHW
+    std::vector<ck::index_t> in_length              = {N, C, Hi, Wi};
+    std::vector<ck::index_t> out_length             = {N, C, Ho, Wo};
+    std::vector<ck::index_t> window_spatial_lengths = {Y, X};
+    std::vector<ck::index_t> window_strides         = {window_stride_h, window_stride_w};
+    std::vector<ck::index_t> input_left_pads        = {in_left_pad_h, in_left_pad_w};
+    std::vector<ck::index_t> input_right_pads       = {in_right_pad_h, in_right_pad_w};
+
+    std::size_t in_tensor_size  = N * C * Hi * Wi;
+    std::size_t out_tensor_size = N * C * Ho * Wo;
+
+    // tensor layout = NHWC
+    std::vector<ck::index_t> in_tensor_stride  = {C * Hi * Wi, 1, Wi * C, C};
+    std::vector<ck::index_t> out_tensor_stride = {C * Ho * Wo, 1, Wo * C, C};
+
+    SimpleDeviceMem in_device_buf(sizeof(InDataType) * in_tensor_size);
+    SimpleDeviceMem out_device_buf(sizeof(OutDataType) * out_tensor_size);
+    SimpleDeviceMem out_indices_device_buf(sizeof(IndexDataType) * out_tensor_size);
+
+    using DeviceOp = ck::tensor_operation::device::DevicePoolFwd<InOutRank,
+                                                                 WindowRank,
+                                                                 InDataType,
+                                                                 OutDataType,
+                                                                 IndexDataType,
+                                                                 ReduceOpId,
+                                                                 OutputIndex>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    std::string best_op_name;
+    bool found            = false;
+    int best_op_id        = -1;
+    float best_ave_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+
+    for(int i = 0; i < op_ptrs.size(); ++i)
+    {
+        auto& op_ptr      = op_ptrs[i];
+        auto argument_ptr = op_ptr->MakeArgumentPointer(
+            static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
+            static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+            static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()),
+            in_length,
+            window_spatial_lengths,
+            out_length,
+            in_tensor_stride,
+            out_tensor_stride,
+            out_tensor_stride,
+            window_strides,
+            input_left_pads,
+            input_right_pads,
+            {2, 3});
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        std::string op_name = op_ptr->GetTypeString();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+            std::size_t num_bytes =
+                in_tensor_size * sizeof(InDataType) + out_tensor_size * sizeof(OutDataType);
+
+            if constexpr(OutputIndex)
+                num_bytes += out_tensor_size * sizeof(IndexDataType);
+
+            float gb_per_sec = num_bytes / 1.E6 / ave_time;
+
+            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
+                      << op_name << std::endl;
+
+            if(ave_time < best_ave_time)
+            {
+                found           = true;
+                best_op_id      = i;
+                best_op_name    = op_name;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+
+    // run the best instance
+    if(found)
+    {
+        std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
+                  << best_op_name << std::endl;
+
+        auto& op_ptr = op_ptrs[best_op_id];
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+
+        auto argument_ptr = op_ptr->MakeArgumentPointer(
+            static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
+            static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+            static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()),
+            in_length,
+            window_spatial_lengths,
+            out_length,
+            in_tensor_stride,
+            out_tensor_stride,
+            out_tensor_stride,
+            window_strides,
+            input_left_pads,
+            input_right_pads,
+            {2, 3});
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+
+        std::cout << "Done" << std::endl;
+    }
+
+    return 0;
+}
diff --git a/example/13_pool2d_fwd/pool2d_fwd_common.hpp b/example/13_pool2d_fwd/pool2d_fwd_common.hpp
index b83cb6a96..9abc98671 100644
--- a/example/13_pool2d_fwd/pool2d_fwd_common.hpp
+++ b/example/13_pool2d_fwd/pool2d_fwd_common.hpp
@@ -17,115 +17,11 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp"
 
 template <typename InDataType,
           typename OutDataType,
-          typename AccDataType,
-          typename IndexDataType,
-          ck::ReduceTensorOp ReduceOpId,
-          bool PropagateNan,
-          bool OutputIndex>
-static void pool_host_verify(const Tensor<InDataType>& in,
-                             Tensor<OutDataType>& out,
-                             Tensor<IndexDataType>& out_indices,
-                             const std::array<ck::index_t, 2>& window_spatial_lengths,
-                             const std::array<ck::index_t, 2>& window_strides,
-                             const std::array<ck::index_t, 2>& in_left_pads,
-                             const std::array<ck::index_t, 2>& /*in_right_pads*/)
-{
-    const int32_t reduceLength = window_spatial_lengths[0] * window_spatial_lengths[1];
-
-    using ReduceOperation = typename ck::reduce_binary_operator<ReduceOpId>::opType;
-
-    auto elementwise_ops =
-        ck::reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(reduceLength);
-
-    auto in_elementwise_op  = std::get<0>(elementwise_ops);
-    auto acc_elementwise_op = std::get<1>(elementwise_ops);
-
-    if constexpr(!OutputIndex)
-    {
-        using Accumulation =
-            ck::detail::AccumulateWithNanCheck<PropagateNan, ReduceOperation, AccDataType>;
-
-        auto f_nchw = [&](auto n, auto c, auto ho, auto wo) {
-            auto accuVal = ReduceOperation::template GetIdentityValue<AccDataType>();
-
-            for(ck::index_t y = 0; y < window_spatial_lengths[0]; ++y)
-            {
-                ck::index_t hi = ho * window_strides[0] + y - in_left_pads[0];
-                for(ck::index_t x = 0; x < window_spatial_lengths[1]; ++x)
-                {
-                    ck::index_t wi = wo * window_strides[1] + x - in_left_pads[1];
-                    if(hi >= 0 && hi < static_cast<ck::index_t>(in.mDesc.GetLengths()[2]) &&
-                       wi >= 0 && wi < static_cast<ck::index_t>(in.mDesc.GetLengths()[3]))
-                    {
-                        AccDataType currVal = static_cast<AccDataType>(in(n, c, hi, wi));
-
-                        in_elementwise_op(currVal, currVal);
-
-                        Accumulation::Calculate(accuVal, currVal);
-                    }
-                }
-            }
-
-            acc_elementwise_op(accuVal, accuVal);
-
-            out(n, c, ho, wo) = accuVal;
-        };
-
-        make_ParallelTensorFunctor(f_nchw,
-                                   out.mDesc.GetLengths()[0],
-                                   out.mDesc.GetLengths()[1],
-                                   out.mDesc.GetLengths()[2],
-                                   out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
-    }
-    else
-    {
-        using Accumulation = ck::detail::AccumulateWithIndexAndNanCheck<PropagateNan,
-                                                                        ReduceOperation,
-                                                                        AccDataType,
-                                                                        IndexDataType>;
-        auto f_nchw        = [&](auto n, auto c, auto ho, auto wo) {
-            auto accuVal            = ReduceOperation::template GetIdentityValue<AccDataType>();
-            IndexDataType accuIndex = 0;
-
-            for(ck::index_t y = 0; y < window_spatial_lengths[0]; ++y)
-            {
-                ck::index_t hi = ho * window_strides[0] + y - in_left_pads[0];
-                for(ck::index_t x = 0; x < window_spatial_lengths[1]; ++x)
-                {
-                    ck::index_t wi = wo * window_strides[1] + x - in_left_pads[1];
-                    if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
-                       wi < in.mDesc.GetLengths()[3])
-                    {
-                        AccDataType currVal     = static_cast<AccDataType>(in(n, c, hi, wi));
-                        IndexDataType currIndex = y * window_spatial_lengths[1] + x;
-
-                        in_elementwise_op(currVal, currVal);
-
-                        Accumulation::Calculate(accuVal, currVal, accuIndex, currIndex);
-                    }
-                }
-            }
-
-            acc_elementwise_op(accuVal, accuVal);
-
-            out(n, c, ho, wo)         = accuVal;
-            out_indices(n, c, ho, wo) = accuIndex;
-        };
-
-        make_ParallelTensorFunctor(f_nchw,
-                                   out.mDesc.GetLengths()[0],
-                                   out.mDesc.GetLengths()[1],
-                                   out.mDesc.GetLengths()[2],
-                                   out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
-    };
-}
-
-template <typename InDataType,
-          typename OutDataType,
-          typename AccDataType,
+          typename ComputeDataType,
           typename IndexDataType,
           typename InLayout,
           typename OutLayout,
@@ -150,9 +46,10 @@ bool pool_test(bool do_verification,
 {
     using DevicePoolFwdInstance =
         ck::tensor_operation::device::DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C<
-            InDataType,  // InDataType
-            OutDataType, // OutDataType
-            AccDataType, // AccDataType
+            InDataType,      // InDataType
+            OutDataType,     // OutDataType
+            IndexDataType,   // IndexDataType
+            ComputeDataType, // ComputeDataType
             ReduceOpId,
             OutputIndex,
             64, // BlockSize
@@ -165,10 +62,10 @@ bool pool_test(bool do_verification,
     const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Y) / window_stride_h + 1;
     const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - X) / window_stride_w + 1;
 
-    const std::array<ck::index_t, 2> window_spatial_lengths{{Y, X}};
-    const std::array<ck::index_t, 2> window_strides{{window_stride_h, window_stride_w}};
-    const std::array<ck::index_t, 2> input_left_pads{{in_left_pad_h, in_left_pad_w}};
-    const std::array<ck::index_t, 2> input_right_pads{{in_right_pad_h, in_right_pad_w}};
+    const std::vector<ck::index_t> window_spatial_lengths{Y, X};
+    const std::vector<ck::index_t> window_strides{window_stride_h, window_stride_w};
+    const std::vector<ck::index_t> input_left_pads{in_left_pad_h, in_left_pad_w};
+    const std::vector<ck::index_t> input_right_pads{in_right_pad_h, in_right_pad_w};
 
     // tensor layout
     auto f_host_tensor_descriptor =
@@ -219,14 +116,16 @@ bool pool_test(bool do_verification,
         static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
         static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
         static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()),
-        N,
-        C,
-        std::array<ck::index_t, 2>{{Hi, Wi}},
-        std::array<ck::index_t, 2>{{Y, X}},
-        std::array<ck::index_t, 2>{{Ho, Wo}},
+        {N, C, Hi, Wi},
+        {Y, X},
+        {N, C, Ho, Wo},
+        {C * Hi * Wi, 1, Wi * C, C},
+        {C * Ho * Wo, 1, Wo * C, C},
+        {C * Ho * Wo, 1, Wo * C, C},
         window_strides,
         input_left_pads,
-        input_right_pads);
+        input_right_pads,
+        {2, 3});
 
     if(!pool.IsSupportedArgument(argument_ptr.get()))
     {
@@ -252,19 +151,28 @@ bool pool_test(bool do_verification,
 
     if(do_verification)
     {
-        pool_host_verify<InDataType,
-                         OutDataType,
-                         AccDataType,
-                         IndexDataType,
-                         ReduceOpId,
-                         PropagateNan,
-                         OutputIndex>(in_n_c_hi_wi,
-                                      out_n_c_ho_wo_host,
-                                      out_indices_n_c_ho_wo_host,
-                                      window_spatial_lengths,
-                                      window_strides,
-                                      input_left_pads,
-                                      input_right_pads);
+        using ReferencePoolingFwdInstance =
+            ck::tensor_operation::host::ReferencePoolingFwd<4,
+                                                            2,
+                                                            InDataType,
+                                                            OutDataType,
+                                                            ComputeDataType,
+                                                            IndexDataType,
+                                                            ReduceOpId,
+                                                            PropagateNan,
+                                                            OutputIndex>;
+
+        auto ref_pooling          = ReferencePoolingFwdInstance{};
+        auto ref_pooling_invoker  = ref_pooling.MakeInvoker();
+        auto ref_pooling_argument = ref_pooling.MakeArgument(in_n_c_hi_wi,
+                                                             out_n_c_ho_wo_host,
+                                                             out_indices_n_c_ho_wo_host,
+                                                             window_spatial_lengths,
+                                                             window_strides,
+                                                             input_left_pads,
+                                                             input_right_pads);
+
+        ref_pooling_invoker.Run(ref_pooling_argument);
 
         out_device_buf.FromDevice(out_n_c_ho_wo_device.mData.data());
 
diff --git a/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp b/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp
index 659f3251d..20c3e4701 100644
--- a/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp
+++ b/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp
@@ -2,7 +2,6 @@
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
-#include <cstdlib>
 
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
@@ -10,9 +9,9 @@
 
 #include "pool2d_fwd_common.hpp"
 
-using InDataType  = ck::half_t;
-using OutDataType = ck::half_t;
-using AccDataType = float;
+using InDataType      = ck::half_t;
+using OutDataType     = ck::half_t;
+using ComputeDataType = float;
 
 using IndexDataType = int32_t;
 
@@ -91,7 +90,7 @@ int main(int argc, char* argv[])
 
     bool pass = pool_test<InDataType,
                           OutDataType,
-                          AccDataType,
+                          ComputeDataType,
                           IndexDataType,
                           InLayout,
                           OutLayout,
diff --git a/example/13_pool2d_fwd/pool2d_fwd_fp32.cpp b/example/13_pool2d_fwd/pool2d_fwd_fp32.cpp
index f47c7ff15..34ff6f435 100644
--- a/example/13_pool2d_fwd/pool2d_fwd_fp32.cpp
+++ b/example/13_pool2d_fwd/pool2d_fwd_fp32.cpp
@@ -2,7 +2,6 @@
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
-#include <cstdlib>
 
 #include "ck/ck.hpp"
 #include "ck/utility/reduction_enums.hpp"
@@ -10,9 +9,9 @@
 
 #include "pool2d_fwd_common.hpp"
 
-using InDataType  = float;
-using OutDataType = float;
-using AccDataType = float;
+using InDataType      = float;
+using OutDataType     = float;
+using ComputeDataType = float;
 
 using IndexDataType = int32_t;
 
@@ -91,7 +90,7 @@ int main(int argc, char* argv[])
 
     bool pass = pool_test<InDataType,
                           OutDataType,
-                          AccDataType,
+                          ComputeDataType,
                           IndexDataType,
                           InLayout,
                           OutLayout,
diff --git a/example/48_pool3d_fwd/CMakeLists.txt b/example/48_pool3d_fwd/CMakeLists.txt
new file mode 100644
index 000000000..5d58d6a0b
--- /dev/null
+++ b/example/48_pool3d_fwd/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_example_executable(example_pool3d_fwd_fp16 pool3d_fwd_fp16.cpp)
+
diff --git a/example/48_pool3d_fwd/pool3d_fwd_common.hpp b/example/48_pool3d_fwd/pool3d_fwd_common.hpp
new file mode 100644
index 000000000..5706deb6d
--- /dev/null
+++ b/example/48_pool3d_fwd/pool3d_fwd_common.hpp
@@ -0,0 +1,220 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include <iostream>
+#include <stdexcept>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/utility/reduction_enums.hpp"
+#include "ck/utility/reduction_functions_accumulate.hpp"
+#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_pool3d_fwd_ndhwc_ndhwc.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp"
+
+// Example driver for 3D forward pooling.
+//
+// Fills an input tensor with generated values, runs
+// DevicePool3dFwd_Input_N_Di_Hi_Wi_C_Output_N_Do_Ho_Wo_C on it and, when do_verification is
+// set, compares the device output (and the indices, if OutputIndex) with ReferencePoolingFwd.
+//
+// do_verification   run the host reference and check the device results against it
+// time_kernel       forwarded to StreamConfig for the kernel launch
+// N, C              lengths of dimensions 0 and 1 of the input and output tensors
+// Z, Y, X           pooling window lengths
+// Di, Hi, Wi        input spatial lengths
+// window_stride_*   window strides along d/h/w; must be positive
+// in_left_pad_*     left padding along d/h/w
+// in_right_pad_*    right padding along d/h/w
+//
+// Returns true if verification is disabled or all compared tensors match.
+// Throws std::runtime_error for a non-positive window stride or an unsupported argument.
+template <typename InDataType,
+          typename OutDataType,
+          typename ComputeDataType,
+          typename IndexDataType,
+          typename InLayout,
+          typename OutLayout,
+          ck::ReduceTensorOp ReduceOpId,
+          bool PropagateNan,
+          bool OutputIndex>
+bool pool3d_test(bool do_verification,
+                 bool time_kernel,
+                 ck::index_t N,
+                 ck::index_t C,
+                 ck::index_t Z,
+                 ck::index_t Y,
+                 ck::index_t X,
+                 ck::index_t Di,
+                 ck::index_t Hi,
+                 ck::index_t Wi,
+                 ck::index_t window_stride_d,
+                 ck::index_t window_stride_h,
+                 ck::index_t window_stride_w,
+                 ck::index_t in_left_pad_d,
+                 ck::index_t in_left_pad_h,
+                 ck::index_t in_left_pad_w,
+                 ck::index_t in_right_pad_d,
+                 ck::index_t in_right_pad_h,
+                 ck::index_t in_right_pad_w)
+{
+    using DevicePoolFwdInstance =
+        ck::tensor_operation::device::DevicePool3dFwd_Input_N_Di_Hi_Wi_C_Output_N_Do_Ho_Wo_C<
+            InDataType,      // InDataType
+            OutDataType,     // OutDataType
+            IndexDataType,   // IndexDataType
+            ComputeDataType, // ComputeDataType
+            ReduceOpId,
+            OutputIndex,
+            64, // BlockSize
+            64, // ReduceMThreadClusterSize
+            1,  // ReduceKThreadClusterSize
+            4,  // ReduceMThreadSliceSize
+            1,  // ReduceKThreadSliceSize
+            4>; // InSrcOutDstVectorSize
+
+    // Guard the divisions below: a non-positive stride cannot produce a valid output size.
+    if(window_stride_d <= 0 || window_stride_h <= 0 || window_stride_w <= 0)
+    {
+        throw std::runtime_error("wrong! window strides must be positive");
+    }
+
+    // Output lengths follow from the padded input length, the window length and the stride.
+    const ck::index_t Do = (Di + in_left_pad_d + in_right_pad_d - Z) / window_stride_d + 1;
+    const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Y) / window_stride_h + 1;
+    const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - X) / window_stride_w + 1;
+
+    const std::vector<ck::index_t> window_spatial_lengths{Z, Y, X};
+    const std::vector<ck::index_t> window_strides{
+        window_stride_d, window_stride_h, window_stride_w};
+    const std::vector<ck::index_t> input_left_pads{in_left_pad_d, in_left_pad_h, in_left_pad_w};
+    const std::vector<ck::index_t> input_right_pads{in_right_pad_d, in_right_pad_h, in_right_pad_w};
+
+    // lengths are always {N, C, D, H, W}; the layout tag only selects the strides
+    auto f_host_tensor_descriptor = [](std::size_t N_,
+                                       std::size_t C_,
+                                       std::size_t D,
+                                       std::size_t H,
+                                       std::size_t W,
+                                       auto layout) {
+        using namespace ck::literals;
+
+        if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCDHW>::value)
+        {
+            return HostTensorDescriptor({N_, C_, D, H, W},
+                                        {C_ * D * H * W, D * H * W, H * W, W, 1_uz});
+        }
+        else if constexpr(ck::is_same<decltype(layout),
+                                      ck::tensor_layout::convolution::NDHWC>::value)
+        {
+            return HostTensorDescriptor({N_, C_, D, H, W},
+                                        {D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_});
+        }
+        else
+        {
+            static_assert(sizeof(layout) == 0, "only NCDHW and NDHWC layouts are supported");
+        }
+    };
+
+    Tensor<InDataType> in_n_c_di_hi_wi(f_host_tensor_descriptor(N, C, Di, Hi, Wi, InLayout{}));
+    Tensor<OutDataType> out_n_c_do_ho_wo_host(
+        f_host_tensor_descriptor(N, C, Do, Ho, Wo, OutLayout{}));
+    Tensor<IndexDataType> out_indices_n_c_do_ho_wo_host(
+        f_host_tensor_descriptor(N, C, Do, Ho, Wo, OutLayout{}));
+    Tensor<OutDataType> out_n_c_do_ho_wo_device(
+        f_host_tensor_descriptor(N, C, Do, Ho, Wo, OutLayout{}));
+    Tensor<IndexDataType> out_indices_n_c_do_ho_wo_device(
+        f_host_tensor_descriptor(N, C, Do, Ho, Wo, OutLayout{}));
+
+    std::cout << "in_n_c_di_hi_wi: " << in_n_c_di_hi_wi.mDesc << std::endl;
+    std::cout << "out_n_c_do_ho_wo: " << out_n_c_do_ho_wo_host.mDesc << std::endl;
+
+    in_n_c_di_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{-1.0, 1.0});
+
+    DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_di_hi_wi.mDesc.GetElementSpaceSize());
+    DeviceMem out_device_buf(sizeof(OutDataType) *
+                             out_n_c_do_ho_wo_device.mDesc.GetElementSpaceSize());
+    DeviceMem out_indices_device_buf(sizeof(IndexDataType) *
+                                     out_indices_n_c_do_ho_wo_device.mDesc.GetElementSpaceSize());
+
+    in_device_buf.ToDevice(in_n_c_di_hi_wi.mData.data());
+
+    // Argument order: input lengths {N, C, D, H, W}, window lengths, output lengths, then the
+    // input / output / indices strides (spelled out here for NDHWC, independent of InLayout),
+    // window strides, left and right pads and finally the pooled dimensions.
+    auto pool         = DevicePoolFwdInstance{};
+    auto invoker_ptr  = pool.MakeInvokerPointer();
+    auto argument_ptr = pool.MakeArgumentPointer(
+        static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
+        static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+        static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()),
+        {N, C, Di, Hi, Wi},
+        {Z, Y, X},
+        {N, C, Do, Ho, Wo},
+        {Di * C * Hi * Wi, 1, C * Hi * Wi, Wi * C, C},
+        {Do * C * Ho * Wo, 1, C * Ho * Wo, Wo * C, C},
+        {Do * C * Ho * Wo, 1, C * Ho * Wo, Wo * C, C},
+        window_strides,
+        input_left_pads,
+        input_right_pads,
+        {2, 3, 4});
+
+    if(!pool.IsSupportedArgument(argument_ptr.get()))
+    {
+        throw std::runtime_error("wrong! device_op with the specified compilation parameters does "
+                                 "not support this problem");
+    }
+
+    float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+    std::cout << "Perf: " << ave_time << std::endl;
+
+    bool pass = true;
+
+    if(do_verification)
+    {
+        using ReferencePoolingFwdInstance =
+            ck::tensor_operation::host::ReferencePoolingFwd<5,
+                                                            3,
+                                                            InDataType,
+                                                            OutDataType,
+                                                            ComputeDataType,
+                                                            IndexDataType,
+                                                            ReduceOpId,
+                                                            PropagateNan,
+                                                            OutputIndex>;
+
+        auto ref_pooling          = ReferencePoolingFwdInstance{};
+        auto ref_pooling_invoker  = ref_pooling.MakeInvoker();
+        auto ref_pooling_argument = ref_pooling.MakeArgument(in_n_c_di_hi_wi,
+                                                             out_n_c_do_ho_wo_host,
+                                                             out_indices_n_c_do_ho_wo_host,
+                                                             window_spatial_lengths,
+                                                             window_strides,
+                                                             input_left_pads,
+                                                             input_right_pads);
+
+        ref_pooling_invoker.Run(ref_pooling_argument);
+
+        out_device_buf.FromDevice(out_n_c_do_ho_wo_device.mData.data());
+
+        pass = pass && ck::utils::check_err(out_n_c_do_ho_wo_device, out_n_c_do_ho_wo_host);
+
+        if constexpr(OutputIndex)
+        {
+            out_indices_device_buf.FromDevice(out_indices_n_c_do_ho_wo_device.mData.data());
+
+            pass = pass && ck::utils::check_err(out_indices_n_c_do_ho_wo_device,
+                                                out_indices_n_c_do_ho_wo_host);
+        }
+    }
+
+    return (pass);
+}
diff --git a/example/48_pool3d_fwd/pool3d_fwd_fp16.cpp b/example/48_pool3d_fwd/pool3d_fwd_fp16.cpp
new file mode 100644
index 000000000..4d3686bcb
--- /dev/null
+++ b/example/48_pool3d_fwd/pool3d_fwd_fp16.cpp
@@ -0,0 +1,83 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/utility/reduction_enums.hpp"
+
+#include "pool3d_fwd_common.hpp"
+
+using InDataType      = ck::half_t;
+using OutDataType     = ck::half_t;
+using ComputeDataType = float;
+
+using IndexDataType = int32_t;
+
+using InLayout  = ck::tensor_layout::convolution::NDHWC;
+using OutLayout = ck::tensor_layout::convolution::NDHWC;
+
+#if 1
+static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
+#else
+static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
+#endif
+
+static constexpr bool OutputIndex  = false;
+static constexpr bool PropagateNan = false;
+
+int main()
+{
+    const bool do_verification = true;
+    const bool time_kernel     = false;
+
+    // Problem size: N x C tensors, a Z x Y x X window over a Di x Hi x Wi input, strides, pads
+    const ck::index_t N               = 2;
+    const ck::index_t C               = 32;
+    const ck::index_t Z               = 2;
+    const ck::index_t Y               = 2;
+    const ck::index_t X               = 2;
+    const ck::index_t Di              = 30;
+    const ck::index_t Hi              = 30;
+    const ck::index_t Wi              = 30;
+    const ck::index_t window_stride_d = 2;
+    const ck::index_t window_stride_h = 2;
+    const ck::index_t window_stride_w = 2;
+    const ck::index_t in_left_pad_d   = 1;
+    const ck::index_t in_left_pad_h   = 1;
+    const ck::index_t in_left_pad_w   = 1;
+    const ck::index_t in_right_pad_d  = 1;
+    const ck::index_t in_right_pad_h  = 1;
+    const ck::index_t in_right_pad_w  = 1;
+
+    const bool ok = pool3d_test<InDataType,
+                                OutDataType,
+                                ComputeDataType,
+                                IndexDataType,
+                                InLayout,
+                                OutLayout,
+                                ReduceOpId,
+                                PropagateNan,
+                                OutputIndex>(do_verification,
+                                             time_kernel,
+                                             N,
+                                             C,
+                                             Z,
+                                             Y,
+                                             X,
+                                             Di,
+                                             Hi,
+                                             Wi,
+                                             window_stride_d,
+                                             window_stride_h,
+                                             window_stride_w,
+                                             in_left_pad_d,
+                                             in_left_pad_h,
+                                             in_left_pad_w,
+                                             in_right_pad_d,
+                                             in_right_pad_h,
+                                             in_right_pad_w);
+
+    return ok ? 0 : 1;
+}
diff --git a/include/ck/tensor_operation/gpu/device/device_pool2d_fwd.hpp b/include/ck/tensor_operation/gpu/device/device_pool2d_fwd.hpp
deleted file mode 100644
index 3b376c6f7..000000000
--- a/include/ck/tensor_operation/gpu/device/device_pool2d_fwd.hpp
+++ /dev/null
@@ -1,40 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <iostream>
-#include <array>
-
-#include "ck/tensor_operation/gpu/device/device_base.hpp"
-#include "ck/utility/reduction_enums.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-
-template <ck::ReduceTensorOp ReduceOpId>
-struct DevicePool2dFwd : public BaseOperator
-{
-    virtual std::unique_ptr<BaseArgument>
-    MakeArgumentPointer(const void* in_dev,
-                        void* out_dev,
-                        void* out_indices_dev,
-                        ck::index_t N,
-                        ck::index_t C,
-                        std::array<ck::index_t, 2> input_spatial_lengths,
-                        std::array<ck::index_t, 2> window_spatial_lengths,
-                        std::array<ck::index_t, 2> output_spatial_lengths,
-                        std::array<ck::index_t, 2> window_strides,
-                        std::array<ck::index_t, 2> input_left_pads,
-                        std::array<ck::index_t, 2> input_right_pads) = 0;
-
-    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
-};
-
-template <ck::ReduceTensorOp ReduceOpId>
-using DevicePool2dFwdPtr = std::unique_ptr<DevicePool2dFwd<ReduceOpId>>;
-
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/device/device_pool_fwd.hpp b/include/ck/tensor_operation/gpu/device/device_pool_fwd.hpp
new file mode 100644
index 000000000..e801e98a2
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/device/device_pool_fwd.hpp
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/utility/reduction_enums.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+// Common base class of the forward pooling device operations.
+//
+// The template parameters name the tensor rank (InOutRank), the pooling window rank
+// (WindowRank), the element types, the reduction (ReduceOpId) and whether indices are
+// produced (OutputIndex). The interface below does not use them; presumably they only make
+// each configuration a distinct base type -- TODO confirm.
+template <index_t InOutRank,
+          index_t WindowRank,
+          typename InDataType,
+          typename OutDataType,
+          typename IndexDataType,
+          ReduceTensorOp ReduceOpId,
+          bool OutputIndex>
+struct DevicePoolFwd : public BaseOperator
+{
+    // Creates the argument object describing one pooling problem.
+    //
+    // The three pointers are untyped; the examples added with this interface pass device
+    // buffers. The vectors describe input, window and output. The 2D implementation requires
+    // input_lengths to hold InOutRank entries and the window vectors WindowRank entries, and
+    // ignores the three stride vectors (it assumes NHWC). pooling_dims names the pooled
+    // dimensions: the 2D implementation accepts only {2, 3}; the 3D example passes {2, 3, 4}.
+    // NOTE(review): whether other implementations honor the strides is not visible here.
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_in_dev,
+                        void* p_out_dev,
+                        void* p_out_indices_dev,
+                        std::vector<ck::index_t> input_lengths,
+                        std::vector<ck::index_t> window_lengths,
+                        std::vector<ck::index_t> output_lengths,
+                        std::vector<ck::index_t> input_stride,
+                        std::vector<ck::index_t> output_stride,
+                        std::vector<ck::index_t> indices_stride,
+                        std::vector<ck::index_t> window_strides,
+                        std::vector<ck::index_t> input_left_pads,
+                        std::vector<ck::index_t> input_right_pads,
+                        std::vector<ck::index_t> pooling_dims) = 0;
+
+    // Creates the invoker that runs an argument built by MakeArgumentPointer.
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp b/include/ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp
index bfde40cda..6933db68d 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp
@@ -9,7 +9,7 @@
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
-#include "ck/tensor_operation/gpu/device/device_pool2d_fwd.hpp"
+#include "ck/tensor_operation/gpu/device/device_pool_fwd.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp"
 #include "ck/host_utility/device_prop.hpp"
 #include "ck/host_utility/kernel_launch.hpp"
@@ -20,16 +20,18 @@ namespace device {
 
 template <typename InDataType,
           typename OutDataType,
-          typename AccDataType,
+          typename IndexDataType, // enable if OutputIndex == true
+          typename ComputeDataType,
           ck::ReduceTensorOp ReduceOpId,
-          bool OuputIndex,
+          bool OutputIndex,
           ck::index_t BlockSize,
           ck::index_t ReduceMThreadClusterSize,
           ck::index_t ReduceKThreadClusterSize,
           ck::index_t ReduceMThreadSliceSize,
           ck::index_t ReduceKThreadSliceSize,
           ck::index_t InSrcOutDstVectorSize>
-struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd<ReduceOpId>
+struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C
+    : public DevicePoolFwd<4, 2, InDataType, OutDataType, IndexDataType, ReduceOpId, OutputIndex>
 {
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};
@@ -38,7 +40,8 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
     static constexpr auto I4 = Number<4>{};
     static constexpr auto I5 = Number<5>{};
 
-    using IndexDataType = int32_t;
+    static constexpr index_t InOutRank  = 4;
+    static constexpr index_t WindowRank = 2;
 
     using ReduceOperation = typename reduce_binary_operator<ReduceOpId>::opType;
 
@@ -59,12 +62,12 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
 
     static auto MakeABGridDescriptor_A_M_K_B_M(ck::index_t N,
                                                ck::index_t C,
-                                               std::array<ck::index_t, 2> input_spatial_lengths,
-                                               std::array<ck::index_t, 2> window_spatial_lengths,
-                                               std::array<ck::index_t, 2> output_spatial_lengths,
-                                               std::array<ck::index_t, 2> window_strides,
-                                               std::array<ck::index_t, 2> input_left_pads,
-                                               std::array<ck::index_t, 2> input_right_pads)
+                                               std::vector<ck::index_t> input_spatial_lengths,
+                                               std::vector<ck::index_t> window_spatial_lengths,
+                                               std::vector<ck::index_t> output_spatial_lengths,
+                                               std::vector<ck::index_t> window_strides,
+                                               std::vector<ck::index_t> input_left_pads,
+                                               std::vector<ck::index_t> input_right_pads)
     {
         const index_t Hi = input_spatial_lengths[0];
         const index_t Wi = input_spatial_lengths[1];
@@ -141,9 +144,7 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
         return make_tuple(in_grid_desc_reducem_reducek, out_grid_desc_reducem);
     }
 
-    using ABGridDescs = decltype(
-        MakeABGridDescriptor_A_M_K_B_M(1, 1, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}));
-
+    using ABGridDescs   = decltype(MakeABGridDescriptor_A_M_K_B_M(1, 1, {}, {}, {}, {}, {}, {}));
     using AGridDesc_M_K = remove_cvref_t<decltype(ABGridDescs{}[I0])>;
     using BGridDesc_M   = remove_cvref_t<decltype(ABGridDescs{}[I1])>;
 
@@ -152,15 +153,15 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
     {
         Argument(const InDataType* p_in_dev,
                  OutDataType* p_out_dev,
-                 int* p_out_indices_dev,
+                 IndexDataType* p_out_indices_dev,
                  ck::index_t N,
                  ck::index_t C,
-                 std::array<ck::index_t, 2>& input_spatial_lengths,
-                 std::array<ck::index_t, 2>& window_spatial_lengths,
-                 std::array<ck::index_t, 2>& output_spatial_lengths,
-                 std::array<ck::index_t, 2>& window_strides,
-                 std::array<ck::index_t, 2>& input_left_pads,
-                 std::array<ck::index_t, 2>& input_right_pads)
+                 std::vector<ck::index_t>& input_spatial_lengths,
+                 std::vector<ck::index_t>& window_spatial_lengths,
+                 std::vector<ck::index_t>& output_spatial_lengths,
+                 std::vector<ck::index_t>& window_strides,
+                 std::vector<ck::index_t>& input_left_pads,
+                 std::vector<ck::index_t>& input_right_pads)
             : p_in_dev_{p_in_dev},
               p_out_dev_{p_out_dev},
               p_out_indices_dev_{p_out_indices_dev},
@@ -190,7 +191,7 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
 
         const InDataType* p_in_dev_;
         OutDataType* p_out_dev_;
-        int* p_out_indices_dev_;
+        IndexDataType* p_out_indices_dev_;
         AGridDesc_M_K a_grid_desc_m_k_;
         BGridDesc_M b_grid_desc_m_;
         InElementwiseOperation in_element_op_;
@@ -208,7 +209,7 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
             using gridwise_reduce =
                 GridwiseReduction_mk_to_m_threadwise<InDataType,
                                                      OutDataType,
-                                                     AccDataType,
+                                                     ComputeDataType,
                                                      IndexDataType,
                                                      AGridDesc_M_K,
                                                      BGridDesc_M,
@@ -224,17 +225,19 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
                                                      InSrcOutDstVectorSize,
                                                      InSrcOutDstVectorSize>;
 
-            const auto kernel = kernel_reduce_threadwise<gridwise_reduce,
-                                                         OuputIndex,
-                                                         false, // don't have index input
-                                                         InDataType,
-                                                         OutDataType,
-                                                         AccDataType,
-                                                         IndexDataType,
-                                                         AGridDesc_M_K,
-                                                         BGridDesc_M,
-                                                         InElementwiseOperation,
-                                                         AccElementwiseOperation>;
+            const auto kernel =
+                kernel_reduce_threadwise<gridwise_reduce,
+                                         OutputIndex,
+                                         true,  // pooling need to return global index
+                                         false, // don't have index input
+                                         InDataType,
+                                         OutDataType,
+                                         ComputeDataType,
+                                         IndexDataType,
+                                         AGridDesc_M_K,
+                                         BGridDesc_M,
+                                         InElementwiseOperation,
+                                         AccElementwiseOperation>;
 
             ck::index_t ReduceM = arg.a_grid_desc_m_k_.GetLength(I0);
 
@@ -280,22 +283,42 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
     MakeArgumentPointer(const void* p_in_dev,
                         void* p_out_dev,
                         void* p_out_indices_dev,
-                        ck::index_t N,
-                        ck::index_t C,
-                        std::array<ck::index_t, 2> input_spatial_lengths,
-                        std::array<ck::index_t, 2> window_spatial_lengths,
-                        std::array<ck::index_t, 2> output_spatial_lengths,
-                        std::array<ck::index_t, 2> window_strides,
-                        std::array<ck::index_t, 2> input_left_pads,
-                        std::array<ck::index_t, 2> input_right_pads) override
+                        std::vector<ck::index_t> input_lengths,
+                        std::vector<ck::index_t> window_lengths,
+                        std::vector<ck::index_t> output_lengths,
+                        std::vector<ck::index_t>, // Suppose tensor layout = NHWC
+                        std::vector<ck::index_t>, // Suppose tensor layout = NHWC
+                        std::vector<ck::index_t>, // Suppose tensor layout = NHWC
+                        std::vector<ck::index_t> window_strides,
+                        std::vector<ck::index_t> input_left_pads,
+                        std::vector<ck::index_t> input_right_pads,
+                        std::vector<ck::index_t> pooling_dims) override
     {
+        // output_lengths[2] and [3] are read below, so its rank is validated here as well.
+        if(input_lengths.size() != InOutRank || output_lengths.size() != InOutRank ||
+           window_lengths.size() != WindowRank || window_strides.size() != WindowRank ||
+           input_left_pads.size() != WindowRank || input_right_pads.size() != WindowRank)
+            throw std::runtime_error("dimension is incorrect");
+        if(pooling_dims != std::vector<ck::index_t>{2, 3})
+            throw std::runtime_error("pooling_dims only support {2, 3} in pool2d so far");
+
+        index_t N  = input_lengths[0];
+        index_t C  = input_lengths[1];
+        index_t Hi = input_lengths[2];
+        index_t Wi = input_lengths[3];
+        index_t Ho = output_lengths[2];
+        index_t Wo = output_lengths[3];
+
+        std::vector<ck::index_t> input_spatial_lengths  = {Hi, Wi};
+        std::vector<ck::index_t> output_spatial_lengths = {Ho, Wo};
+
         return std::make_unique<Argument>(static_cast<const InDataType*>(p_in_dev),
                                           static_cast<OutDataType*>(p_out_dev),
-                                          static_cast<int*>(p_out_indices_dev),
+                                          static_cast<IndexDataType*>(p_out_indices_dev),
                                           N,
                                           C,
                                           input_spatial_lengths,
-                                          window_spatial_lengths,
+                                          window_lengths,
                                           output_spatial_lengths,
                                           window_strides,
                                           input_left_pads,
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_pool3d_fwd_ndhwc_ndhwc.hpp b/include/ck/tensor_operation/gpu/device/impl/device_pool3d_fwd_ndhwc_ndhwc.hpp
new file mode 100644
index 000000000..d330fda8c
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/device/impl/device_pool3d_fwd_ndhwc_ndhwc.hpp
@@ -0,0 +1,357 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
+#include "ck/tensor_operation/gpu/device/device_pool_fwd.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+template <typename InDataType,
+          typename OutDataType,
+          typename IndexDataType, // enable if OutputIndex == true
+          typename ComputeDataType,
+          ck::ReduceTensorOp ReduceOpId,
+          bool OutputIndex,
+          ck::index_t BlockSize,
+          ck::index_t MThreadClusterSize,
+          ck::index_t KThreadClusterSize,
+          ck::index_t MThreadSliceSize,
+          ck::index_t KThreadSliceSize,
+          ck::index_t InSrcOutDstVectorSize>
+struct DevicePool3dFwd_Input_N_Di_Hi_Wi_C_Output_N_Do_Ho_Wo_C
+    : public DevicePoolFwd<5, 3, InDataType, OutDataType, IndexDataType, ReduceOpId, OutputIndex>
+{
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+    static constexpr auto I2 = Number<2>{};
+    static constexpr auto I3 = Number<3>{};
+    static constexpr auto I4 = Number<4>{};
+    static constexpr auto I5 = Number<5>{};
+
+    static constexpr index_t InOutRank  = 5;
+    static constexpr index_t WindowRank = 3;
+
+    using ReduceOperation = typename reduce_binary_operator<ReduceOpId>::opType;
+
+    using InElementwiseOperation =
+        typename reduce_unary_operator<ReduceOpId, true, true>::InElementwiseOperation;
+
+    using AccElementwiseOperation =
+        typename reduce_unary_operator<ReduceOpId, true, true>::AccElementwiseOperation;
+
+    // for NDHWC, the dim C is the vector Dim for both input and output in memory, which is not
+    // reduced.
+    static constexpr index_t InSrcOutDstVectorDim = 0;
+
+    static constexpr ck::index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
+    static constexpr ck::index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
+
+    // Builds the 2D input view A[M, K] (M = N*Do*Ho*Wo*C kept, K = Z*Y*X reduced) and the 1D
+    // output view B[M]; the spatial vectors are indexed in (depth, height, width) order.
+    static auto MakeABGridDescriptor_A_M_K_B_M(ck::index_t N,
+                                               ck::index_t C,
+                                               std::vector<ck::index_t> input_spatial_lengths,
+                                               std::vector<ck::index_t> window_spatial_lengths,
+                                               std::vector<ck::index_t> output_spatial_lengths,
+                                               std::vector<ck::index_t> window_strides,
+                                               std::vector<ck::index_t> input_left_pads,
+                                               std::vector<ck::index_t> input_right_pads)
+    {
+        const index_t Di = input_spatial_lengths[0];
+        const index_t Hi = input_spatial_lengths[1];
+        const index_t Wi = input_spatial_lengths[2];
+
+        const index_t Do = output_spatial_lengths[0];
+        const index_t Ho = output_spatial_lengths[1];
+        const index_t Wo = output_spatial_lengths[2];
+
+        const index_t Z = window_spatial_lengths[0];
+        const index_t Y = window_spatial_lengths[1];
+        const index_t X = window_spatial_lengths[2];
+
+        const index_t ConvStrideD = window_strides[0];
+        const index_t ConvStrideH = window_strides[1];
+        const index_t ConvStrideW = window_strides[2];
+
+        const index_t InLeftPadD = input_left_pads[0];
+        const index_t InLeftPadH = input_left_pads[1];
+        const index_t InLeftPadW = input_left_pads[2];
+
+        const index_t InRightPadD = input_right_pads[0];
+        const index_t InRightPadH = input_right_pads[1];
+        const index_t InRightPadW = input_right_pads[2];
+        // M and K are rounded up to whole block tiles; MPad/KPad are the extra elements.
+        const index_t MRaw = N * Do * Ho * Wo * C;
+        const index_t MPad = math::integer_least_multiple(MRaw, M_BlockTileSize) - MRaw;
+        const index_t KRaw = Z * Y * X;
+        const index_t KPad = math::integer_least_multiple(KRaw, K_BlockTileSize) - KRaw;
+
+        // A[ReduceM, ReduceK]
+        const auto in_grid_desc_n_di_hi_wi_c =
+            make_naive_tensor_descriptor_packed(make_tuple(N, Di, Hi, Wi, C));
+        // Pad D/H/W with the left/right pads; N and C pass through unchanged.
+        const auto in_grid_desc_n_dip_hip_wip_c = transform_tensor_descriptor(
+            in_grid_desc_n_di_hi_wi_c,
+            make_tuple(make_pass_through_transform(N),
+                       make_pad_transform(Di, InLeftPadD, InRightPadD),
+                       make_pad_transform(Hi, InLeftPadH, InRightPadH),
+                       make_pad_transform(Wi, InLeftPadW, InRightPadW),
+                       make_pass_through_transform(C)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
+        // Embed each padded spatial dim as (window, output) with strides (1, window stride).
+        const auto in_grid_desc_n_z_do_y_ho_x_wo_c = transform_tensor_descriptor(
+            in_grid_desc_n_dip_hip_wip_c,
+            make_tuple(make_pass_through_transform(N),
+                       make_embed_transform(make_tuple(Z, Do), make_tuple(I1, ConvStrideD)),
+                       make_embed_transform(make_tuple(Y, Ho), make_tuple(I1, ConvStrideH)),
+                       make_embed_transform(make_tuple(X, Wo), make_tuple(I1, ConvStrideW)),
+                       make_pass_through_transform(C)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
+            make_tuple(Sequence<0>{},
+                       Sequence<1, 2>{},
+                       Sequence<3, 4>{},
+                       Sequence<5, 6>{},
+                       Sequence<7>{}));
+        // Merge the kept dims (N, Do, Ho, Wo, C) into M and the window dims (Z, Y, X) into K.
+        const auto in_grid_desc_reducemraw_reducekraw = transform_tensor_descriptor(
+            in_grid_desc_n_z_do_y_ho_x_wo_c,
+            make_tuple(make_merge_transform(make_tuple(N, Do, Ho, Wo, C)),
+                       make_merge_transform(make_tuple(Z, Y, X))),
+            make_tuple(Sequence<0, 2, 4, 6, 7>{}, Sequence<1, 3, 5>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
+        // Right-pad M and K by MPad/KPad.
+        const auto in_grid_desc_reducem_reducek = transform_tensor_descriptor(
+            in_grid_desc_reducemraw_reducekraw,
+            make_tuple(make_right_pad_transform(MRaw, MPad), make_right_pad_transform(KRaw, KPad)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+        // B[ReduceM]
+        const auto out_grid_desc_reducemraw =
+            make_naive_tensor_descriptor_packed(make_tuple(N * Do * Ho * Wo * C));
+        const auto out_grid_desc_reducem =
+            transform_tensor_descriptor(out_grid_desc_reducemraw,
+                                        make_tuple(make_right_pad_transform(MRaw, MPad)),
+                                        make_tuple(Sequence<0>{}),
+                                        make_tuple(Sequence<0>{}));
+
+        return make_tuple(in_grid_desc_reducem_reducek, out_grid_desc_reducem);
+    }
+
+    using ABGridDescs   = decltype(MakeABGridDescriptor_A_M_K_B_M(1, 1, {}, {}, {}, {}, {}, {}));
+    using AGridDesc_M_K = remove_cvref_t<decltype(ABGridDescs{}[I0])>;
+    using BGridDesc_M   = remove_cvref_t<decltype(ABGridDescs{}[I1])>;
+
+    // Kernel arguments: device pointers plus descriptors/operators derived from the geometry.
+    struct Argument : public BaseArgument
+    {
+        Argument(const InDataType* p_in_dev,
+                 OutDataType* p_out_dev,
+                 IndexDataType* p_out_indices_dev,
+                 ck::index_t N,
+                 ck::index_t C,
+                 const std::vector<ck::index_t>& input_spatial_lengths,
+                 const std::vector<ck::index_t>& window_spatial_lengths,
+                 const std::vector<ck::index_t>& output_spatial_lengths,
+                 const std::vector<ck::index_t>& window_strides,
+                 const std::vector<ck::index_t>& input_left_pads,
+                 const std::vector<ck::index_t>& input_right_pads)
+            : p_in_dev_{p_in_dev},
+              p_out_dev_{p_out_dev},
+              p_out_indices_dev_{p_out_indices_dev},
+              a_grid_desc_m_k_{},
+              b_grid_desc_m_{}
+        {
+            const auto descs = MakeABGridDescriptor_A_M_K_B_M(N,
+                                                              C,
+                                                              input_spatial_lengths,
+                                                              window_spatial_lengths,
+                                                              output_spatial_lengths,
+                                                              window_strides,
+                                                              input_left_pads,
+                                                              input_right_pads);
+
+            a_grid_desc_m_k_ = descs[I0];
+            b_grid_desc_m_   = descs[I1];
+
+            invariant_lowest_length_ = C;
+            // Number of elements in one pooling window (Z * Y * X).
+            const int32_t reduceLength =
+                window_spatial_lengths[0] * window_spatial_lengths[1] * window_spatial_lengths[2];
+            std::tie(in_element_op_, acc_element_op_) =
+                reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(reduceLength);
+        }
+
+        const InDataType* p_in_dev_;
+        OutDataType* p_out_dev_;
+        IndexDataType* p_out_indices_dev_;
+        AGridDesc_M_K a_grid_desc_m_k_;
+        BGridDesc_M b_grid_desc_m_;
+        InElementwiseOperation in_element_op_;
+        AccElementwiseOperation acc_element_op_;
+
+        // for checking vector load/store
+        ck::index_t invariant_lowest_length_;
+    };
+
+    // Launches the threadwise reduction kernel that implements the pooling.
+    struct Invoker : public BaseInvoker
+    {
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            using gridwise_reduce =
+                GridwiseReduction_mk_to_m_threadwise<InDataType,
+                                                     OutDataType,
+                                                     ComputeDataType,
+                                                     IndexDataType,
+                                                     AGridDesc_M_K,
+                                                     BGridDesc_M,
+                                                     ReduceOperation,
+                                                     InElementwiseOperation,
+                                                     AccElementwiseOperation,
+                                                     InMemoryDataOperationEnum::Set,
+                                                     false, // propagate_nan
+                                                     BlockSize,
+                                                     MThreadSliceSize,
+                                                     KThreadSliceSize,
+                                                     InSrcOutDstVectorDim,
+                                                     InSrcOutDstVectorSize,
+                                                     InSrcOutDstVectorSize>;
+
+            const auto kernel =
+                kernel_reduce_threadwise<gridwise_reduce,
+                                         OutputIndex,
+                                         true,  // pooling needs to return the global index
+                                         false, // there is no index input
+                                         InDataType,
+                                         OutDataType,
+                                         ComputeDataType,
+                                         IndexDataType,
+                                         AGridDesc_M_K,
+                                         BGridDesc_M,
+                                         InElementwiseOperation,
+                                         AccElementwiseOperation>;
+            // M comes from the right-padded input descriptor; one block is launched per M tile.
+            ck::index_t M = arg.a_grid_desc_m_k_.GetLength(I0);
+            const index_t grid_size = (M / M_BlockTileSize);
+
+            return launch_and_time_kernel(stream_config,
+                                          kernel,
+                                          dim3(grid_size),
+                                          dim3(BlockSize),
+                                          0,
+                                          arg.a_grid_desc_m_k_,
+                                          arg.b_grid_desc_m_,
+                                          arg.in_element_op_,
+                                          arg.acc_element_op_,
+                                          float(1), // alpha
+                                          arg.p_in_dev_,
+                                          nullptr, // no index input buffer
+                                          float(0), // beta
+                                          arg.p_out_dev_,
+                                          arg.p_out_indices_dev_);
+        }
+
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        }
+    };
+
+    // Supported only when p_arg is this operator's Argument and C % InSrcOutDstVectorSize == 0.
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
+
+        // dynamic_cast yields nullptr for a foreign argument type; reject it rather than
+        // dereferencing a null pointer below.
+        if(pArg == nullptr)
+            return false;
+        return pArg->invariant_lowest_length_ % InSrcOutDstVectorSize == 0;
+    }
+
+    // Validates ranks (incl. output_lengths, indexed below) and builds the kernel Argument.
+    std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_in_dev,
+                        void* p_out_dev,
+                        void* p_out_indices_dev,
+                        std::vector<ck::index_t> input_lengths,
+                        std::vector<ck::index_t> window_lengths,
+                        std::vector<ck::index_t> output_lengths,
+                        std::vector<ck::index_t>, // Suppose tensor layout = NDHWC
+                        std::vector<ck::index_t>, // Suppose tensor layout = NDHWC
+                        std::vector<ck::index_t>, // Suppose tensor layout = NDHWC
+                        std::vector<ck::index_t> window_strides,
+                        std::vector<ck::index_t> input_left_pads,
+                        std::vector<ck::index_t> input_right_pads,
+                        std::vector<ck::index_t> pooling_dims) override
+    {
+        if(input_lengths.size() != InOutRank || window_lengths.size() != WindowRank ||
+           output_lengths.size() != InOutRank || window_strides.size() != WindowRank ||
+           input_left_pads.size() != WindowRank || input_right_pads.size() != WindowRank)
+            throw std::runtime_error("dimension is incorrect");
+
+        if(pooling_dims != std::vector<ck::index_t>{2, 3, 4})
+            throw std::runtime_error("pooling_dims only support {2, 3, 4} in pool3d so far");
+        // Lengths are indexed in (N, C, D, H, W) order.
+        index_t N  = input_lengths[0];
+        index_t C  = input_lengths[1];
+        index_t Di = input_lengths[2];
+        index_t Hi = input_lengths[3];
+        index_t Wi = input_lengths[4];
+        index_t Do = output_lengths[2];
+        index_t Ho = output_lengths[3];
+        index_t Wo = output_lengths[4];
+        std::vector<ck::index_t> input_spatial_lengths  = {Di, Hi, Wi};
+        std::vector<ck::index_t> output_spatial_lengths = {Do, Ho, Wo};
+
+        return std::make_unique<Argument>(static_cast<const InDataType*>(p_in_dev),
+                                          static_cast<OutDataType*>(p_out_dev),
+                                          static_cast<IndexDataType*>(p_out_indices_dev),
+                                          N,
+                                          C,
+                                          input_spatial_lengths,
+                                          window_lengths,
+                                          output_spatial_lengths,
+                                          window_strides,
+                                          input_left_pads,
+                                          input_right_pads);
+    }
+
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        // clang-format off
+        str << "DevicePool3dFwd_Input_N_Di_Hi_Wi_C_Output_N_Do_Ho_Wo_C<" << BlockSize << ",";
+        str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
+        str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
+        str <<"InSrcOutDstVectorSize_" << InSrcOutDstVectorSize << ">";
+        // clang-format on
+
+        return str.str();
+    }
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp b/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp
index a1d976f1a..aa255da64 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp
@@ -28,6 +28,7 @@ template <typename InDataType,
           typename AccElementwiseOperation,
           bool PropagateNan,
           bool OutputIndex,
+          bool TransformIndexKtoGlobal,
           bool HaveIndexInputIfOutputIndex,
           index_t BlockSize,
           index_t MThreadSliceSize,
@@ -260,6 +261,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InDataType,
 
             const auto kernel = kernel_reduce_threadwise<GridwiseReduce,
                                                          OutputIndex,
+                                                         TransformIndexKtoGlobal,
                                                          HaveIndexInput,
                                                          InDataType,
                                                          OutDataType,
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp
index 6c5bd29f9..5986641c6 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp
@@ -15,6 +15,7 @@ namespace ck {
 
 template <typename GridwiseReduction,
           bool OutputIndex,
+          bool TransformIndexKtoGlobal,
           bool HaveIndexInput,
           typename InDataType,
           typename OutDataType,
@@ -48,16 +49,17 @@ __global__ void kernel_reduce_threadwise(const InGridDesc_M_K in_grid_desc_m_k,
     }
     else
     {
-        GridwiseReduction::template RunWithIndex<HaveIndexInput>(in_grid_desc_m_k,
-                                                                 out_grid_desc_m,
-                                                                 in_elementwise_op,
-                                                                 acc_elementwise_op,
-                                                                 alpha,
-                                                                 p_in_value_global,
-                                                                 p_in_index_global,
-                                                                 beta,
-                                                                 p_out_value_global,
-                                                                 p_out_index_global);
+        GridwiseReduction::template RunWithIndex<TransformIndexKtoGlobal, HaveIndexInput>(
+            in_grid_desc_m_k,
+            out_grid_desc_m,
+            in_elementwise_op,
+            acc_elementwise_op,
+            alpha,
+            p_in_value_global,
+            p_in_index_global,
+            beta,
+            p_out_value_global,
+            p_out_index_global);
     };
 };
 
@@ -232,7 +234,7 @@ struct GridwiseReduction_mk_to_m_threadwise
             reduced_data_desc, make_tuple(I0), accu_value_buf, out_grid_desc_m, dst_global_buf);
     };
 
-    template <bool HaveIndexInput>
+    template <bool TransformIndexKtoGlobal, bool HaveIndexInput>
     __device__ static void RunWithIndex(const InGridDesc_M_K& in_grid_desc_m_k,
                                         const OutGridDesc_M& out_grid_desc_m,
                                         const InElementwiseOperation& in_elementwise_op,
@@ -390,6 +392,18 @@ struct GridwiseReduction_mk_to_m_threadwise
                 indexStart += KThreadSliceSize;
                 reducedLength += KThreadSliceSize;
             } while(reducedLength < toReduceLength);
+
+            if constexpr(TransformIndexKtoGlobal)
+            {
+                static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
+                    const auto coord = make_tensor_coordinate(
+                        in_grid_desc_m_k,
+                        make_multi_index(thread_global_1d_id * MThreadSliceSize + I,
+                                         accu_index_buf(I)));
+
+                    accu_index_buf(I) = coord.GetOffset();
+                });
+            }
         };
 
         // for indiced operation, acc_elementwise_op shoud do nothing
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp
new file mode 100644
index 000000000..3fc35a83c
--- /dev/null
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp
@@ -0,0 +1,345 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+#include <vector>
+#include <algorithm>
+
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
+#include "ck/utility/reduction_functions_accumulate.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace host {
+
+template <index_t InOutRank,
+          index_t WindowRank,
+          typename InDataType,
+          typename OutDataType,
+          typename ComputeDataType,
+          typename IndexDataType,
+          ck::ReduceTensorOp ReduceOpId,
+          bool PropagateNan,
+          bool OutputIndex>
+struct ReferencePoolingFwd : public device::BaseOperator
+{
+    using ReduceOperation = typename ck::reduce_binary_operator<ReduceOpId>::opType;
+
+    // Argument
+    struct Argument : public device::BaseArgument
+    {
+        Argument(const Tensor<InDataType>& in,
+                 Tensor<OutDataType>& out,
+                 Tensor<IndexDataType>& out_indices,
+                 const std::vector<ck::index_t>& window_spatial_lengths,
+                 const std::vector<ck::index_t>& window_strides,
+                 const std::vector<ck::index_t>& in_left_pads,
+                 const std::vector<ck::index_t>& /*in_right_pads*/)
+            : in_(in),
+              out_(out),
+              out_indices_(out_indices),
+              window_spatial_lengths_(window_spatial_lengths),
+              window_strides_(window_strides),
+              in_left_pads_(in_left_pads),
+              reduceLength_(1)
+        {
+            static_for<0, WindowRank, 1>{}(
+                [&](auto I) { reduceLength_ *= window_spatial_lengths[I]; });
+        }
+        // Tensors are held by reference; the small vectors are copied so temporaries are safe.
+        const Tensor<InDataType>& in_;
+        Tensor<OutDataType>& out_;
+        Tensor<IndexDataType>& out_indices_;
+        const std::vector<ck::index_t> window_spatial_lengths_;
+        const std::vector<ck::index_t> window_strides_;
+        const std::vector<ck::index_t> in_left_pads_;
+        int reduceLength_;
+    };
+
+    // Invoker
+    struct Invoker : public device::BaseInvoker
+    {
+        float RunPooling3dFwd(const Argument& arg)
+        {
+
+            auto elementwise_ops =
+                ck::reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(
+                    arg.reduceLength_);
+
+            auto in_elementwise_op  = std::get<0>(elementwise_ops);
+            auto acc_elementwise_op = std::get<1>(elementwise_ops);
+
+            if constexpr(!OutputIndex)
+            {
+                using Accumulation = ck::detail::
+                    AccumulateWithNanCheck<PropagateNan, ReduceOperation, ComputeDataType>;
+
+                auto f_ncdhw = [&](auto n, auto c, auto do_, auto ho, auto wo) {
+                    auto accuVal = ReduceOperation::template GetIdentityValue<ComputeDataType>();
+
+                    for(ck::index_t z = 0; z < arg.window_spatial_lengths_[0]; ++z)
+                    {
+                        ck::index_t di = do_ * arg.window_strides_[0] + z - arg.in_left_pads_[0];
+                        for(ck::index_t y = 0; y < arg.window_spatial_lengths_[1]; ++y)
+                        {
+                            ck::index_t hi = ho * arg.window_strides_[1] + y - arg.in_left_pads_[1];
+                            for(ck::index_t x = 0; x < arg.window_spatial_lengths_[2]; ++x)
+                            {
+                                ck::index_t wi =
+                                    wo * arg.window_strides_[2] + x - arg.in_left_pads_[2];
+                                if(di >= 0 &&
+                                   di < static_cast<ck::index_t>(arg.in_.mDesc.GetLengths()[2]) &&
+                                   hi >= 0 &&
+                                   hi < static_cast<ck::index_t>(arg.in_.mDesc.GetLengths()[3]) &&
+                                   wi >= 0 &&
+                                   wi < static_cast<ck::index_t>(arg.in_.mDesc.GetLengths()[4]))
+                                {
+                                    ComputeDataType currVal =
+                                        static_cast<ComputeDataType>(arg.in_(n, c, di, hi, wi));
+
+                                    in_elementwise_op(currVal, currVal);
+
+                                    Accumulation::Calculate(accuVal, currVal);
+                                }
+                            }
+                        }
+                    }
+                    acc_elementwise_op(accuVal, accuVal);
+
+                    arg.out_(n, c, do_, ho, wo) = accuVal;
+                };
+
+                make_ParallelTensorFunctor(f_ncdhw,
+                                           arg.out_.mDesc.GetLengths()[0],
+                                           arg.out_.mDesc.GetLengths()[1],
+                                           arg.out_.mDesc.GetLengths()[2],
+                                           arg.out_.mDesc.GetLengths()[3],
+                                           arg.out_.mDesc.GetLengths()[4])(
+                    std::thread::hardware_concurrency());
+            }
+            else
+            {
+                using Accumulation = ck::detail::AccumulateWithIndexAndNanCheck<PropagateNan,
+                                                                                ReduceOperation,
+                                                                                ComputeDataType,
+                                                                                IndexDataType>;
+
+                auto f_ncdhw = [&](auto n, auto c, auto do_, auto ho, auto wo) {
+                    auto accuVal = ReduceOperation::template GetIdentityValue<ComputeDataType>();
+                    IndexDataType accuIndex = 0;
+
+                    for(ck::index_t z = 0; z < arg.window_spatial_lengths_[0]; ++z)
+                    {
+                        ck::index_t di = do_ * arg.window_strides_[0] + z - arg.in_left_pads_[0];
+                        for(ck::index_t y = 0; y < arg.window_spatial_lengths_[1]; ++y)
+                        {
+                            ck::index_t hi = ho * arg.window_strides_[1] + y - arg.in_left_pads_[1];
+                            for(ck::index_t x = 0; x < arg.window_spatial_lengths_[2]; ++x)
+                            {
+                                ck::index_t wi =
+                                    wo * arg.window_strides_[2] + x - arg.in_left_pads_[2];
+                                if(di >= 0 &&
+                                   di < static_cast<ck::index_t>(arg.in_.mDesc.GetLengths()[2]) &&
+                                   hi >= 0 &&
+                                   hi < static_cast<ck::index_t>(arg.in_.mDesc.GetLengths()[3]) &&
+                                   wi >= 0 &&
+                                   wi < static_cast<ck::index_t>(arg.in_.mDesc.GetLengths()[4]))
+                                {
+                                    ComputeDataType currVal =
+                                        static_cast<ComputeDataType>(arg.in_(n, c, di, hi, wi));
+                                    IndexDataType currIndex =
+                                        arg.in_.GetOffsetFromMultiIndex(n, c, di, hi, wi);
+
+                                    in_elementwise_op(currVal, currVal);
+
+                                    Accumulation::Calculate(accuVal, currVal, accuIndex, currIndex);
+                                }
+                            }
+                        }
+                    }
+
+                    acc_elementwise_op(accuVal, accuVal);
+
+                    arg.out_(n, c, do_, ho, wo)         = accuVal;
+                    arg.out_indices_(n, c, do_, ho, wo) = accuIndex;
+                };
+
+                make_ParallelTensorFunctor(f_ncdhw,
+                                           arg.out_.mDesc.GetLengths()[0],
+                                           arg.out_.mDesc.GetLengths()[1],
+                                           arg.out_.mDesc.GetLengths()[2],
+                                           arg.out_.mDesc.GetLengths()[3],
+                                           arg.out_.mDesc.GetLengths()[4])(
+                    std::thread::hardware_concurrency());
+            };
+
+            return 0;
+        }
+
+        float RunPooling2dFwd(const Argument& arg)
+        {
+            // Host-side 2-D pooling; tensors are indexed as (n, c, h, w). Always returns 0.
+            auto elementwise_ops =
+                ck::reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(
+                    arg.reduceLength_);
+
+            auto in_elementwise_op  = std::get<0>(elementwise_ops);
+            auto acc_elementwise_op = std::get<1>(elementwise_ops);
+            // OutputIndex selects at compile time whether input offsets are recorded too.
+            if constexpr(!OutputIndex)
+            {
+                using Accumulation = ck::detail::
+                    AccumulateWithNanCheck<PropagateNan, ReduceOperation, ComputeDataType>;
+
+                auto f_nchw = [&](auto n, auto c, auto ho, auto wo) {
+                    auto accuVal = ReduceOperation::template GetIdentityValue<ComputeDataType>();
+                    // Walk the window; taps that land outside the input (padding) are skipped.
+                    for(ck::index_t y = 0; y < arg.window_spatial_lengths_[0]; ++y)
+                    {
+                        ck::index_t hi = ho * arg.window_strides_[0] + y - arg.in_left_pads_[0];
+                        for(ck::index_t x = 0; x < arg.window_spatial_lengths_[1]; ++x)
+                        {
+                            ck::index_t wi = wo * arg.window_strides_[1] + x - arg.in_left_pads_[1];
+                            if(hi >= 0 &&
+                               hi < static_cast<ck::index_t>(arg.in_.mDesc.GetLengths()[2]) &&
+                               wi >= 0 &&
+                               wi < static_cast<ck::index_t>(arg.in_.mDesc.GetLengths()[3]))
+                            {
+                                ComputeDataType currVal =
+                                    static_cast<ComputeDataType>(arg.in_(n, c, hi, wi));
+
+                                in_elementwise_op(currVal, currVal);
+
+                                Accumulation::Calculate(accuVal, currVal);
+                            }
+                        }
+                    }
+
+                    acc_elementwise_op(accuVal, accuVal);
+                    arg.out_(n, c, ho, wo) = accuVal;
+                };
+                // Run f_nchw over the four output lengths (presumably on that many threads).
+                make_ParallelTensorFunctor(f_nchw,
+                                           arg.out_.mDesc.GetLengths()[0],
+                                           arg.out_.mDesc.GetLengths()[1],
+                                           arg.out_.mDesc.GetLengths()[2],
+                                           arg.out_.mDesc.GetLengths()[3])(
+                    std::thread::hardware_concurrency());
+            }
+            else
+            {
+                using Accumulation = ck::detail::AccumulateWithIndexAndNanCheck<PropagateNan,
+                                                                                ReduceOperation,
+                                                                                ComputeDataType,
+                                                                                IndexDataType>;
+
+                auto f_nchw = [&](auto n, auto c, auto ho, auto wo) {
+                    auto accuVal = ReduceOperation::template GetIdentityValue<ComputeDataType>();
+                    IndexDataType accuIndex = 0;
+                    // Same walk, but each tap's flat input offset is passed along as its index.
+                    for(ck::index_t y = 0; y < arg.window_spatial_lengths_[0]; ++y)
+                    {
+                        ck::index_t hi = ho * arg.window_strides_[0] + y - arg.in_left_pads_[0];
+                        for(ck::index_t x = 0; x < arg.window_spatial_lengths_[1]; ++x)
+                        {
+                            ck::index_t wi = wo * arg.window_strides_[1] + x - arg.in_left_pads_[1];
+                            if(hi >= 0 &&
+                               hi < static_cast<ck::index_t>(arg.in_.mDesc.GetLengths()[2]) &&
+                               wi >= 0 &&
+                               wi < static_cast<ck::index_t>(arg.in_.mDesc.GetLengths()[3]))
+                            {
+                                ComputeDataType currVal =
+                                    static_cast<ComputeDataType>(arg.in_(n, c, hi, wi));
+
+                                IndexDataType currIndex =
+                                    arg.in_.GetOffsetFromMultiIndex(n, c, hi, wi);
+
+                                in_elementwise_op(currVal, currVal);
+
+                                Accumulation::Calculate(accuVal, currVal, accuIndex, currIndex);
+                            }
+                        }
+                    }
+
+                    acc_elementwise_op(accuVal, accuVal);
+                    arg.out_(n, c, ho, wo)         = accuVal;
+                    arg.out_indices_(n, c, ho, wo) = accuIndex;
+                };
+
+                make_ParallelTensorFunctor(f_nchw,
+                                           arg.out_.mDesc.GetLengths()[0],
+                                           arg.out_.mDesc.GetLengths()[1],
+                                           arg.out_.mDesc.GetLengths()[2],
+                                           arg.out_.mDesc.GetLengths()[3])(
+                    std::thread::hardware_concurrency());
+            };
+
+            return 0;
+        }
+
+        float Run(const Argument& arg)
+        {
+            // Compile-time rank dispatch; other ranks throw. TODO - support generic pooling
+            if constexpr(InOutRank == 5 && WindowRank == 3)
+                return RunPooling3dFwd(arg);
+            else if constexpr(InOutRank == 4 && WindowRank == 2)
+                return RunPooling2dFwd(arg);
+            else
+                throw std::runtime_error("Only support pooling3d or pooling2d so far");
+        }
+
+        float Run(const device::BaseArgument* p_arg,
+                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
+        { // type-erased entry point; the stream configuration is ignored
+            return Run(dynamic_cast<const Argument&>(*p_arg)); // std::bad_cast on type mismatch
+        }
+    };
+
+    bool IsSupportedArgument(const device::BaseArgument*) override { return true; } // no checks
+
+    static auto MakeArgument(const Tensor<InDataType>& in,
+                             Tensor<OutDataType>& out,
+                             Tensor<IndexDataType>& out_indices,
+                             const std::vector<ck::index_t>& window_spatial_lengths,
+                             const std::vector<ck::index_t>& window_strides,
+                             const std::vector<ck::index_t>& in_left_pads,
+                             const std::vector<ck::index_t>& in_right_pads)
+    { // brace-initializes an Argument from the parameters, in declaration order
+        return Argument{in,
+                        out,
+                        out_indices,
+                        window_spatial_lengths,
+                        window_strides,
+                        in_left_pads,
+                        in_right_pads};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; } // fresh invoker, returned by value
+
+    virtual std::unique_ptr<device::BaseInvoker> MakeInvokerPointer()
+    {
+        return std::make_unique<Invoker>(); // default-constructed invoker, owned by the caller
+    }
+
+    std::string GetTypeString() const override
+    {
+        // Assemble the name in a stream; the returned string ends with a newline character.
+        std::stringstream type_name;
+        // clang-format off
+        type_name << "ReferencePoolingFwd"
+                  << '\n';
+        // clang-format on
+
+        return type_name.str();
+    }
+};
+
+} // namespace host
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/pool2d_fwd.hpp b/library/include/ck/library/tensor_operation_instance/gpu/pool2d_fwd.hpp
new file mode 100644
index 000000000..44d89cf36
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/pool2d_fwd.hpp
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_pool_fwd.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+static constexpr auto InOutRank  = 4;
+static constexpr auto WindowRank = 2;
+// NOTE: pool3d_fwd.hpp defines these same four names; never include both headers in one TU.
+static constexpr auto MaxOp = ck::ReduceTensorOp::MAX;
+static constexpr auto AvgOp = ck::ReduceTensorOp::AVG;
+
+// FP16
+void add_device_pool2d_fwd_nhwc_f16_instances(
+    std::vector<
+        std::unique_ptr<DevicePoolFwd<InOutRank, WindowRank, F16, F16, I32, MaxOp, false>>>&);
+
+void add_device_pool2d_fwd_nhwc_f16_instances(
+    std::vector<
+        std::unique_ptr<DevicePoolFwd<InOutRank, WindowRank, F16, F16, I32, AvgOp, false>>>&);
+
+// FP16 - return index
+void add_device_pool2d_fwd_nhwc_index_f16_instances(
+    std::vector<
+        std::unique_ptr<DevicePoolFwd<InOutRank, WindowRank, F16, F16, I32, MaxOp, true>>>&);
+
+// FP32
+void add_device_pool2d_fwd_nhwc_f32_instances(
+    std::vector<
+        std::unique_ptr<DevicePoolFwd<InOutRank, WindowRank, F32, F32, I32, MaxOp, false>>>&);
+
+void add_device_pool2d_fwd_nhwc_f32_instances(
+    std::vector<
+        std::unique_ptr<DevicePoolFwd<InOutRank, WindowRank, F32, F32, I32, AvgOp, false>>>&);
+
+// FP32 - return index
+void add_device_pool2d_fwd_nhwc_index_f32_instances(
+    std::vector<
+        std::unique_ptr<DevicePoolFwd<InOutRank, WindowRank, F32, F32, I32, MaxOp, true>>>&);
+
+template <typename InDataType,
+          typename OutDataType,
+          typename IndexDataType,
+          ck::ReduceTensorOp ReduceOpId,
+          bool OutputIndex>
+struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DevicePoolFwd<InOutRank,
+                                                                                  WindowRank,
+                                                                                  InDataType,
+                                                                                  OutDataType,
+                                                                                  IndexDataType,
+                                                                                  ReduceOpId,
+                                                                                  OutputIndex>>
+{
+    using DeviceOp = DevicePoolFwd<InOutRank,
+                                   WindowRank,
+                                   InDataType,
+                                   OutDataType,
+                                   IndexDataType,
+                                   ReduceOpId,
+                                   OutputIndex>;
+    // Returns the 2-D pooling instances registered for this template-argument combination.
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+        // Index-returning lists exist only for MaxOp; unmatched data types yield an empty vector.
+        if constexpr(is_same_v<InDataType, F16> && is_same_v<OutDataType, F16> &&
+                     is_same_v<IndexDataType, I32>)
+        {
+            if constexpr(OutputIndex && ReduceOpId == MaxOp)
+            {
+                add_device_pool2d_fwd_nhwc_index_f16_instances(op_ptrs);
+            }
+            else
+            {
+                add_device_pool2d_fwd_nhwc_f16_instances(op_ptrs);
+            }
+        }
+        else if constexpr(is_same_v<InDataType, F32> && is_same_v<OutDataType, F32> &&
+                          is_same_v<IndexDataType, I32>)
+        {
+            if constexpr(OutputIndex && ReduceOpId == MaxOp)
+            {
+                add_device_pool2d_fwd_nhwc_index_f32_instances(op_ptrs);
+            }
+            else
+            {
+                add_device_pool2d_fwd_nhwc_f32_instances(op_ptrs);
+            }
+        }
+
+        return op_ptrs;
+    }
+};
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp b/library/include/ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp
new file mode 100644
index 000000000..88523c703
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_pool_fwd.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+static constexpr auto InOutRank  = 5;
+static constexpr auto WindowRank = 3;
+// NOTE: pool2d_fwd.hpp defines these same four names; never include both headers in one TU.
+static constexpr auto MaxOp = ck::ReduceTensorOp::MAX;
+static constexpr auto AvgOp = ck::ReduceTensorOp::AVG;
+
+// FP16
+void add_device_pool3d_fwd_ndhwc_f16_instances(
+    std::vector<
+        std::unique_ptr<DevicePoolFwd<InOutRank, WindowRank, F16, F16, I32, MaxOp, false>>>&);
+
+void add_device_pool3d_fwd_ndhwc_f16_instances(
+    std::vector<
+        std::unique_ptr<DevicePoolFwd<InOutRank, WindowRank, F16, F16, I32, AvgOp, false>>>&);
+
+// FP16 - return index
+void add_device_pool3d_fwd_ndhwc_index_f16_instances(
+    std::vector<
+        std::unique_ptr<DevicePoolFwd<InOutRank, WindowRank, F16, F16, I32, MaxOp, true>>>&);
+
+// FP32
+void add_device_pool3d_fwd_ndhwc_f32_instances(
+    std::vector<
+        std::unique_ptr<DevicePoolFwd<InOutRank, WindowRank, F32, F32, I32, MaxOp, false>>>&);
+
+void add_device_pool3d_fwd_ndhwc_f32_instances(
+    std::vector<
+        std::unique_ptr<DevicePoolFwd<InOutRank, WindowRank, F32, F32, I32, AvgOp, false>>>&);
+
+// FP32 - return index
+void add_device_pool3d_fwd_ndhwc_index_f32_instances(
+    std::vector<
+        std::unique_ptr<DevicePoolFwd<InOutRank, WindowRank, F32, F32, I32, MaxOp, true>>>&);
+
+template <typename InDataType,
+          typename OutDataType,
+          typename IndexDataType,
+          ck::ReduceTensorOp ReduceOpId,
+          bool OutputIndex>
+struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DevicePoolFwd<InOutRank,
+                                                                                  WindowRank,
+                                                                                  InDataType,
+                                                                                  OutDataType,
+                                                                                  IndexDataType,
+                                                                                  ReduceOpId,
+                                                                                  OutputIndex>>
+{
+    using DeviceOp = DevicePoolFwd<InOutRank,
+                                   WindowRank,
+                                   InDataType,
+                                   OutDataType,
+                                   IndexDataType,
+                                   ReduceOpId,
+                                   OutputIndex>;
+    // Returns the 3-D pooling instances registered for this template-argument combination.
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+        // Index-returning lists exist only for MaxOp; unmatched data types yield an empty vector.
+        if constexpr(is_same_v<InDataType, F16> && is_same_v<OutDataType, F16> &&
+                     is_same_v<IndexDataType, I32>)
+        {
+            if constexpr(OutputIndex && ReduceOpId == MaxOp)
+            {
+                add_device_pool3d_fwd_ndhwc_index_f16_instances(op_ptrs);
+            }
+            else
+            {
+                add_device_pool3d_fwd_ndhwc_f16_instances(op_ptrs);
+            }
+        }
+        else if constexpr(is_same_v<InDataType, F32> && is_same_v<OutDataType, F32> &&
+                          is_same_v<IndexDataType, I32>)
+        {
+            if constexpr(OutputIndex && ReduceOpId == MaxOp)
+            {
+                add_device_pool3d_fwd_ndhwc_index_f32_instances(op_ptrs);
+            }
+            else
+            {
+                add_device_pool3d_fwd_ndhwc_f32_instances(op_ptrs);
+            }
+        }
+
+        return op_ptrs;
+    }
+};
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp
index f77c50a8e..325ed1e6d 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp
@@ -90,6 +90,7 @@ void add_device_reduce_instance_threadwise(
                                                             AccElementwiseOp,
                                                             PropagateNan,
                                                             OutputIndex,
+                                                            false,
                                                             false, // HaveIndexInputIfOutputIndex
                                                             cfg1::BlockSize_,
                                                             cfg2::MThreadSliceSize_,
diff --git a/library/include/ck/library/utility/host_tensor.hpp b/library/include/ck/library/utility/host_tensor.hpp
index 29d94b003..844c29ed1 100644
--- a/library/include/ck/library/utility/host_tensor.hpp
+++ b/library/include/ck/library/utility/host_tensor.hpp
@@ -411,6 +411,12 @@ struct Tensor
         }
     }
 
+    template <typename... Is>
+    std::size_t GetOffsetFromMultiIndex(Is... is) const
+    { // forwards the indices to the descriptor and returns the offset it computes
+        return mDesc.GetOffsetFromMultiIndex(is...);
+    }
+
     template <typename... Is>
     T& operator()(Is... is)
     {
diff --git a/library/src/tensor_operation_instance/gpu/pool_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/pool_fwd/CMakeLists.txt
new file mode 100644
index 000000000..0d0f896c8
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/CMakeLists.txt
@@ -0,0 +1,10 @@
+add_instance_library(device_pool_fwd_instance
+    device_avg_pool2d_fwd_nhwc_f16_instance.cpp
+    device_avg_pool2d_fwd_nhwc_f32_instance.cpp
+    device_avg_pool3d_fwd_ndhwc_f16_instance.cpp
+    device_avg_pool3d_fwd_ndhwc_f32_instance.cpp
+    device_max_pool2d_fwd_nhwc_f16_instance.cpp
+    device_max_pool2d_fwd_nhwc_f32_instance.cpp
+    device_max_pool3d_fwd_ndhwc_f16_instance.cpp
+    device_max_pool3d_fwd_ndhwc_f32_instance.cpp
+)
diff --git a/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f16_instance.cpp
new file mode 100644
index 000000000..38338ff99
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f16_instance.cpp
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "pool_fwd_instance_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
+
+void add_device_pool2d_fwd_nhwc_f16_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<4, 2, F16, F16, I32, ReduceOpId, false>>>& instances)
+{ // ReduceOpId is AVG in this file; F16 in/out with F32 passed as ComputeDataType
+    add_device_operation_instances(
+        instances, device_pool2d_fwd_nhwc_instances<F16, F16, I32, F32, ReduceOpId, false>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f32_instance.cpp
new file mode 100644
index 000000000..0f4a35dee
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f32_instance.cpp
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "pool_fwd_instance_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
+
+void add_device_pool2d_fwd_nhwc_f32_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<4, 2, F32, F32, I32, ReduceOpId, false>>>& instances)
+{ // ReduceOpId is AVG in this file; F32 in/out with F32 passed as ComputeDataType
+    add_device_operation_instances(
+        instances, device_pool2d_fwd_nhwc_instances<F32, F32, I32, F32, ReduceOpId, false>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool3d_fwd_ndhwc_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool3d_fwd_ndhwc_f16_instance.cpp
new file mode 100644
index 000000000..6fcb519a9
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool3d_fwd_ndhwc_f16_instance.cpp
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "pool_fwd_instance_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
+
+void add_device_pool3d_fwd_ndhwc_f16_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<5, 3, F16, F16, I32, ReduceOpId, false>>>& instances)
+{ // ReduceOpId is AVG in this file; F16 in/out with F32 passed as ComputeDataType
+    add_device_operation_instances(
+        instances, device_pool3d_fwd_ndhwc_instances<F16, F16, I32, F32, ReduceOpId, false>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool3d_fwd_ndhwc_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool3d_fwd_ndhwc_f32_instance.cpp
new file mode 100644
index 000000000..67ffd4708
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool3d_fwd_ndhwc_f32_instance.cpp
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "pool_fwd_instance_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
+
+void add_device_pool3d_fwd_ndhwc_f32_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<5, 3, F32, F32, I32, ReduceOpId, false>>>& instances)
+{ // ReduceOpId is AVG in this file; F32 in/out with F32 passed as ComputeDataType
+    add_device_operation_instances(
+        instances, device_pool3d_fwd_ndhwc_instances<F32, F32, I32, F32, ReduceOpId, false>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool2d_fwd_nhwc_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool2d_fwd_nhwc_f16_instance.cpp
new file mode 100644
index 000000000..a41cd0094
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool2d_fwd_nhwc_f16_instance.cpp
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "pool_fwd_instance_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
+
+void add_device_pool2d_fwd_nhwc_f16_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<4, 2, F16, F16, I32, ReduceOpId, false>>>& instances)
+{ // ReduceOpId is MAX in this file; no index output; F16 doubles as ComputeDataType
+    add_device_operation_instances(
+        instances, device_pool2d_fwd_nhwc_instances<F16, F16, I32, F16, ReduceOpId, false>{});
+}
+
+void add_device_pool2d_fwd_nhwc_index_f16_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<4, 2, F16, F16, I32, ReduceOpId, true>>>& instances)
+{ // MAX reduction with OutputIndex = true; F16 doubles as ComputeDataType
+    add_device_operation_instances(
+        instances, device_pool2d_fwd_nhwc_instances<F16, F16, I32, F16, ReduceOpId, true>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool2d_fwd_nhwc_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool2d_fwd_nhwc_f32_instance.cpp
new file mode 100644
index 000000000..fa70569ba
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool2d_fwd_nhwc_f32_instance.cpp
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "pool_fwd_instance_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
+
+void add_device_pool2d_fwd_nhwc_f32_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<4, 2, F32, F32, I32, ReduceOpId, false>>>& instances)
+{ // ReduceOpId is MAX in this file; no index output; F32 doubles as ComputeDataType
+    add_device_operation_instances(
+        instances, device_pool2d_fwd_nhwc_instances<F32, F32, I32, F32, ReduceOpId, false>{});
+}
+
+void add_device_pool2d_fwd_nhwc_index_f32_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<4, 2, F32, F32, I32, ReduceOpId, true>>>& instances)
+{ // MAX reduction with OutputIndex = true; F32 doubles as ComputeDataType
+    add_device_operation_instances(
+        instances, device_pool2d_fwd_nhwc_instances<F32, F32, I32, F32, ReduceOpId, true>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool3d_fwd_ndhwc_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool3d_fwd_ndhwc_f16_instance.cpp
new file mode 100644
index 000000000..f3367b946
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool3d_fwd_ndhwc_f16_instance.cpp
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "pool_fwd_instance_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
+
+void add_device_pool3d_fwd_ndhwc_f16_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<5, 3, F16, F16, I32, ReduceOpId, false>>>& instances)
+{ // ReduceOpId is MAX in this file; no index output; F16 doubles as ComputeDataType
+    add_device_operation_instances(
+        instances, device_pool3d_fwd_ndhwc_instances<F16, F16, I32, F16, ReduceOpId, false>{});
+}
+
+void add_device_pool3d_fwd_ndhwc_index_f16_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<5, 3, F16, F16, I32, ReduceOpId, true>>>& instances)
+{ // MAX reduction with OutputIndex = true; F16 doubles as ComputeDataType
+    add_device_operation_instances(
+        instances, device_pool3d_fwd_ndhwc_instances<F16, F16, I32, F16, ReduceOpId, true>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool3d_fwd_ndhwc_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool3d_fwd_ndhwc_f32_instance.cpp
new file mode 100644
index 000000000..8477a884d
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool3d_fwd_ndhwc_f32_instance.cpp
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "pool_fwd_instance_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
+
+void add_device_pool3d_fwd_ndhwc_f32_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<5, 3, F32, F32, I32, ReduceOpId, false>>>& instances)
+{ // ReduceOpId is MAX in this file; no index output; F32 doubles as ComputeDataType
+    add_device_operation_instances(
+        instances, device_pool3d_fwd_ndhwc_instances<F32, F32, I32, F32, ReduceOpId, false>{});
+}
+
+void add_device_pool3d_fwd_ndhwc_index_f32_instances(
+    std::vector<std::unique_ptr<DevicePoolFwd<5, 3, F32, F32, I32, ReduceOpId, true>>>& instances)
+{ // MAX reduction with OutputIndex = true; F32 doubles as ComputeDataType
+    add_device_operation_instances(
+        instances, device_pool3d_fwd_ndhwc_instances<F32, F32, I32, F32, ReduceOpId, true>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/pool_fwd/pool_fwd_instance_common.hpp b/library/src/tensor_operation_instance/gpu/pool_fwd/pool_fwd_instance_common.hpp
new file mode 100644
index 000000000..cd508b55b
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/pool_fwd_instance_common.hpp
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_pool3d_fwd_ndhwc_ndhwc.hpp"
+#include "ck/utility/data_type.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using I32 = int32_t;
+using F16 = ck::half_t;
+using F32 = float;
+
+template <typename InDataType,
+          typename OutDataType,
+          typename IndexDataType,
+          typename ComputeDataType,
+          ReduceTensorOp ReduceOpId,
+          bool OutputIndex>
+using device_pool2d_fwd_nhwc_instances =
+    // clang-format off
+    std::tuple < // entries differ only in the 4th and 6th trailing integer (1, 2 or 4)
+        DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C<InDataType, OutDataType, IndexDataType, ComputeDataType, ReduceOpId, OutputIndex, 256, 256, 1, 1, 1, 1>,
+        DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C<InDataType, OutDataType, IndexDataType, ComputeDataType, ReduceOpId, OutputIndex, 256, 256, 1, 2, 1, 2>,
+        DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C<InDataType, OutDataType, IndexDataType, ComputeDataType, ReduceOpId, OutputIndex, 256, 256, 1, 4, 1, 4>
+               // clang-format on
+               >;
+
+template <typename InDataType,
+          typename OutDataType,
+          typename IndexDataType,
+          typename ComputeDataType,
+          ReduceTensorOp ReduceOpId,
+          bool OutputIndex>
+using device_pool3d_fwd_ndhwc_instances =
+    // clang-format off
+    std::tuple < // entries differ only in the 4th and 6th trailing integer (1, 2 or 4)
+        DevicePool3dFwd_Input_N_Di_Hi_Wi_C_Output_N_Do_Ho_Wo_C<InDataType, OutDataType, IndexDataType, ComputeDataType, ReduceOpId, OutputIndex, 256, 256, 1, 1, 1, 1>,
+        DevicePool3dFwd_Input_N_Di_Hi_Wi_C_Output_N_Do_Ho_Wo_C<InDataType, OutDataType, IndexDataType, ComputeDataType, ReduceOpId, OutputIndex, 256, 256, 1, 2, 1, 2>,
+        DevicePool3dFwd_Input_N_Di_Hi_Wi_C_Output_N_Do_Ho_Wo_C<InDataType, OutDataType, IndexDataType, ComputeDataType, ReduceOpId, OutputIndex, 256, 256, 1, 4, 1, 4>
+               // clang-format on
+               >;
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/profiler/include/profiler/profile_pool2d_fwd_impl.hpp b/profiler/include/profiler/profile_pool2d_fwd_impl.hpp
new file mode 100644
index 000000000..c313a00be
--- /dev/null
+++ b/profiler/include/profiler/profile_pool2d_fwd_impl.hpp
@@ -0,0 +1,264 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iomanip>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/gpu/pool2d_fwd.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp"
+
+namespace ck {
+namespace profiler {
+
+template <typename InDataType,
+          typename OutDataType,
+          typename ComputeDataType,
+          typename IndexDataType,
+          ck::ReduceTensorOp ReduceOpId,
+          bool PropagateNan,
+          bool OutputIndex>
+bool profile_pool2d_fwd_impl(int do_verification,
+                             int init_method,
+                             bool do_log,
+                             bool time_kernel,
+                             std::vector<index_t> in_length, // NCHW
+                             std::vector<index_t> window_spatial_lengths,
+                             std::vector<index_t> window_strides,
+                             std::vector<index_t> input_left_pads,
+                             std::vector<index_t> input_right_pads)
+{
+    constexpr index_t InOutRank  = 4;
+    constexpr index_t WindowRank = 2;
+
+    if(in_length.size() != InOutRank || window_spatial_lengths.size() != WindowRank ||
+       window_strides.size() != WindowRank || input_left_pads.size() != WindowRank ||
+       input_right_pads.size() != WindowRank)
+        return false;
+
+    std::vector<index_t> out_length(InOutRank);
+
+    int N = in_length[0];
+    int C = in_length[1];
+
+    out_length[0] = N;
+    out_length[1] = C;
+
+    // Calculate Ho, Wo
+    for(int i = 2; i < InOutRank; ++i)
+    {
+        auto pad1           = input_left_pads[i - 2];
+        auto pad2           = input_right_pads[i - 2];
+        auto windows_size   = window_spatial_lengths[i - 2];
+        auto windows_stride = window_strides[i - 2];
+        out_length[i]       = (in_length[i] + pad1 + pad2 - windows_size) / windows_stride + 1;
+    }
+
+    int Hi = in_length[2];
+    int Wi = in_length[3];
+    int Ho = out_length[2];
+    int Wo = out_length[3];
+
+    auto f_host_tensor_descriptor =
+        [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W) {
+            using namespace ck::literals;
+            return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
+        };
+
+    Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi));
+    Tensor<OutDataType> out_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo));
+    Tensor<IndexDataType> out_indices_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo));
+
+    Tensor<OutDataType> out_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo));
+    Tensor<IndexDataType> out_indices_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo));
+
+    switch(init_method)
+    {
+    case 0: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_1<InDataType>{}); break;
+    case 1: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}); break;
+    default: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{-0.5, 0.5});
+    }
+
+    DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpaceSize());
+    DeviceMem out_device_buf(sizeof(OutDataType) *
+                             out_n_c_ho_wo_device.mDesc.GetElementSpaceSize());
+    DeviceMem out_indices_device_buf(sizeof(IndexDataType) *
+                                     out_indices_n_c_ho_wo_device.mDesc.GetElementSpaceSize());
+
+    in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
+
+    // device pooling op interface used to look up the registered instances
+    using DeviceOp = ck::tensor_operation::device::DevicePoolFwd<InOutRank,
+                                                                 WindowRank,
+                                                                 InDataType,
+                                                                 OutDataType,
+                                                                 IndexDataType,
+                                                                 ReduceOpId,
+                                                                 OutputIndex>;
+
+    // get device op instances
+    const auto instance_ptrs =
+        ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+            DeviceOp>::GetInstances();
+
+    std::cout << "found " << instance_ptrs.size() << " instances" << std::endl;
+
+    std::string best_instance_name;
+    float best_avg_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+
+    if(do_verification)
+    {
+        using ReferenceInstance = ck::tensor_operation::host::ReferencePoolingFwd<InOutRank,
+                                                                                  WindowRank,
+                                                                                  InDataType,
+                                                                                  OutDataType,
+                                                                                  ComputeDataType,
+                                                                                  IndexDataType,
+                                                                                  ReduceOpId,
+                                                                                  PropagateNan,
+                                                                                  OutputIndex>;
+
+        ReferenceInstance ref;
+        auto ref_argument = ref.MakeArgument(in_n_c_hi_wi,
+                                             out_n_c_ho_wo_host,
+                                             out_indices_n_c_ho_wo_host,
+                                             window_spatial_lengths,
+                                             window_strides,
+                                             input_left_pads,
+                                             input_right_pads);
+        auto ref_invoker  = ref.MakeInvoker();
+        ref_invoker.Run(ref_argument);
+    }
+
+    int num_kernel = 0;
+
+    for(auto& inst_ptr : instance_ptrs)
+    {
+        auto argument_ptr = inst_ptr->MakeArgumentPointer(
+            static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
+            static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+            static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()),
+            in_length,
+            window_spatial_lengths,
+            out_length,
+            {C * Hi * Wi, 1, Wi * C, C},
+            {C * Ho * Wo, 1, Wo * C, C},
+            {C * Ho * Wo, 1, Wo * C, C},
+            window_strides,
+            input_left_pads,
+            input_right_pads,
+            {2, 3});
+
+        if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            ++num_kernel;
+        }
+        else
+        {
+            if(time_kernel)
+            {
+                std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: ";
+                LogRange(std::cout << "input lengths = ", in_length, ", ") << std::endl;
+            }
+
+            continue;
+        }
+
+        auto invoker_ptr = inst_ptr->MakeInvokerPointer();
+
+        float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+
+        std::size_t num_bytes = in_n_c_hi_wi.mDesc.GetElementSize() * sizeof(InDataType) +
+                                out_n_c_ho_wo_host.mDesc.GetElementSize() * sizeof(OutDataType);
+
+        if constexpr(OutputIndex)
+            num_bytes += out_indices_n_c_ho_wo_host.mDesc.GetElementSize() * sizeof(IndexDataType);
+
+        float gb_per_sec = num_bytes / 1.E6 / avg_time;
+
+        if(time_kernel)
+            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
+                      << inst_ptr->GetTypeString() << std::endl;
+
+        if(avg_time < best_avg_time)
+        {
+            best_instance_name = inst_ptr->GetTypeString();
+            best_avg_time      = avg_time;
+            best_gb_per_sec    = gb_per_sec;
+        }
+
+        if(do_verification)
+        {
+            out_device_buf.FromDevice(out_n_c_ho_wo_device.mData.data());
+
+            bool pass = ck::utils::check_err(out_n_c_ho_wo_device.mData,
+                                             out_n_c_ho_wo_host.mData,
+                                             "Error: Incorrect results",
+                                             1e-3,
+                                             1e-3);
+
+            if constexpr(OutputIndex)
+            {
+                out_indices_device_buf.FromDevice(out_indices_n_c_ho_wo_device.mData.data());
+
+                pass = pass && ck::utils::check_err(out_indices_n_c_ho_wo_device,
+                                                    out_indices_n_c_ho_wo_host);
+            }
+
+            if(do_log)
+            {
+                LogRangeAsType<float>(std::cout << "in_n_c_hi_wi  : ", in_n_c_hi_wi.mData, ",")
+                    << std::endl;
+                LogRangeAsType<float>(
+                    std::cout << "out_n_c_ho_wo_host  : ", out_n_c_ho_wo_host.mData, ",")
+                    << std::endl;
+                LogRangeAsType<float>(
+                    std::cout << "out_n_c_ho_wo_device  : ", out_n_c_ho_wo_device.mData, ",")
+                    << std::endl;
+
+                if constexpr(OutputIndex)
+                    LogRangeAsType<float>(std::cout << "out_indices_n_c_ho_wo_device  : ",
+                                          out_indices_n_c_ho_wo_device.mData,
+                                          ",")
+                        << std::endl;
+            }
+
+            if(!pass)
+            {
+                std::cout << inst_ptr->GetTypeString() << " failed verification: ";
+                LogRange(std::cout << "lengths = [", in_length, ", ") << "]." << std::endl;
+                return false;
+            }
+            else
+            {
+                if(time_kernel)
+                    std::cout << "pass" << std::endl;
+            }
+        }
+    }
+
+    if(time_kernel)
+    {
+        LogRange(std::cout << "length = ", in_length, ",") << std::endl;
+        std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
+                  << best_instance_name << std::endl;
+    }
+
+    if(num_kernel == 0)
+    {
+        std::cout << "Error: No kernel is applicable" << std::endl;
+        return false;
+    }
+
+    return true;
+}
+
+} // namespace profiler
+} // namespace ck
diff --git a/profiler/include/profiler/profile_pool3d_fwd_impl.hpp b/profiler/include/profiler/profile_pool3d_fwd_impl.hpp
new file mode 100644
index 000000000..c9e4c193f
--- /dev/null
+++ b/profiler/include/profiler/profile_pool3d_fwd_impl.hpp
@@ -0,0 +1,271 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iomanip>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp"
+
+namespace ck {
+namespace profiler {
+
+template <typename InDataType,
+          typename OutDataType,
+          typename ComputeDataType,
+          typename IndexDataType,
+          ck::ReduceTensorOp ReduceOpId,
+          bool PropagateNan,
+          bool OutputIndex>
+bool profile_pool3d_fwd_impl(int do_verification,
+                             int init_method,
+                             bool do_log,
+                             bool time_kernel,
+                             std::vector<index_t> in_length, // NCDHW
+                             std::vector<index_t> window_spatial_lengths,
+                             std::vector<index_t> window_strides,
+                             std::vector<index_t> input_left_pads,
+                             std::vector<index_t> input_right_pads)
+{
+    constexpr index_t InOutRank  = 5;
+    constexpr index_t WindowRank = 3;
+
+    if(in_length.size() != InOutRank || window_spatial_lengths.size() != WindowRank ||
+       window_strides.size() != WindowRank || input_left_pads.size() != WindowRank ||
+       input_right_pads.size() != WindowRank)
+        return false;
+
+    std::vector<index_t> out_length(InOutRank);
+
+    int N = in_length[0];
+    int C = in_length[1];
+
+    out_length[0] = N;
+    out_length[1] = C;
+
+    // Calculate Do, Ho, Wo
+    for(int i = 2; i < InOutRank; ++i)
+    {
+        auto pad1           = input_left_pads[i - 2];
+        auto pad2           = input_right_pads[i - 2];
+        auto windows_size   = window_spatial_lengths[i - 2];
+        auto windows_stride = window_strides[i - 2];
+        out_length[i]       = (in_length[i] + pad1 + pad2 - windows_size) / windows_stride + 1;
+    }
+
+    int Di = in_length[2];
+    int Hi = in_length[3];
+    int Wi = in_length[4];
+    int Do = out_length[2];
+    int Ho = out_length[3];
+    int Wo = out_length[4];
+
+    auto f_host_tensor_descriptor =
+        [](std::size_t N_, std::size_t C_, std::size_t D, std::size_t H, std::size_t W) {
+            using namespace ck::literals;
+
+            return HostTensorDescriptor({N_, C_, D, H, W},
+                                        {D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_});
+        };
+
+    Tensor<InDataType> in_n_c_di_hi_wi(f_host_tensor_descriptor(N, C, Di, Hi, Wi));
+    Tensor<OutDataType> out_n_c_do_ho_wo_host(f_host_tensor_descriptor(N, C, Do, Ho, Wo));
+    Tensor<IndexDataType> out_indices_n_c_do_ho_wo_host(f_host_tensor_descriptor(N, C, Do, Ho, Wo));
+
+    Tensor<OutDataType> out_n_c_do_ho_wo_device(f_host_tensor_descriptor(N, C, Do, Ho, Wo));
+    Tensor<IndexDataType> out_indices_n_c_do_ho_wo_device(
+        f_host_tensor_descriptor(N, C, Do, Ho, Wo));
+
+    switch(init_method)
+    {
+    case 0: in_n_c_di_hi_wi.GenerateTensorValue(GeneratorTensor_1<InDataType>{}); break;
+    case 1: in_n_c_di_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}); break;
+    default: in_n_c_di_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{-0.5, 0.5});
+    }
+
+    DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_di_hi_wi.mDesc.GetElementSpaceSize());
+    DeviceMem out_device_buf(sizeof(OutDataType) *
+                             out_n_c_do_ho_wo_device.mDesc.GetElementSpaceSize());
+    DeviceMem out_indices_device_buf(sizeof(IndexDataType) *
+                                     out_indices_n_c_do_ho_wo_device.mDesc.GetElementSpaceSize());
+
+    in_device_buf.ToDevice(in_n_c_di_hi_wi.mData.data());
+
+    // device pooling op interface used to look up the registered instances
+    using DeviceOp = ck::tensor_operation::device::DevicePoolFwd<InOutRank,
+                                                                 WindowRank,
+                                                                 InDataType,
+                                                                 OutDataType,
+                                                                 IndexDataType,
+                                                                 ReduceOpId,
+                                                                 OutputIndex>;
+
+    // get device op instances
+    const auto instance_ptrs =
+        ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+            DeviceOp>::GetInstances();
+
+    std::cout << "found " << instance_ptrs.size() << " instances" << std::endl;
+
+    std::string best_instance_name;
+    float best_avg_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+
+    if(do_verification)
+    {
+        using ReferenceInstance = ck::tensor_operation::host::ReferencePoolingFwd<InOutRank,
+                                                                                  WindowRank,
+                                                                                  InDataType,
+                                                                                  OutDataType,
+                                                                                  ComputeDataType,
+                                                                                  IndexDataType,
+                                                                                  ReduceOpId,
+                                                                                  PropagateNan,
+                                                                                  OutputIndex>;
+
+        ReferenceInstance ref;
+        auto ref_argument = ref.MakeArgument(in_n_c_di_hi_wi,
+                                             out_n_c_do_ho_wo_host,
+                                             out_indices_n_c_do_ho_wo_host,
+                                             window_spatial_lengths,
+                                             window_strides,
+                                             input_left_pads,
+                                             input_right_pads);
+        auto ref_invoker  = ref.MakeInvoker();
+        ref_invoker.Run(ref_argument);
+    }
+
+    int num_kernel = 0;
+
+    for(auto& inst_ptr : instance_ptrs)
+    {
+        auto argument_ptr = inst_ptr->MakeArgumentPointer(
+            static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
+            static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+            static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()),
+            in_length,
+            window_spatial_lengths,
+            out_length,
+            {Di * C * Hi * Wi, 1, C * Hi * Wi, Wi * C, C},
+            {Do * C * Ho * Wo, 1, C * Ho * Wo, Wo * C, C},
+            {Do * C * Ho * Wo, 1, C * Ho * Wo, Wo * C, C},
+            window_strides,
+            input_left_pads,
+            input_right_pads,
+            {2, 3, 4});
+
+        if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            ++num_kernel;
+        }
+        else
+        {
+            if(time_kernel)
+            {
+                std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: ";
+                LogRange(std::cout << "input lengths = ", in_length, ", ") << std::endl;
+            }
+
+            continue;
+        }
+
+        auto invoker_ptr = inst_ptr->MakeInvokerPointer();
+
+        float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+
+        std::size_t num_bytes = in_n_c_di_hi_wi.mDesc.GetElementSize() * sizeof(InDataType) +
+                                out_n_c_do_ho_wo_host.mDesc.GetElementSize() * sizeof(OutDataType);
+
+        if constexpr(OutputIndex)
+            num_bytes +=
+                out_indices_n_c_do_ho_wo_host.mDesc.GetElementSize() * sizeof(IndexDataType);
+
+        float gb_per_sec = num_bytes / 1.E6 / avg_time;
+
+        if(time_kernel)
+            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
+                      << inst_ptr->GetTypeString() << std::endl;
+
+        if(avg_time < best_avg_time)
+        {
+            best_instance_name = inst_ptr->GetTypeString();
+            best_avg_time      = avg_time;
+            best_gb_per_sec    = gb_per_sec;
+        }
+
+        if(do_verification)
+        {
+            out_device_buf.FromDevice(out_n_c_do_ho_wo_device.mData.data());
+
+            bool pass = ck::utils::check_err(out_n_c_do_ho_wo_device.mData,
+                                             out_n_c_do_ho_wo_host.mData,
+                                             "Error: Incorrect results",
+                                             1e-3,
+                                             1e-3);
+
+            if constexpr(OutputIndex)
+            {
+                out_indices_device_buf.FromDevice(out_indices_n_c_do_ho_wo_device.mData.data());
+
+                pass = pass && ck::utils::check_err(out_indices_n_c_do_ho_wo_device,
+                                                    out_indices_n_c_do_ho_wo_host);
+            }
+
+            if(do_log)
+            {
+                LogRangeAsType<float>(
+                    std::cout << "in_n_c_di_hi_wi  : ", in_n_c_di_hi_wi.mData, ",")
+                    << std::endl;
+                LogRangeAsType<float>(
+                    std::cout << "out_n_c_do_ho_wo_host  : ", out_n_c_do_ho_wo_host.mData, ",")
+                    << std::endl;
+                LogRangeAsType<float>(
+                    std::cout << "out_n_c_do_ho_wo_device  : ", out_n_c_do_ho_wo_device.mData, ",")
+                    << std::endl;
+
+                if constexpr(OutputIndex)
+                    LogRangeAsType<float>(std::cout << "out_indices_n_c_do_ho_wo_device  : ",
+                                          out_indices_n_c_do_ho_wo_device.mData,
+                                          ",")
+                        << std::endl;
+            }
+
+            if(!pass)
+            {
+                std::cout << inst_ptr->GetTypeString() << " failed verification: ";
+                LogRange(std::cout << "lengths = [", in_length, ", ") << "]." << std::endl;
+                return false;
+            }
+            else
+            {
+                if(time_kernel)
+                    std::cout << "pass" << std::endl;
+            }
+        }
+    }
+
+    if(time_kernel)
+    {
+        LogRange(std::cout << "length = ", in_length, ",") << std::endl;
+        std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
+                  << best_instance_name << std::endl;
+    }
+
+    if(num_kernel == 0)
+    {
+        std::cout << "Error: No kernel is applicable" << std::endl;
+        return false;
+    }
+
+    return true;
+}
+
+} // namespace profiler
+} // namespace ck
diff --git a/profiler/src/CMakeLists.txt b/profiler/src/CMakeLists.txt
index 0a50eedb7..c9fccc258 100644
--- a/profiler/src/CMakeLists.txt
+++ b/profiler/src/CMakeLists.txt
@@ -25,6 +25,8 @@ set(PROFILER_SOURCES
     profile_reduce.cpp
     profile_groupnorm.cpp
     profile_layernorm.cpp
+    profile_avg_pool2d_fwd.cpp
+    profile_max_pool3d_fwd.cpp
     profile_softmax.cpp
     profile_batchnorm_fwd.cpp
     profile_batchnorm_bwd.cpp
@@ -74,4 +76,6 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batchnorm_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fastgelu_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_scale_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool_fwd_instance)
+
 rocm_install(TARGETS ${PROFILER_EXECUTABLE} COMPONENT profiler)
diff --git a/profiler/src/profile_avg_pool2d_fwd.cpp b/profiler/src/profile_avg_pool2d_fwd.cpp
new file mode 100644
index 000000000..b92288096
--- /dev/null
+++ b/profiler/src/profile_avg_pool2d_fwd.cpp
@@ -0,0 +1,141 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <vector>
+#include <unordered_map>
+
+#include "profiler/data_type_enum.hpp"
+#include "profiler/profile_pool2d_fwd_impl.hpp"
+#include "profiler_operation_registry.hpp"
+
+using ck::index_t;
+
+struct avgPoolFwdArgParser
+{
+    std::unordered_map<std::string, std::vector<int>> long_opts = {
+        {"length", {}}, {"wsize", {}}, {"wstride", {}}, {"pad1", {}}, {"pad2", {}}};
+
+    bool parse_opt(int argc, char* argv[], const std::string& key, int i)
+    {
+        if(std::string("--") + key == argv[i])
+        {
+            int pos = i;
+            while(++i < argc && argv[i][0] != '-') {}
+            int end = i;
+            for(int j = pos + 1; j < end; j++)
+            {
+                long_opts[key].push_back(std::stoi(argv[j]));
+            }
+            return true;
+        }
+        return false;
+    }
+
+    void operator()(int argc, char* argv[])
+    {
+        for(auto& kv : long_opts)
+        {
+            for(int i = 1; i < argc; i++)
+            {
+                if(parse_opt(argc, argv, kv.first, i))
+                    break;
+            }
+        }
+    }
+};
+
+void print_help_avg_pool2d_fwd() // prints the command-line usage of the avg_pool2d_fwd profiler
+{
+    std::cout << "arg1: data type (0: fp16; 1: fp32)\n"
+              << "arg2: verification (0: no; 1: yes)\n"
+              << "arg3: initialization (0: no init; 1: integer value; 2: decimal value)\n"
+              << "arg4: print tensor value (0: no; 1: yes)\n"
+              << "arg5: time kernel (0=no, 1=yes)\narg6: placeholder, value is ignored but must be present (e.g. 0)\n"
+              << "--length: input tensor length for NCHW(e.g, --length 2 32 30 30) \n"
+              << "--wsize: window size for YX (e.g, --wsize 2 2) \n"
+              << "--wstride: window stride for HW (e.g, --wstride 2 2) \n"
+              << "--pad1: left side of padding in HW (e.g, --pad1 1 1) \n"
+              << "--pad2: right side of padding in HW (e.g, --pad2 1 1) \n"
+              << "eg: ckProfiler avg_pool2d_fwd 0 1 2 0 1 0 --length 2 32 30 30 --wsize 2 2 "
+                 "--wstride 2 2 --pad1 1 1 --pad2 1 1"
+              << std::endl;
+}
+
+int profile_avg_pool2d_fwd(int argc, char* argv[]) // ckProfiler entry point; returns exit status
+{
+    ck::DataTypeEnum data_type = ck::DataTypeEnum::Half;
+    bool do_verification       = true;
+    int init_method            = 0;
+    bool do_log                = false;
+    bool time_kernel           = true;
+
+    std::vector<index_t> in_length = {2, 32, 30, 30};
+    std::vector<index_t> wsize     = {2, 2};
+    std::vector<index_t> wstride   = {2, 2};
+    std::vector<index_t> pad1      = {1, 1};
+    std::vector<index_t> pad2      = {1, 1};
+
+    if(argc != 2 && argc != 25)
+    {
+        print_help_avg_pool2d_fwd();
+        return 0;
+    }
+    else if(argc == 25)
+    {
+        data_type       = static_cast<ck::DataTypeEnum>(std::stoi(argv[2]));
+        do_verification = std::stoi(argv[3]);
+        init_method     = std::stoi(argv[4]);
+        do_log          = std::stoi(argv[5]);
+        time_kernel     = std::stoi(argv[6]);
+
+        // parse the long options
+        avgPoolFwdArgParser arg_parser;
+        arg_parser(argc, argv);
+        in_length = arg_parser.long_opts["length"];
+        wsize     = arg_parser.long_opts["wsize"];
+        wstride   = arg_parser.long_opts["wstride"];
+        pad1      = arg_parser.long_opts["pad1"];
+        pad2      = arg_parser.long_opts["pad2"];
+    }
+
+    using F16                 = ck::half_t;
+    using F32                 = float;
+    using I32                 = int32_t;
+    constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
+    bool pass                 = true; // outcome reported by profile_pool2d_fwd_impl
+    if(data_type == ck::DataTypeEnum::Half)
+    {
+        pass = ck::profiler::profile_pool2d_fwd_impl<F16, F16, F32, I32, ReduceOpId, false, false>(
+            do_verification,
+            init_method,
+            do_log,
+            time_kernel,
+            in_length,
+            wsize,
+            wstride,
+            pad1,
+            pad2);
+    }
+    else if(data_type == ck::DataTypeEnum::Float)
+    {
+        pass = ck::profiler::profile_pool2d_fwd_impl<F32, F32, F32, I32, ReduceOpId, false, false>(
+            do_verification,
+            init_method,
+            do_log,
+            time_kernel,
+            in_length,
+            wsize,
+            wstride,
+            pad1,
+            pad2);
+    }
+    else
+    {
+        throw std::runtime_error("not implemented yet");
+    }
+    // propagate the impl result: non-zero exit status when the run reported failure
+    return pass ? 0 : 1;
+}
+
+REGISTER_PROFILER_OPERATION("avg_pool2d_fwd", "avg_pool2d fwd", profile_avg_pool2d_fwd);
diff --git a/profiler/src/profile_groupnorm.cpp b/profiler/src/profile_groupnorm.cpp
index 2741f5271..d55529a0f 100644
--- a/profiler/src/profile_groupnorm.cpp
+++ b/profiler/src/profile_groupnorm.cpp
@@ -64,7 +64,7 @@ int profile_groupnorm(int argc, char* argv[])
     ck::DataTypeEnum data_type  = ck::DataTypeEnum::Half;
     bool do_verification        = false;
     int init_method             = 0;
-    bool do_log                 = 0;
+    bool do_log                 = false;
     bool time_kernel            = 1;
     std::vector<index_t> length = {64, 16, 16, 32, 40};
 
diff --git a/profiler/src/profile_max_pool3d_fwd.cpp b/profiler/src/profile_max_pool3d_fwd.cpp
new file mode 100644
index 000000000..90c6e4e2b
--- /dev/null
+++ b/profiler/src/profile_max_pool3d_fwd.cpp
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <vector>
+#include <unordered_map>
+
+#include "profiler/data_type_enum.hpp"
+#include "profiler/profile_pool3d_fwd_impl.hpp"
+#include "profiler_operation_registry.hpp"
+
+using ck::index_t;
+
+struct maxPoolFwdArgParser
+{
+    std::unordered_map<std::string, std::vector<int>> long_opts = {
+        {"length", {}}, {"wsize", {}}, {"wstride", {}}, {"pad1", {}}, {"pad2", {}}};
+
+    bool parse_opt(int argc, char* argv[], const std::string& key, int i)
+    {
+        if(std::string("--") + key == argv[i])
+        {
+            int pos = i;
+            while(++i < argc && argv[i][0] != '-') {}
+            int end = i;
+            for(int j = pos + 1; j < end; j++)
+            {
+                long_opts[key].push_back(std::stoi(argv[j]));
+            }
+            return true;
+        }
+        return false;
+    }
+
+    void operator()(int argc, char* argv[])
+    {
+        for(auto& kv : long_opts)
+        {
+            for(int i = 1; i < argc; i++)
+            {
+                if(parse_opt(argc, argv, kv.first, i))
+                    break;
+            }
+        }
+    }
+};
+
+void print_help_max_pool3d_fwd()
+{
+    std::cout << "arg1: data type (0: fp16; 1: fp32)\n"
+              << "arg2: verification (0: no; 1: yes)\n"
+              << "arg3: initialization (0: no init; 1: integer value; 2: decimal value)\n"
+              << "arg4: print tensor value (0: no; 1: yes)\n"
+              << "arg5: time kernel (0=no, 1=yes)\n"
+              << "arg6: return index (0=no, 1=yes)\n"
+              << "--length: input tensor length for NCDHW(e.g, --length 2 32 30 30 30) \n"
+              << "--wsize: window size for ZYX (e.g, --wsize 2 2 2) \n"
+              << "--wstride: window stride for DHW (e.g, --wstride 2 2 2) \n"
+              << "--pad1: left side of padding in DHW (e.g, --pad1 1 1 1) \n"
+              << "--pad2: right side of padding in DHW (e.g, --pad2 1 1 1) \n"
+              << "eg: ckProfiler max_pool3d_fwd 0 1 2 0 1 0 --length 2 32 30 30 30 --wsize 2 2 2 "
+                 "--wstride 2 2 2 --pad1 1 1 1 --pad2 1 1 1"
+              << std::endl;
+}
+
+int profile_max_pool3d_fwd(int argc, char* argv[]) // ckProfiler entry point; returns exit status
+{
+    ck::DataTypeEnum data_type = ck::DataTypeEnum::Half;
+    bool do_verification       = true;
+    int init_method            = 0;
+    bool do_log                = false;
+    bool time_kernel           = true;
+    bool return_index          = false;
+
+    std::vector<index_t> in_length = {2, 32, 30, 30, 30};
+    std::vector<index_t> wsize     = {2, 2, 2};
+    std::vector<index_t> wstride   = {2, 2, 2};
+    std::vector<index_t> pad1      = {1, 1, 1};
+    std::vector<index_t> pad2      = {1, 1, 1};
+
+    if(argc != 2 && argc != 30)
+    {
+        print_help_max_pool3d_fwd();
+        return 0;
+    }
+    else if(argc == 30)
+    {
+        data_type       = static_cast<ck::DataTypeEnum>(std::stoi(argv[2]));
+        do_verification = std::stoi(argv[3]);
+        init_method     = std::stoi(argv[4]);
+        do_log          = std::stoi(argv[5]);
+        time_kernel     = std::stoi(argv[6]);
+        return_index    = std::stoi(argv[7]);
+
+        // parse the long options
+        maxPoolFwdArgParser arg_parser;
+        arg_parser(argc, argv);
+        in_length = arg_parser.long_opts["length"];
+        wsize     = arg_parser.long_opts["wsize"];
+        wstride   = arg_parser.long_opts["wstride"];
+        pad1      = arg_parser.long_opts["pad1"];
+        pad2      = arg_parser.long_opts["pad2"];
+    }
+
+    using F16                 = ck::half_t;
+    using F32                 = float;
+    using I32                 = int32_t;
+    constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
+    bool pass                 = true; // outcome reported by profile_pool3d_fwd_impl
+    if(data_type == ck::DataTypeEnum::Half)
+    {
+        if(return_index)
+            pass = ck::profiler::profile_pool3d_fwd_impl<F16, F16, F16, I32, ReduceOpId, false, true>(
+                do_verification,
+                init_method,
+                do_log,
+                time_kernel,
+                in_length,
+                wsize,
+                wstride,
+                pad1,
+                pad2);
+        else
+            pass = ck::profiler::profile_pool3d_fwd_impl<F16, F16, F16, I32, ReduceOpId, false, false>(
+                do_verification,
+                init_method,
+                do_log,
+                time_kernel,
+                in_length,
+                wsize,
+                wstride,
+                pad1,
+                pad2);
+    }
+    else if(data_type == ck::DataTypeEnum::Float)
+    {
+        if(return_index)
+            pass = ck::profiler::profile_pool3d_fwd_impl<F32, F32, F32, I32, ReduceOpId, false, true>(
+                do_verification,
+                init_method,
+                do_log,
+                time_kernel,
+                in_length,
+                wsize,
+                wstride,
+                pad1,
+                pad2);
+        else
+            pass = ck::profiler::profile_pool3d_fwd_impl<F32, F32, F32, I32, ReduceOpId, false, false>(
+                do_verification,
+                init_method,
+                do_log,
+                time_kernel,
+                in_length,
+                wsize,
+                wstride,
+                pad1,
+                pad2);
+    }
+    else
+    {
+        throw std::runtime_error("not implemented yet");
+    }
+    // propagate the impl result: non-zero exit status when the run reported failure
+    return pass ? 0 : 1;
+}
+
+REGISTER_PROFILER_OPERATION("max_pool3d_fwd", "max_pool3d fwd", profile_max_pool3d_fwd);
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 4f212d53a..dad9b53ce 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -57,6 +57,7 @@ add_subdirectory(data_type)
 add_subdirectory(elementwise_normalization)
 add_subdirectory(batchnorm)
 add_subdirectory(contraction)
+add_subdirectory(pool_fwd)
 if(GPU_TARGETS MATCHES "gfx1100")
     add_subdirectory(wmma_op)
 endif()
diff --git a/test/pool_fwd/CMakeLists.txt b/test/pool_fwd/CMakeLists.txt
new file mode 100644
index 000000000..6f59b95f6
--- /dev/null
+++ b/test/pool_fwd/CMakeLists.txt
@@ -0,0 +1,16 @@
+add_custom_target(test_pool_fwd) # umbrella target; the add_dependencies() calls at the end attach all four tests to it
+
+add_gtest_executable(test_avg_pool2d_fwd test_avg_pool2d_fwd.cpp)
+add_gtest_executable(test_avg_pool3d_fwd test_avg_pool3d_fwd.cpp)
+add_gtest_executable(test_max_pool2d_fwd test_max_pool2d_fwd.cpp)
+add_gtest_executable(test_max_pool3d_fwd test_max_pool3d_fwd.cpp)
+# Every pooling test links the same two libraries: utility and device_pool_fwd_instance.
+target_link_libraries(test_avg_pool2d_fwd PRIVATE utility device_pool_fwd_instance)
+target_link_libraries(test_avg_pool3d_fwd PRIVATE utility device_pool_fwd_instance)
+target_link_libraries(test_max_pool2d_fwd PRIVATE utility device_pool_fwd_instance)
+target_link_libraries(test_max_pool3d_fwd PRIVATE utility device_pool_fwd_instance)
+# Building test_pool_fwd builds each of the four test executables declared above.
+add_dependencies(test_pool_fwd test_avg_pool2d_fwd)
+add_dependencies(test_pool_fwd test_avg_pool3d_fwd)
+add_dependencies(test_pool_fwd test_max_pool2d_fwd)
+add_dependencies(test_pool_fwd test_max_pool3d_fwd)
diff --git a/test/pool_fwd/test_avg_pool2d_fwd.cpp b/test/pool_fwd/test_avg_pool2d_fwd.cpp
new file mode 100644
index 000000000..4e5f1e0e9
--- /dev/null
+++ b/test/pool_fwd/test_avg_pool2d_fwd.cpp
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "gtest/gtest.h"
+#include "profiler/profile_pool2d_fwd_impl.hpp"
+#include "test_pool_fwd_common.hpp"
+
+template <typename Tuple> // Tuple = <InDataType, OutDataType, ComputeDataType, IndexDataType>
+class TestAvgPool2dFwd : public ::testing::Test
+{
+    protected:
+    using InDataType      = std::tuple_element_t<0, Tuple>;
+    using OutDataType     = std::tuple_element_t<1, Tuple>;
+    using ComputeDataType = std::tuple_element_t<2, Tuple>;
+    using IndexDataType   = std::tuple_element_t<3, Tuple>;
+
+    std::vector<PoolingParam> params; // cases to run; the test body fills this before calling Run()
+    // Calls profile_pool2d_fwd_impl (AVG reduction) once per case and expects each call to return true.
+    void Run()
+    {
+        for(const auto& param : params) // const reference: no per-iteration copy of the five vectors
+        {
+            bool success =
+                ck::profiler::profile_pool2d_fwd_impl<InDataType,
+                                                      OutDataType,
+                                                      ComputeDataType,
+                                                      IndexDataType,
+                                                      ck::ReduceTensorOp::AVG,
+                                                      false,
+                                                      false>(true, // NOTE(review): presumably do_verification -- confirm
+                                                             2, // NOTE(review): presumably init_method -- confirm
+                                                             false, // NOTE(review): presumably do_log -- confirm
+                                                             false, // NOTE(review): presumably time_kernel -- confirm
+                                                             param.length_,
+                                                             param.window_spatial_lengths_,
+                                                             param.window_strides_,
+                                                             param.input_left_pads_,
+                                                             param.input_right_pads_);
+            EXPECT_TRUE(success);
+        }
+    }
+};
+
+using KernelTypes = // each tuple is <InDataType, OutDataType, ComputeDataType, IndexDataType>
+    ::testing::Types<std::tuple<F16, F16, F32, I32>, std::tuple<F32, F32, F32, I32>>;
+// The fixture is instantiated once per tuple in KernelTypes; every instantiation runs the cases below.
+TYPED_TEST_SUITE(TestAvgPool2dFwd, KernelTypes);
+TYPED_TEST(TestAvgPool2dFwd, Test_Pool)
+{
+    // Each case: {input lengths, window lengths, window strides, left pads, right pads}
+    this->params = {{{1, 1, 1, 1}, {1, 1}, {1, 1}, {0, 0}, {0, 0}}, // every length is 1
+                    {{2, 16, 64, 64}, {64, 64}, {1, 1}, {0, 0}, {0, 0}}, // window equals the last two input lengths
+                    {{2, 32, 30, 30}, {2, 2}, {2, 2}, {1, 1}, {1, 1}}}; // 2x2 window, stride 2, pad 1 on both sides
+
+    this->Run();
+}
diff --git a/test/pool_fwd/test_avg_pool3d_fwd.cpp b/test/pool_fwd/test_avg_pool3d_fwd.cpp
new file mode 100644
index 000000000..0d6b105b1
--- /dev/null
+++ b/test/pool_fwd/test_avg_pool3d_fwd.cpp
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "gtest/gtest.h"
+#include "profiler/profile_pool3d_fwd_impl.hpp"
+#include "test_pool_fwd_common.hpp"
+
+template <typename Tuple> // Tuple = <InDataType, OutDataType, ComputeDataType, IndexDataType>
+class TestAvgPool3dFwd : public ::testing::Test
+{
+    protected:
+    using InDataType      = std::tuple_element_t<0, Tuple>;
+    using OutDataType     = std::tuple_element_t<1, Tuple>;
+    using ComputeDataType = std::tuple_element_t<2, Tuple>;
+    using IndexDataType   = std::tuple_element_t<3, Tuple>;
+
+    std::vector<PoolingParam> params; // cases to run; the test body fills this before calling Run()
+    // Calls profile_pool3d_fwd_impl (AVG reduction) once per case and expects each call to return true.
+    void Run()
+    {
+        for(const auto& param : params) // const reference: no per-iteration copy of the five vectors
+        {
+            bool success =
+                ck::profiler::profile_pool3d_fwd_impl<InDataType,
+                                                      OutDataType,
+                                                      ComputeDataType,
+                                                      IndexDataType,
+                                                      ck::ReduceTensorOp::AVG,
+                                                      false,
+                                                      false>(true, // do_verification (name as used at the profiler call site)
+                                                             2, // init_method
+                                                             false, // do_log
+                                                             false, // time_kernel
+                                                             param.length_,
+                                                             param.window_spatial_lengths_,
+                                                             param.window_strides_,
+                                                             param.input_left_pads_,
+                                                             param.input_right_pads_);
+            EXPECT_TRUE(success);
+        }
+    }
+};
+
+using KernelTypes = // each tuple is <InDataType, OutDataType, ComputeDataType, IndexDataType>
+    ::testing::Types<std::tuple<F16, F16, F32, I32>, std::tuple<F32, F32, F32, I32>>;
+// The fixture is instantiated once per tuple in KernelTypes; every instantiation runs the cases below.
+TYPED_TEST_SUITE(TestAvgPool3dFwd, KernelTypes);
+TYPED_TEST(TestAvgPool3dFwd, Test_Pool)
+{
+    // Each case: {input lengths, window lengths, window strides, left pads, right pads}
+    this->params = {{{1, 1, 1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}}, // every length is 1
+                    {{2, 16, 64, 64, 64}, {64, 64, 64}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}}, // window equals the last three input lengths
+                    {{2, 32, 30, 30, 30}, {2, 2, 2}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}}}; // 2x2x2 window, stride 2, pad 1 on both sides
+
+    this->Run();
+}
diff --git a/test/pool_fwd/test_max_pool2d_fwd.cpp b/test/pool_fwd/test_max_pool2d_fwd.cpp
new file mode 100644
index 000000000..d16ac7fab
--- /dev/null
+++ b/test/pool_fwd/test_max_pool2d_fwd.cpp
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "gtest/gtest.h"
+#include "profiler/profile_pool2d_fwd_impl.hpp"
+#include "test_pool_fwd_common.hpp"
+
+template <typename Tuple> // Tuple = <InDataType, OutDataType, ComputeDataType, IndexDataType>
+class TestMaxPool2dFwd : public ::testing::Test
+{
+    protected:
+    using InDataType      = std::tuple_element_t<0, Tuple>;
+    using OutDataType     = std::tuple_element_t<1, Tuple>;
+    using ComputeDataType = std::tuple_element_t<2, Tuple>;
+    using IndexDataType   = std::tuple_element_t<3, Tuple>;
+
+    std::vector<PoolingParam> params; // cases to run; the test body fills this before calling Run()
+    // Calls profile_pool2d_fwd_impl (MAX reduction) twice per case, without and with index output; both must return true.
+    void Run()
+    {
+        for(const auto& param : params) // const reference: no per-iteration copy of the five vectors
+        {
+            // max pooling without index output (last template argument is false)
+            bool success =
+                ck::profiler::profile_pool2d_fwd_impl<InDataType,
+                                                      OutDataType,
+                                                      ComputeDataType,
+                                                      IndexDataType,
+                                                      ck::ReduceTensorOp::MAX,
+                                                      false,
+                                                      false>(true, // NOTE(review): presumably do_verification -- confirm
+                                                             2, // NOTE(review): presumably init_method -- confirm
+                                                             false, // NOTE(review): presumably do_log -- confirm
+                                                             false, // NOTE(review): presumably time_kernel -- confirm
+                                                             param.length_,
+                                                             param.window_spatial_lengths_,
+                                                             param.window_strides_,
+                                                             param.input_left_pads_,
+                                                             param.input_right_pads_);
+            EXPECT_TRUE(success);
+
+            // max pooling with index output (same call, last template argument is true)
+            success = ck::profiler::profile_pool2d_fwd_impl<InDataType,
+                                                            OutDataType,
+                                                            ComputeDataType,
+                                                            IndexDataType,
+                                                            ck::ReduceTensorOp::MAX,
+                                                            false,
+                                                            true>(true,
+                                                                  2,
+                                                                  false,
+                                                                  false,
+                                                                  param.length_,
+                                                                  param.window_spatial_lengths_,
+                                                                  param.window_strides_,
+                                                                  param.input_left_pads_,
+                                                                  param.input_right_pads_);
+            EXPECT_TRUE(success);
+        }
+    }
+};
+
+using KernelTypes = // each tuple is <InDataType, OutDataType, ComputeDataType, IndexDataType>
+    ::testing::Types<std::tuple<F16, F16, F16, I32>, std::tuple<F32, F32, F32, I32>>;
+// The fixture is instantiated once per tuple in KernelTypes; every instantiation runs the cases below.
+TYPED_TEST_SUITE(TestMaxPool2dFwd, KernelTypes);
+TYPED_TEST(TestMaxPool2dFwd, Test_Pool)
+{
+    // Each case: {input lengths, window lengths, window strides, left pads, right pads}
+    this->params = {{{1, 1, 1, 1}, {1, 1}, {1, 1}, {0, 0}, {0, 0}}, // every length is 1
+                    {{2, 16, 64, 64}, {64, 64}, {1, 1}, {0, 0}, {0, 0}}, // window equals the last two input lengths
+                    {{2, 32, 30, 30}, {2, 2}, {2, 2}, {1, 1}, {1, 1}}}; // 2x2 window, stride 2, pad 1 on both sides
+
+    this->Run();
+}
diff --git a/test/pool_fwd/test_max_pool3d_fwd.cpp b/test/pool_fwd/test_max_pool3d_fwd.cpp
new file mode 100644
index 000000000..f084dd9cb
--- /dev/null
+++ b/test/pool_fwd/test_max_pool3d_fwd.cpp
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "gtest/gtest.h"
+#include "profiler/profile_pool3d_fwd_impl.hpp"
+#include "test_pool_fwd_common.hpp"
+
+template <typename Tuple> // Tuple = <InDataType, OutDataType, ComputeDataType, IndexDataType>
+class TestMaxPool3dFwd : public ::testing::Test
+{
+    protected:
+    using InDataType      = std::tuple_element_t<0, Tuple>;
+    using OutDataType     = std::tuple_element_t<1, Tuple>;
+    using ComputeDataType = std::tuple_element_t<2, Tuple>;
+    using IndexDataType   = std::tuple_element_t<3, Tuple>;
+
+    std::vector<PoolingParam> params; // cases to run; the test body fills this before calling Run()
+    // Calls profile_pool3d_fwd_impl (MAX reduction) twice per case, without and with index output; both must return true.
+    void Run()
+    {
+        for(const auto& param : params) // const reference: no per-iteration copy of the five vectors
+        {
+            // max pooling without index output (last template argument is false)
+            bool success =
+                ck::profiler::profile_pool3d_fwd_impl<InDataType,
+                                                      OutDataType,
+                                                      ComputeDataType,
+                                                      IndexDataType,
+                                                      ck::ReduceTensorOp::MAX,
+                                                      false,
+                                                      false>(true, // do_verification (name as used at the profiler call site)
+                                                             2, // init_method
+                                                             false, // do_log
+                                                             false, // time_kernel
+                                                             param.length_,
+                                                             param.window_spatial_lengths_,
+                                                             param.window_strides_,
+                                                             param.input_left_pads_,
+                                                             param.input_right_pads_);
+            EXPECT_TRUE(success);
+
+            // max pooling with index output (same call, last template argument is true)
+            success = ck::profiler::profile_pool3d_fwd_impl<InDataType,
+                                                            OutDataType,
+                                                            ComputeDataType,
+                                                            IndexDataType,
+                                                            ck::ReduceTensorOp::MAX,
+                                                            false,
+                                                            true>(true,
+                                                                  2,
+                                                                  false,
+                                                                  false,
+                                                                  param.length_,
+                                                                  param.window_spatial_lengths_,
+                                                                  param.window_strides_,
+                                                                  param.input_left_pads_,
+                                                                  param.input_right_pads_);
+            EXPECT_TRUE(success);
+        }
+    }
+};
+
+using KernelTypes = // each tuple is <InDataType, OutDataType, ComputeDataType, IndexDataType>
+    ::testing::Types<std::tuple<F16, F16, F16, I32>, std::tuple<F32, F32, F32, I32>>;
+// The fixture is instantiated once per tuple in KernelTypes; every instantiation runs the cases below.
+TYPED_TEST_SUITE(TestMaxPool3dFwd, KernelTypes);
+TYPED_TEST(TestMaxPool3dFwd, Test_Pool)
+{
+    // Each case: {input lengths, window lengths, window strides, left pads, right pads}
+    this->params = {{{1, 1, 1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}}, // every length is 1
+                    {{2, 16, 64, 64, 64}, {64, 64, 64}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}}, // window equals the last three input lengths
+                    {{2, 32, 30, 30, 30}, {2, 2, 2}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}}}; // 2x2x2 window, stride 2, pad 1 on both sides
+
+    this->Run();
+}
diff --git a/test/pool_fwd/test_pool_fwd_common.hpp b/test/pool_fwd/test_pool_fwd_common.hpp
new file mode 100644
index 000000000..a78785171
--- /dev/null
+++ b/test/pool_fwd/test_pool_fwd_common.hpp
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "gtest/gtest.h"
+#include "ck/ck.hpp"
+
+using F16 = ck::half_t;
+using F32 = float;
+using I32 = int32_t;
+using ck::index_t;
+
+struct PoolingParam // one pooling test case: five index vectors, stored as copies
+{
+    PoolingParam(const std::vector<index_t>& length, // each argument is copied into the member of the same name
+                 const std::vector<index_t>& window_spatial_lengths,
+                 const std::vector<index_t>& window_strides,
+                 const std::vector<index_t>& input_left_pads,
+                 const std::vector<index_t>& input_right_pads)
+        : length_(length),
+          window_spatial_lengths_(window_spatial_lengths),
+          window_strides_(window_strides),
+          input_left_pads_(input_left_pads),
+          input_right_pads_(input_right_pads)
+    {
+    }
+    std::vector<index_t> length_;                 // input lengths (the tests pass 4 entries for 2D, 5 for 3D)
+    std::vector<index_t> window_spatial_lengths_; // pooling window lengths (2 entries for 2D, 3 for 3D)
+    std::vector<index_t> window_strides_;         // window strides, same entry count as the window lengths
+    std::vector<index_t> input_left_pads_;        // left padding, same entry count as the window lengths
+    std::vector<index_t> input_right_pads_;       // right padding, same entry count as the window lengths
+};
-- 
GitLab


From ac9e01e2cc3721be24619807adc444e1f59a9d25 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Wed, 24 May 2023 08:11:25 -0700
Subject: [PATCH 42/71] Clean-up the headers (#713)

* fix headers for gpu instances

* remove unused headers

---------

Co-authored-by: zjing14 <zhangjing14@gmail.com>
---
 ...volution_into_gemm_v4r1_nhwc_kyxc_nhwk.hpp |  275 ---
 ...lution_into_gemm_v4r1r2_nhwc_kyxc_nhwk.hpp |  355 ----
 ...into_gemm_v4r4r2_atomic_nchw_kcyx_nkhw.hpp |  150 --
 ...lution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp |  132 --
 ...into_gemm_v4r4r4_atomic_nhwc_kyxc_nhwk.hpp |  150 --
 ...lution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp |  135 --
 ...lution_into_gemm_v4r4r5_nhwc_kyxc_nhwk.hpp |  147 --
 ...volution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp |  260 ---
 ...volution_into_gemm_v4r4_nhwc_kyxc_nhwk.hpp |  179 --
 ...lution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp |  132 --
 ...lution_into_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp |  132 --
 ...lution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp |  134 --
 ...volution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp |  135 --
 .../impl/device_gemm_bias_e_permute_xdl.hpp   |  586 ------
 .../grid/gridwise_contraction_dlops_v1r2.hpp  |  662 -------
 .../gpu/grid/gridwise_gemm_dlops_v1r2.hpp     |  608 -------
 .../gpu/grid/gridwise_gemm_dlops_v2.hpp       |  461 -----
 .../gpu/grid/gridwise_gemm_dlops_v3.hpp       | 1597 -----------------
 .../threadwise_tensor_slice_transfer_v3r3.hpp |  886 ---------
 include/ck/utility/amd_llvm_intrinsic.hpp     |   14 -
 include/ck/utility/print.hpp                  |   25 -
 .../cpu/reference_gemm_bias_2d.hpp            |  136 --
 .../cpu/reference_gemm_bias_activation.hpp    |  140 --
 .../reference_gemm_bias_activation_add.hpp    |  148 --
 .../gpu/batched_gemm.hpp                      |    4 +-
 .../gpu/batched_gemm_add_relu_gemm_add.hpp    |    4 +-
 ...batched_gemm_bias_softmax_gemm_permute.hpp |    4 +-
 .../gpu/batched_gemm_gemm.hpp                 |    4 +-
 .../gpu/batched_gemm_softmax_gemm_permute.hpp |    4 +-
 .../gpu/contraction_bilinear.hpp              |    2 -
 .../gpu/contraction_scale.hpp                 |    2 -
 .../gpu/convolution_backward_data.hpp         |    4 +-
 .../gpu/convolution_forward.hpp               |    4 +-
 .../gpu/device_elementwise_instance.hpp       |    3 +-
 .../device_gemm_mean_squaremean_instance.hpp  |    2 +-
 .../tensor_operation_instance/gpu/gemm.hpp    |    2 -
 .../gpu/gemm_add_add_fastgelu.hpp             |    2 -
 .../gpu/gemm_bilinear.hpp                     |    2 -
 .../gpu/gemm_splitk.hpp                       |    4 +-
 .../gpu/grouped_convolution_forward.hpp       |    2 +-
 .../gpu/grouped_gemm.hpp                      |    4 +-
 .../gpu/normalization.hpp                     |    4 +-
 .../include/ck/library/utility/host_conv.hpp  |  152 --
 .../ck/library/utility/op_instance_engine.hpp |  249 ---
 .../profiler/data_type_enum_helper.hpp        |   77 -
 .../profiler/profile_convnd_bwd_data_impl.hpp |  486 -----
 .../profile_convnd_bwd_weight_impl.hpp        |  474 -----
 47 files changed, 23 insertions(+), 9051 deletions(-)
 delete mode 100644 include/ck/problem_transform/transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk.hpp
 delete mode 100644 include/ck/problem_transform/transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk.hpp
 delete mode 100644 include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r2_atomic_nchw_kcyx_nkhw.hpp
 delete mode 100644 include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp
 delete mode 100644 include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r4_atomic_nhwc_kyxc_nhwk.hpp
 delete mode 100644 include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp
 delete mode 100644 include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r5_nhwc_kyxc_nhwk.hpp
 delete mode 100644 include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp
 delete mode 100644 include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4_nhwc_kyxc_nhwk.hpp
 delete mode 100644 include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp
 delete mode 100644 include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp
 delete mode 100644 include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp
 delete mode 100644 include/ck/problem_transform/transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp
 delete mode 100644 include/ck/tensor_operation/gpu/device/impl/device_gemm_bias_e_permute_xdl.hpp
 delete mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_contraction_dlops_v1r2.hpp
 delete mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v1r2.hpp
 delete mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v2.hpp
 delete mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v3.hpp
 delete mode 100644 include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r3.hpp
 delete mode 100644 include/ck/utility/amd_llvm_intrinsic.hpp
 delete mode 100644 include/ck/utility/print.hpp
 delete mode 100644 library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_2d.hpp
 delete mode 100644 library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation.hpp
 delete mode 100644 library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation_add.hpp
 delete mode 100644 library/include/ck/library/utility/host_conv.hpp
 delete mode 100644 library/include/ck/library/utility/op_instance_engine.hpp
 delete mode 100644 profiler/include/profiler/data_type_enum_helper.hpp
 delete mode 100644 profiler/include/profiler/profile_convnd_bwd_data_impl.hpp
 delete mode 100644 profiler/include/profiler/profile_convnd_bwd_weight_impl.hpp

diff --git a/include/ck/problem_transform/transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk.hpp b/include/ck/problem_transform/transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk.hpp
deleted file mode 100644
index db8e48df6..000000000
--- a/include/ck/problem_transform/transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk.hpp
+++ /dev/null
@@ -1,275 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#ifndef CK_TRANSFORM_BACKWARD_DATA_CONVOLUTION_INTO_GEMM_V4R1_NHWC_KYXC_NHWK_HPP
-#define CK_TRANSFORM_BACKWARD_DATA_CONVOLUTION_INTO_GEMM_V4R1_NHWC_KYXC_NHWK_HPP
-
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-
-namespace ck {
-
-// Number of GEMMs = YTilde * XTilde
-// GemmM = C
-// GemmN = N * HTildeSlice * WTildeSlice
-// GemmK = K * YDotSlice * XDotSlice
-template <typename... Wei,
-          typename... In,
-          typename... Out,
-          typename ConvStrides,
-          typename ConvDilations,
-          typename InLeftPads,
-          typename InRightPads,
-          index_t IYTildeValue,
-          index_t IXTildeValue,
-          index_t GemmK1Value>
-__host__ __device__ constexpr auto
-transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk(
-    const TensorDescriptor<Wei...>& wei_k_y_x_c_grid_desc,
-    const TensorDescriptor<Out...>& out_n_ho_wo_k_grid_desc,
-    const TensorDescriptor<In...>& in_n_hi_wi_c_grid_desc,
-    const ConvStrides& conv_strides,
-    const ConvDilations& conv_dilations,
-    const InLeftPads& in_left_pads,
-    const InRightPads& in_right_pads,
-    Number<IYTildeValue>,
-    Number<IXTildeValue>,
-    Number<GemmK1Value>)
-{
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-
-    constexpr auto GemmK1  = Number<GemmK1Value>{};
-    constexpr auto IYTilde = Number<IYTildeValue>{};
-    constexpr auto IXTilde = Number<IXTildeValue>{};
-
-    const auto N = in_n_hi_wi_c_grid_desc.GetLength(I0);
-    const auto C = in_n_hi_wi_c_grid_desc.GetLength(I3);
-    const auto K = out_n_ho_wo_k_grid_desc.GetLength(I3);
-
-    const auto Hi = in_n_hi_wi_c_grid_desc.GetLength(I1);
-    const auto Wi = in_n_hi_wi_c_grid_desc.GetLength(I2);
-
-    const auto Ho = out_n_ho_wo_k_grid_desc.GetLength(I1);
-    const auto Wo = out_n_ho_wo_k_grid_desc.GetLength(I2);
-
-    const auto Y = wei_k_y_x_c_grid_desc.GetLength(I1);
-    const auto X = wei_k_y_x_c_grid_desc.GetLength(I2);
-
-    const auto ConvStrideH = conv_strides[I0];
-    const auto ConvStrideW = conv_strides[I1];
-
-    const auto ConvDilationH = conv_dilations[I0];
-    const auto ConvDilationW = conv_dilations[I1];
-
-    const auto InLeftPadH = in_left_pads[I0];
-    const auto InLeftPadW = in_left_pads[I1];
-
-    const auto InRightPadH = in_right_pads[I0];
-    const auto InRightPadW = in_right_pads[I1];
-
-    const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH);
-    const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW);
-
-    const auto YTilde = ConvStrideH / GcdStrideDilationH;
-    const auto XTilde = ConvStrideW / GcdStrideDilationW;
-
-    const auto YDot = math::integer_divide_ceil(Y, YTilde);
-    const auto XDot = math::integer_divide_ceil(X, XTilde);
-
-    const auto HTilde = Ho + math::integer_divide_ceil(ConvDilationH * (Y - I1), ConvStrideH);
-    const auto WTilde = Wo + math::integer_divide_ceil(ConvDilationW * (X - I1), ConvStrideW);
-
-    // only work on HTilde and WTilde that contribute to non-padding area of input tensor
-    const auto IHTildeSliceBegin = math::integer_divide_floor(
-        math::max(I0, InLeftPadH - ConvDilationH * (YTilde - I1)), ConvStrideH);
-    const auto IWTildeSliceBegin = math::integer_divide_floor(
-        math::max(I0, InLeftPadW - ConvDilationW * (XTilde - I1)), ConvStrideW);
-
-    const auto IHTildeSliceEnd =
-        math::min(HTilde, math::integer_divide_ceil(InLeftPadH + Hi - I1, ConvStrideH) + I1);
-    const auto IWTildeSliceEnd =
-        math::min(WTilde, math::integer_divide_ceil(InLeftPadW + Wi - I1, ConvStrideW) + I1);
-
-    const auto HTildeSlice = IHTildeSliceEnd - IHTildeSliceBegin;
-    const auto WTildeSlice = IWTildeSliceEnd - IWTildeSliceBegin;
-
-    // GemmK is different for each GEMM
-    const auto YDotSlice = math::integer_divide_ceil(Y - IYTilde, YTilde);
-    const auto XDotSlice = math::integer_divide_ceil(X - IXTilde, XTilde);
-
-    const auto K1 = GemmK1;
-    const auto K0 = K / K1;
-
-    // weight tensor
-    const auto wei_k_ydot_ytilde_xdot_xtilde_c_grid_desc = transform_tensor_descriptor(
-        wei_k_y_x_c_grid_desc,
-        make_tuple(make_pass_through_transform(K),
-                   make_embed_transform(make_tuple(YDot, YTilde),
-                                        make_tuple(ConvStrideH / GcdStrideDilationH, I1)),
-                   make_embed_transform(make_tuple(XDot, XTilde),
-                                        make_tuple(ConvStrideW / GcdStrideDilationW, I1)),
-                   make_pass_through_transform(C)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-        make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
-
-    const auto wei_k0_k1_ydotslice_xdotslice_c_grid_desc =
-        transform_tensor_descriptor(wei_k_ydot_ytilde_xdot_xtilde_c_grid_desc,
-                                    make_tuple(make_unmerge_transform(make_tuple(K0, K1)),
-                                               make_slice_transform(YDot, I0, YDotSlice),
-                                               make_slice_transform(XDot, I0, XDotSlice),
-                                               make_freeze_transform(IYTilde),
-                                               make_freeze_transform(IXTilde),
-                                               make_pass_through_transform(C)),
-                                    make_tuple(Sequence<0>{},
-                                               Sequence<1>{},
-                                               Sequence<3>{},
-                                               Sequence<2>{},
-                                               Sequence<4>{},
-                                               Sequence<5>{}),
-                                    make_tuple(Sequence<0, 1>{},
-                                               Sequence<2>{},
-                                               Sequence<3>{},
-                                               Sequence<>{},
-                                               Sequence<>{},
-                                               Sequence<4>{}));
-
-#if 1
-    const auto wei_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor(
-        wei_k0_k1_ydotslice_xdotslice_c_grid_desc,
-        make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K0)),
-                   make_pass_through_transform(C),
-                   make_pass_through_transform(K1)),
-        make_tuple(Sequence<2, 3, 0>{}, Sequence<4>{}, Sequence<1>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
-#else
-    const auto wei_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor(
-        wei_k0_k1_ydotslice_xdotslice_c_grid_desc,
-        make_tuple(make_merge_transform(make_tuple(K0, YDotSlice, XDotSlice)),
-                   make_pass_through_transform(C),
-                   make_pass_through_transform(K1)),
-        make_tuple(Sequence<0, 2, 3>{}, Sequence<4>{}, Sequence<1>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
-#endif
-
-    // output tensor
-    // this add padding check
-    const auto out_n_hop_wop_k_grid_desc = transform_tensor_descriptor(
-        out_n_ho_wo_k_grid_desc,
-        make_tuple(make_pass_through_transform(N),
-                   make_pad_transform(Ho, I0, I0),
-                   make_pad_transform(Wo, I0, I0),
-                   make_pass_through_transform(K)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
-
-    const auto out_n_ydot_htilde_xdot_wtilde_k_grid_desc = transform_tensor_descriptor(
-        out_n_hop_wop_k_grid_desc,
-        make_tuple(make_pass_through_transform(N),
-                   make_embed_transform(make_tuple(YDot, HTilde),
-                                        make_tuple(-ConvDilationH / GcdStrideDilationH, I1)),
-                   make_embed_transform(make_tuple(XDot, WTilde),
-                                        make_tuple(-ConvDilationW / GcdStrideDilationW, I1)),
-                   make_pass_through_transform(K)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-        make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
-
-    const auto out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k0_k1_grid_desc =
-        transform_tensor_descriptor(
-            out_n_ydot_htilde_xdot_wtilde_k_grid_desc,
-            make_tuple(make_pass_through_transform(N),
-                       make_slice_transform(YDot, I0, YDotSlice),
-                       make_slice_transform(HTilde, IHTildeSliceBegin, HTildeSlice),
-                       make_slice_transform(XDot, I0, XDotSlice),
-                       make_slice_transform(WTilde, IWTildeSliceBegin, WTildeSlice),
-                       make_unmerge_transform(make_tuple(K0, K1))),
-            make_tuple(Sequence<0>{},
-                       Sequence<1>{},
-                       Sequence<2>{},
-                       Sequence<3>{},
-                       Sequence<4>{},
-                       Sequence<5>{}),
-            make_tuple(Sequence<0>{},
-                       Sequence<1>{},
-                       Sequence<2>{},
-                       Sequence<3>{},
-                       Sequence<4>{},
-                       Sequence<5, 6>{}));
-
-#if 1
-    const auto out_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
-        out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k0_k1_grid_desc,
-        make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K0)),
-                   make_merge_transform(make_tuple(N, HTildeSlice, WTildeSlice)),
-                   make_pass_through_transform(K1)),
-        make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}, Sequence<6>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
-#else
-    const auto out_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
-        out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k0_k1_grid_desc,
-        make_tuple(make_merge_transform(make_tuple(K0, YDotSlice, XDotSlice)),
-                   make_merge_transform(make_tuple(N, HTildeSlice, WTildeSlice)),
-                   make_pass_through_transform(K1)),
-        make_tuple(Sequence<5, 1, 3>{}, Sequence<0, 2, 4>{}, Sequence<6>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
-#endif
-
-    // input tensor
-    const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor(
-        in_n_hi_wi_c_grid_desc,
-        make_tuple(make_pass_through_transform(N),
-                   make_pad_transform(Hi, InLeftPadH, InRightPadH),
-                   make_pad_transform(Wi, InLeftPadW, InRightPadW),
-                   make_pass_through_transform(C)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
-
-    const auto in_n_ytilde_htilde_xtilde_wtilde_c_grid_desc = transform_tensor_descriptor(
-        in_n_hip_wip_c_grid_desc,
-        make_tuple(make_pass_through_transform(N),
-                   make_embed_transform(make_tuple(YTilde, HTilde),
-                                        make_tuple(ConvDilationH, ConvStrideH)),
-                   make_embed_transform(make_tuple(XTilde, WTilde),
-                                        make_tuple(ConvDilationW, ConvStrideW)),
-                   make_pass_through_transform(C)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-        make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
-
-    const auto in_n_htildeslice_wtildeslice_c_grid_desc = transform_tensor_descriptor(
-        in_n_ytilde_htilde_xtilde_wtilde_c_grid_desc,
-        make_tuple(make_pass_through_transform(N),
-                   make_freeze_transform(IYTilde),
-                   make_slice_transform(HTilde, IHTildeSliceBegin, HTildeSlice),
-                   make_freeze_transform(IXTilde),
-                   make_slice_transform(WTilde, IWTildeSliceBegin, WTildeSlice),
-                   make_pass_through_transform(C)),
-        make_tuple(Sequence<0>{},
-                   Sequence<1>{},
-                   Sequence<2>{},
-                   Sequence<3>{},
-                   Sequence<4>{},
-                   Sequence<5>{}),
-        make_tuple(Sequence<0>{},
-                   Sequence<>{},
-                   Sequence<1>{},
-                   Sequence<>{},
-                   Sequence<2>{},
-                   Sequence<3>{}));
-
-    const auto in_gemmm_gemmn_grid_desc = transform_tensor_descriptor(
-        in_n_htildeslice_wtildeslice_c_grid_desc,
-        make_tuple(make_pass_through_transform(C),
-                   make_merge_transform(make_tuple(N, HTildeSlice, WTildeSlice))),
-        make_tuple(Sequence<3>{}, Sequence<0, 1, 2>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-    return make_tuple(wei_gemmk0_gemmm_gemmk1_grid_desc,
-                      out_gemmk0_gemmn_gemmk1_grid_desc,
-                      in_gemmm_gemmn_grid_desc);
-}
-
-} // namespace ck
-#endif
diff --git a/include/ck/problem_transform/transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk.hpp b/include/ck/problem_transform/transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk.hpp
deleted file mode 100644
index 5391b595b..000000000
--- a/include/ck/problem_transform/transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk.hpp
+++ /dev/null
@@ -1,355 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#ifndef CK_TRANSFORM_BACKWARD_DATA_CONVOLUTION_INTO_GEMM_V4R1R2_NHWC_KYXC_NHWK_HPP
-#define CK_TRANSFORM_BACKWARD_DATA_CONVOLUTION_INTO_GEMM_V4R1R2_NHWC_KYXC_NHWK_HPP
-
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-
-namespace ck {
-
-// A: out
-// B: wei
-// C: in
-// Number of GEMMs = YTilde * XTilde
-// GemmM = N * HTildeSlice * WTildeSlice
-// GemmN = C
-// GemmK = K * YDotSlice * XDotSlice
-template <typename... Wei,
-          typename... In,
-          typename... Out,
-          typename ConvStrides,
-          typename ConvDilations,
-          typename InLeftPads,
-          typename InRightPads,
-          typename IYTilde,
-          typename IXTilde,
-          index_t GemmK1Value>
-__host__ __device__ constexpr auto
-transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk(
-    const TensorDescriptor<Out...>& out_n_ho_wo_k_grid_desc,
-    const TensorDescriptor<Wei...>& wei_k_y_x_c_grid_desc,
-    const TensorDescriptor<In...>& in_n_hi_wi_c_grid_desc,
-    const ConvStrides& conv_strides,
-    const ConvDilations& conv_dilations,
-    const InLeftPads& in_left_pads,
-    const InRightPads& in_right_pads,
-    IYTilde i_ytilde,
-    IXTilde i_xtilde,
-    Number<GemmK1Value>)
-{
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-
-    constexpr auto GemmK1 = Number<GemmK1Value>{};
-
-    const auto N = in_n_hi_wi_c_grid_desc.GetLength(I0);
-    const auto C = in_n_hi_wi_c_grid_desc.GetLength(I3);
-    const auto K = out_n_ho_wo_k_grid_desc.GetLength(I3);
-
-    const auto Hi = in_n_hi_wi_c_grid_desc.GetLength(I1);
-    const auto Wi = in_n_hi_wi_c_grid_desc.GetLength(I2);
-
-    const auto Ho = out_n_ho_wo_k_grid_desc.GetLength(I1);
-    const auto Wo = out_n_ho_wo_k_grid_desc.GetLength(I2);
-
-    const auto Y = wei_k_y_x_c_grid_desc.GetLength(I1);
-    const auto X = wei_k_y_x_c_grid_desc.GetLength(I2);
-
-    const auto ConvStrideH = conv_strides[I0];
-    const auto ConvStrideW = conv_strides[I1];
-
-    const auto ConvDilationH = conv_dilations[I0];
-    const auto ConvDilationW = conv_dilations[I1];
-
-    const auto InLeftPadH = in_left_pads[I0];
-    const auto InLeftPadW = in_left_pads[I1];
-
-    const auto InRightPadH = in_right_pads[I0];
-    const auto InRightPadW = in_right_pads[I1];
-
-    const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH);
-    const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW);
-
-    const auto YTilde = ConvStrideH / GcdStrideDilationH;
-    const auto XTilde = ConvStrideW / GcdStrideDilationW;
-
-    const auto YDot = math::integer_divide_ceil(Y, YTilde);
-    const auto XDot = math::integer_divide_ceil(X, XTilde);
-
-    const auto HTilde = Ho + math::integer_divide_ceil(ConvDilationH * (Y - I1), ConvStrideH);
-    const auto WTilde = Wo + math::integer_divide_ceil(ConvDilationW * (X - I1), ConvStrideW);
-
-    // only work on HTilde and WTilde that contribute to non-padding area of input tensor
-    const auto IHTildeSliceBegin = math::integer_divide_floor(
-        math::max(I0, InLeftPadH - ConvDilationH * (YTilde - I1)), ConvStrideH);
-    const auto IWTildeSliceBegin = math::integer_divide_floor(
-        math::max(I0, InLeftPadW - ConvDilationW * (XTilde - I1)), ConvStrideW);
-
-    const auto IHTildeSliceEnd =
-        math::min(HTilde, math::integer_divide_ceil(InLeftPadH + Hi - I1, ConvStrideH) + I1);
-    const auto IWTildeSliceEnd =
-        math::min(WTilde, math::integer_divide_ceil(InLeftPadW + Wi - I1, ConvStrideW) + I1);
-
-    const auto HTildeSlice = IHTildeSliceEnd - IHTildeSliceBegin;
-    const auto WTildeSlice = IWTildeSliceEnd - IWTildeSliceBegin;
-
-    // GemmK is different for each GEMM
-    const auto YDotSlice = math::integer_divide_ceil(Y - i_ytilde, YTilde);
-    const auto XDotSlice = math::integer_divide_ceil(X - i_xtilde, XTilde);
-
-    const auto K1 = GemmK1;
-    const auto K0 = K / K1;
-
-    // A: output tensor
-    // this add padding check
-    const auto out_n_hop_wop_k_grid_desc = transform_tensor_descriptor(
-        out_n_ho_wo_k_grid_desc,
-        make_tuple(make_pass_through_transform(N),
-                   make_pad_transform(Ho, I0, I0),
-                   make_pad_transform(Wo, I0, I0),
-                   make_pass_through_transform(K)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
-
-    const auto out_n_ydot_htilde_xdot_wtilde_k_grid_desc = transform_tensor_descriptor(
-        out_n_hop_wop_k_grid_desc,
-        make_tuple(make_pass_through_transform(N),
-                   make_embed_transform(make_tuple(YDot, HTilde),
-                                        make_tuple(-ConvDilationH / GcdStrideDilationH, I1)),
-                   make_embed_transform(make_tuple(XDot, WTilde),
-                                        make_tuple(-ConvDilationW / GcdStrideDilationW, I1)),
-                   make_pass_through_transform(K)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-        make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
-
-    const auto out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k0_k1_grid_desc =
-        transform_tensor_descriptor(
-            out_n_ydot_htilde_xdot_wtilde_k_grid_desc,
-            make_tuple(make_pass_through_transform(N),
-                       make_slice_transform(YDot, I0, YDotSlice),
-                       make_slice_transform(HTilde, IHTildeSliceBegin, HTildeSlice),
-                       make_slice_transform(XDot, I0, XDotSlice),
-                       make_slice_transform(WTilde, IWTildeSliceBegin, WTildeSlice),
-                       make_unmerge_transform(make_tuple(K0, K1))),
-            make_tuple(Sequence<0>{},
-                       Sequence<1>{},
-                       Sequence<2>{},
-                       Sequence<3>{},
-                       Sequence<4>{},
-                       Sequence<5>{}),
-            make_tuple(Sequence<0>{},
-                       Sequence<1>{},
-                       Sequence<2>{},
-                       Sequence<3>{},
-                       Sequence<4>{},
-                       Sequence<5, 6>{}));
-
-#if 1
-    const auto out_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor(
-        out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k0_k1_grid_desc,
-        make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K0)),
-                   make_merge_transform(make_tuple(N, HTildeSlice, WTildeSlice)),
-                   make_pass_through_transform(K1)),
-        make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}, Sequence<6>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
-#else
-    const auto out_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor(
-        out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k0_k1_grid_desc,
-        make_tuple(make_merge_transform(make_tuple(K0, YDotSlice, XDotSlice)),
-                   make_merge_transform(make_tuple(N, HTildeSlice, WTildeSlice)),
-                   make_pass_through_transform(K1)),
-        make_tuple(Sequence<5, 1, 3>{}, Sequence<0, 2, 4>{}, Sequence<6>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
-#endif
-
-    // B: weight tensor
-    const auto wei_k_ydot_ytilde_xdot_xtilde_c_grid_desc = transform_tensor_descriptor(
-        wei_k_y_x_c_grid_desc,
-        make_tuple(make_pass_through_transform(K),
-                   make_embed_transform(make_tuple(YDot, YTilde),
-                                        make_tuple(ConvStrideH / GcdStrideDilationH, I1)),
-                   make_embed_transform(make_tuple(XDot, XTilde),
-                                        make_tuple(ConvStrideW / GcdStrideDilationW, I1)),
-                   make_pass_through_transform(C)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-        make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
-
-    const auto wei_k0_k1_ydotslice_xdotslice_c_grid_desc =
-        transform_tensor_descriptor(wei_k_ydot_ytilde_xdot_xtilde_c_grid_desc,
-                                    make_tuple(make_unmerge_transform(make_tuple(K0, K1)),
-                                               make_slice_transform(YDot, I0, YDotSlice),
-                                               make_slice_transform(XDot, I0, XDotSlice),
-                                               make_freeze_transform(i_ytilde),
-                                               make_freeze_transform(i_xtilde),
-                                               make_pass_through_transform(C)),
-                                    make_tuple(Sequence<0>{},
-                                               Sequence<1>{},
-                                               Sequence<3>{},
-                                               Sequence<2>{},
-                                               Sequence<4>{},
-                                               Sequence<5>{}),
-                                    make_tuple(Sequence<0, 1>{},
-                                               Sequence<2>{},
-                                               Sequence<3>{},
-                                               Sequence<>{},
-                                               Sequence<>{},
-                                               Sequence<4>{}));
-
-#if 1
-    const auto wei_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
-        wei_k0_k1_ydotslice_xdotslice_c_grid_desc,
-        make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K0)),
-                   make_pass_through_transform(C),
-                   make_pass_through_transform(K1)),
-        make_tuple(Sequence<2, 3, 0>{}, Sequence<4>{}, Sequence<1>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
-#else
-    const auto wei_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
-        wei_k0_k1_ydotslice_xdotslice_c_grid_desc,
-        make_tuple(make_merge_transform(make_tuple(K0, YDotSlice, XDotSlice)),
-                   make_pass_through_transform(C),
-                   make_pass_through_transform(K1)),
-        make_tuple(Sequence<0, 2, 3>{}, Sequence<4>{}, Sequence<1>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
-#endif
-
-    // C: input tensor
-    const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor(
-        in_n_hi_wi_c_grid_desc,
-        make_tuple(make_pass_through_transform(N),
-                   make_pad_transform(Hi, InLeftPadH, InRightPadH),
-                   make_pad_transform(Wi, InLeftPadW, InRightPadW),
-                   make_pass_through_transform(C)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
-
-    const auto in_n_ytilde_htilde_xtilde_wtilde_c_grid_desc = transform_tensor_descriptor(
-        in_n_hip_wip_c_grid_desc,
-        make_tuple(make_pass_through_transform(N),
-                   make_embed_transform(make_tuple(YTilde, HTilde),
-                                        make_tuple(ConvDilationH, ConvStrideH)),
-                   make_embed_transform(make_tuple(XTilde, WTilde),
-                                        make_tuple(ConvDilationW, ConvStrideW)),
-                   make_pass_through_transform(C)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-        make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
-
-    const auto in_n_htildeslice_wtildeslice_c_grid_desc = transform_tensor_descriptor(
-        in_n_ytilde_htilde_xtilde_wtilde_c_grid_desc,
-        make_tuple(make_pass_through_transform(N),
-                   make_freeze_transform(i_ytilde),
-                   make_slice_transform(HTilde, IHTildeSliceBegin, HTildeSlice),
-                   make_freeze_transform(i_xtilde),
-                   make_slice_transform(WTilde, IWTildeSliceBegin, WTildeSlice),
-                   make_pass_through_transform(C)),
-        make_tuple(Sequence<0>{},
-                   Sequence<1>{},
-                   Sequence<2>{},
-                   Sequence<3>{},
-                   Sequence<4>{},
-                   Sequence<5>{}),
-        make_tuple(Sequence<0>{},
-                   Sequence<>{},
-                   Sequence<1>{},
-                   Sequence<>{},
-                   Sequence<2>{},
-                   Sequence<3>{}));
-
-    const auto in_gemmm_gemmn_grid_desc = transform_tensor_descriptor(
-        in_n_htildeslice_wtildeslice_c_grid_desc,
-        make_tuple(make_merge_transform(make_tuple(N, HTildeSlice, WTildeSlice)),
-                   make_pass_through_transform(C)),
-        make_tuple(Sequence<0, 1, 2>{}, Sequence<3>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-    return make_tuple(out_gemmk0_gemmm_gemmk1_grid_desc,
-                      wei_gemmk0_gemmn_gemmk1_grid_desc,
-                      in_gemmm_gemmn_grid_desc);
-}
-
-// A: out
-// B: wei
-// C: in
-// Number of GEMMs = 1
-// GemmM = N * Ho * Wo
-// GemmN = C
-// GemmK = K
-template <typename... Wei,
-          typename... In,
-          typename... Out,
-          typename ConvStrides,
-          index_t GemmK1Value>
-__host__ __device__ constexpr auto
-transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk_1x1(
-    const TensorDescriptor<Out...>& out_n_ho_wo_k_grid_desc,
-    const TensorDescriptor<Wei...>& /* wei_k_y_x_c_grid_desc */,
-    const TensorDescriptor<In...>& in_n_hi_wi_c_grid_desc,
-    const ConvStrides& conv_strides,
-    Number<GemmK1Value>)
-{
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-
-    constexpr auto GemmK1 = Number<GemmK1Value>{};
-
-    const auto N = in_n_hi_wi_c_grid_desc.GetLength(I0);
-    const auto C = in_n_hi_wi_c_grid_desc.GetLength(I3);
-    const auto K = out_n_ho_wo_k_grid_desc.GetLength(I3);
-
-    const auto Ho = out_n_ho_wo_k_grid_desc.GetLength(I1);
-    const auto Wo = out_n_ho_wo_k_grid_desc.GetLength(I2);
-
-    const auto ConvStrideH = conv_strides[I0];
-    const auto ConvStrideW = conv_strides[I1];
-
-    const auto K1 = GemmK1;
-    const auto K0 = K / K1;
-
-    // A: output tensor
-    const auto out_gemmk0_gemmm_gemmk1_grid_desc =
-        transform_tensor_descriptor(make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)),
-                                    make_tuple(make_pass_through_transform(N * Ho * Wo),
-                                               make_unmerge_transform(make_tuple(K0, K1))),
-                                    make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                    make_tuple(Sequence<1>{}, Sequence<0, 2>{}));
-
-    // B: weight tensor
-    const auto wei_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
-        make_naive_tensor_descriptor_packed(make_tuple(K, C)),
-        make_tuple(make_unmerge_transform(make_tuple(K0, K1)), make_pass_through_transform(C)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}),
-        make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-    // C: input tensor
-    const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor(
-        in_n_hi_wi_c_grid_desc,
-        make_tuple(make_pass_through_transform(N),
-                   make_embed_transform(make_tuple(I1, Ho), make_tuple(I1, ConvStrideH)),
-                   make_embed_transform(make_tuple(I1, Wo), make_tuple(I1, ConvStrideW)),
-                   make_pass_through_transform(C)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-        make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
-
-    const auto in_gemmm_gemmn_grid_desc = transform_tensor_descriptor(
-        in_n_y_ho_x_wo_c_grid_desc,
-        make_tuple(make_freeze_transform(I0),
-                   make_freeze_transform(I0),
-                   make_merge_transform(make_tuple(N, Ho, Wo)),
-                   make_pass_through_transform(C)),
-        make_tuple(Sequence<1>{}, Sequence<3>{}, Sequence<0, 2, 4>{}, Sequence<5>{}),
-        make_tuple(Sequence<>{}, Sequence<>{}, Sequence<0>{}, Sequence<1>{}));
-
-    return make_tuple(out_gemmk0_gemmm_gemmk1_grid_desc,
-                      wei_gemmk0_gemmn_gemmk1_grid_desc,
-                      in_gemmm_gemmn_grid_desc);
-}
-
-} // namespace ck
-#endif
diff --git a/include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r2_atomic_nchw_kcyx_nkhw.hpp b/include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r2_atomic_nchw_kcyx_nkhw.hpp
deleted file mode 100644
index bb1dc239f..000000000
--- a/include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r2_atomic_nchw_kcyx_nkhw.hpp
+++ /dev/null
@@ -1,150 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#ifndef CK_TRANSFORM_BACKWARD_WEIGHT_CONVOLUTION_INTO_GEMM_V4R4R2_ATOMIC_NCHW_KCYX_NKHW_HPP
-#define CK_TRANSFORM_BACKWARD_WEIGHT_CONVOLUTION_INTO_GEMM_V4R4R2_ATOMIC_NCHW_KCYX_NKHW_HPP
-
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-
-namespace ck {
-
-// GemmM = K
-// GemmK = N * Ho * Wo
-// GemmN = C * Y * X
-template <typename... Wei,
-          typename... In,
-          typename... Out,
-          typename ConvStrides,
-          typename ConvDilations,
-          typename InLeftPads,
-          typename InRightPads,
-          index_t GemmK1Value,
-          typename GemmKBatchType,
-          typename GemmKPadType>
-__host__ __device__ constexpr auto
-transform_backward_weight_convolution_into_gemm_v4r4r2_atomic_nchw_kcyx_nkhw_pad(
-    const TensorDescriptor<Wei...>& wei_k_c_y_x_grid_desc,
-    const TensorDescriptor<In...>& in_n_c_hi_wi_grid_desc,
-    const TensorDescriptor<Out...>& out_n_k_ho_wo_grid_desc,
-    const ConvStrides& conv_strides,
-    const ConvDilations& conv_dilations,
-    const InLeftPads& in_left_pads,
-    const InRightPads& in_right_pads,
-    Number<GemmK1Value>,
-    GemmKBatchType GemmKBatch,
-    GemmKPadType GemmKPad)
-{
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-
-    constexpr auto GemmK1 = Number<GemmK1Value>{};
-
-    const auto N = in_n_c_hi_wi_grid_desc.GetLength(I0);
-    const auto C = in_n_c_hi_wi_grid_desc.GetLength(I1);
-    const auto K = out_n_k_ho_wo_grid_desc.GetLength(I1);
-
-    const auto Hi = in_n_c_hi_wi_grid_desc.GetLength(I2);
-    const auto Wi = in_n_c_hi_wi_grid_desc.GetLength(I3);
-
-    const auto Ho = out_n_k_ho_wo_grid_desc.GetLength(I2);
-    const auto Wo = out_n_k_ho_wo_grid_desc.GetLength(I3);
-
-    const auto Y = wei_k_c_y_x_grid_desc.GetLength(I2);
-    const auto X = wei_k_c_y_x_grid_desc.GetLength(I3);
-
-    const auto ConvStrideH = conv_strides[I0];
-    const auto ConvStrideW = conv_strides[I1];
-
-    const auto ConvDilationH = conv_dilations[I0];
-    const auto ConvDilationW = conv_dilations[I1];
-
-    const auto InLeftPadH = in_left_pads[I0];
-    const auto InLeftPadW = in_left_pads[I1];
-
-    const auto InRightPadH = in_right_pads[I0];
-    const auto InRightPadW = in_right_pads[I1];
-
-    const auto GemmM      = K;
-    const auto GemmN      = C * Y * X;
-    const auto GemmKTotal = N * Ho * Wo;
-    const index_t GemmK0  = GemmKPad / (GemmKBatch * GemmK1);
-
-    // A: output tensor
-    const auto out_gemmktotal_gemmm_grid_desc = transform_tensor_descriptor(
-        make_naive_tensor_descriptor_packed(make_tuple(N, K, Ho * Wo)),
-        make_tuple(make_pass_through_transform(K), make_merge_transform(make_tuple(N, Ho * Wo))),
-        make_tuple(Sequence<1>{}, Sequence<0, 2>{}),
-        make_tuple(Sequence<1>{}, Sequence<0>{}));
-
-    const auto out_gemmkpad_gemmm_grid_desc = transform_tensor_descriptor(
-        out_gemmktotal_gemmm_grid_desc,
-        make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal),
-                   make_pass_through_transform(GemmM)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-    const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor(
-        out_gemmkpad_gemmm_grid_desc,
-        make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1)),
-                   make_pass_through_transform(GemmM)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}),
-        make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
-
-    // B: input tensor
-    const auto in_n_c_hip_wip_grid_desc = transform_tensor_descriptor(
-        in_n_c_hi_wi_grid_desc,
-        make_tuple(make_pass_through_transform(N),
-                   make_pass_through_transform(C),
-                   make_pad_transform(Hi, InLeftPadH, InRightPadH),
-                   make_pad_transform(Wi, InLeftPadW, InRightPadW)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
-
-    const auto in_n_c_y_ho_x_wo_grid_desc = transform_tensor_descriptor(
-        in_n_c_hip_wip_grid_desc,
-        make_tuple(make_pass_through_transform(N),
-                   make_pass_through_transform(C),
-                   make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
-                   make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW))),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
-
-    const auto in_gemmktotal_gemmn_grid_desc =
-        transform_tensor_descriptor(in_n_c_y_ho_x_wo_grid_desc,
-                                    make_tuple(make_merge_transform(make_tuple(C, Y, X)),
-                                               make_merge_transform(make_tuple(N, Ho, Wo))),
-                                    make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
-                                    make_tuple(Sequence<1>{}, Sequence<0>{}));
-
-    const auto in_gemmkpad_gemmn_grid_desc = transform_tensor_descriptor(
-        in_gemmktotal_gemmn_grid_desc,
-        make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal),
-                   make_pass_through_transform(GemmN)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-    const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
-        in_gemmkpad_gemmn_grid_desc,
-        make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1)),
-                   make_pass_through_transform(GemmN)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}),
-        make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
-
-    // C: weight tensor
-    const auto wei_gemmm_gemmn_grid_desc = transform_tensor_descriptor(
-        make_naive_tensor_descriptor_packed(make_tuple(K, C * Y * X)),
-        make_tuple(make_pass_through_transform(K), make_pass_through_transform(C * Y * X)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-    return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc,
-                      in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc,
-                      wei_gemmm_gemmn_grid_desc);
-}
-
-} // namespace ck
-#endif
diff --git a/include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp b/include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp
deleted file mode 100644
index ca530934e..000000000
--- a/include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp
+++ /dev/null
@@ -1,132 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#ifndef CK_TRANSFORM_BACKWARD_WEIGHT_CONVOLUTION_INTO_GEMM_V4R4R2_NCHW_KCYX_NKHW_HPP
-#define CK_TRANSFORM_BACKWARD_WEIGHT_CONVOLUTION_INTO_GEMM_V4R4R2_NCHW_KCYX_NKHW_HPP
-
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-
-namespace ck {
-
-// GemmM = K
-// GemmK = N * Ho * Wo
-// GemmN = C * Y * X
-template <typename... Wei,
-          typename... In,
-          typename... Out,
-          typename ConvStrides,
-          typename ConvDilations,
-          typename InLeftPads,
-          typename InRightPads,
-          index_t GemmK1Value>
-__host__ __device__ constexpr auto
-transform_backward_weight_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw_pad(
-    const TensorDescriptor<Wei...>& wei_k_c_y_x_grid_desc,
-    const TensorDescriptor<In...>& in_n_c_hi_wi_grid_desc,
-    const TensorDescriptor<Out...>& out_n_k_ho_wo_grid_desc,
-    const ConvStrides& conv_strides,
-    const ConvDilations& conv_dilations,
-    const InLeftPads& in_left_pads,
-    const InRightPads& in_right_pads,
-    Number<GemmK1Value>)
-{
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-
-    constexpr auto GemmK1 = Number<GemmK1Value>{};
-
-    const auto N = in_n_c_hi_wi_grid_desc.GetLength(I0);
-    const auto C = in_n_c_hi_wi_grid_desc.GetLength(I1);
-    const auto K = out_n_k_ho_wo_grid_desc.GetLength(I1);
-
-    const auto Hi = in_n_c_hi_wi_grid_desc.GetLength(I2);
-    const auto Wi = in_n_c_hi_wi_grid_desc.GetLength(I3);
-
-    const auto Ho = out_n_k_ho_wo_grid_desc.GetLength(I2);
-    const auto Wo = out_n_k_ho_wo_grid_desc.GetLength(I3);
-
-    const auto Y = wei_k_c_y_x_grid_desc.GetLength(I2);
-    const auto X = wei_k_c_y_x_grid_desc.GetLength(I3);
-
-    const auto ConvStrideH = conv_strides[I0];
-    const auto ConvStrideW = conv_strides[I1];
-
-    const auto ConvDilationH = conv_dilations[I0];
-    const auto ConvDilationW = conv_dilations[I1];
-
-    const auto InLeftPadH = in_left_pads[I0];
-    const auto InLeftPadW = in_left_pads[I1];
-
-    const auto InRightPadH = in_right_pads[I0];
-    const auto InRightPadW = in_right_pads[I1];
-
-    const auto GemmM  = K;
-    const auto GemmN  = C * Y * X;
-    const auto GemmK  = N * Ho * Wo;
-    const auto GemmK0 = GemmK / GemmK1;
-
-    // weight tensor
-    const auto wei_gemmm_gemmn_grid_desc = transform_tensor_descriptor(
-        make_naive_tensor_descriptor_packed(make_tuple(K, C * Y * X)),
-        make_tuple(make_pass_through_transform(K), make_pass_through_transform(C * Y * X)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-    // input tensor
-    const auto in_n_c_hip_wip_grid_desc = transform_tensor_descriptor(
-        in_n_c_hi_wi_grid_desc,
-        make_tuple(make_pass_through_transform(N),
-                   make_pass_through_transform(C),
-                   make_pad_transform(Hi, InLeftPadH, InRightPadH),
-                   make_pad_transform(Wi, InLeftPadW, InRightPadW)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
-
-    const auto in_n_c_y_ho_x_wo_grid_desc = transform_tensor_descriptor(
-        in_n_c_hip_wip_grid_desc,
-        make_tuple(make_pass_through_transform(N),
-                   make_pass_through_transform(C),
-                   make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
-                   make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW))),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
-
-    const auto in_gemmk_gemmn_grid_desc =
-        transform_tensor_descriptor(in_n_c_y_ho_x_wo_grid_desc,
-                                    make_tuple(make_merge_transform(make_tuple(C, Y, X)),
-                                               make_merge_transform(make_tuple(N, Ho, Wo))),
-                                    make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
-                                    make_tuple(Sequence<1>{}, Sequence<0>{}));
-
-    const auto in_gemmk0_gemmn_gemmk1_grid_desc =
-        transform_tensor_descriptor(in_gemmk_gemmn_grid_desc,
-                                    make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)),
-                                               make_pass_through_transform(GemmN)),
-                                    make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                    make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-    // output tensor
-    const auto out_gemmk_gemmm_grid_desc = transform_tensor_descriptor(
-        make_naive_tensor_descriptor_packed(make_tuple(N, K, Ho * Wo)),
-        make_tuple(make_pass_through_transform(K), make_merge_transform(make_tuple(N, Ho * Wo))),
-        make_tuple(Sequence<1>{}, Sequence<0, 2>{}),
-        make_tuple(Sequence<1>{}, Sequence<0>{}));
-
-    const auto out_gemmk0_gemmm_gemmk1_grid_desc =
-        transform_tensor_descriptor(out_gemmk_gemmm_grid_desc,
-                                    make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)),
-                                               make_pass_through_transform(GemmM)),
-                                    make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                    make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-    return make_tuple(out_gemmk0_gemmm_gemmk1_grid_desc,
-                      in_gemmk0_gemmn_gemmk1_grid_desc,
-                      wei_gemmm_gemmn_grid_desc);
-}
-
-} // namespace ck
-#endif
diff --git a/include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r4_atomic_nhwc_kyxc_nhwk.hpp b/include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r4_atomic_nhwc_kyxc_nhwk.hpp
deleted file mode 100644
index e960f90c4..000000000
--- a/include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r4_atomic_nhwc_kyxc_nhwk.hpp
+++ /dev/null
@@ -1,150 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#ifndef CK_TRANSFORM_BACKWARD_WEIGHT_CONVOLUTION_INTO_GEMM_V4R4R4_ATOMIC_NHWC_KYXC_NHWK_HPP
-#define CK_TRANSFORM_BACKWARD_WEIGHT_CONVOLUTION_INTO_GEMM_V4R4R4_ATOMIC_NHWC_KYXC_NHWK_HPP
-
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-
-namespace ck {
-
-// A: in
-// B: wei
-// C: out
-// GemmM = N * Ho * Wo
-// GemmN = K
-// GemmK = Y * X * C
-template <typename... In,
-          typename... Wei,
-          typename... Out,
-          typename ConvStrides,
-          typename ConvDilations,
-          typename InLeftPads,
-          typename InRightPads,
-          index_t GemmK1Value,
-          typename GemmKBatchType,
-          typename GemmKPadType>
-__host__ __device__ constexpr auto
-transform_backward_weight_convolution_into_gemm_v4r4r4_atomic_nhwc_kyxc_nhwk_pad(
-    const TensorDescriptor<In...>& in_n_hi_wi_c_grid_desc,
-    const TensorDescriptor<Wei...>& wei_k_y_x_c_grid_desc,
-    const TensorDescriptor<Out...>& out_n_ho_wo_k_grid_desc,
-    const ConvStrides& conv_strides,
-    const ConvDilations& conv_dilations,
-    const InLeftPads& in_left_pads,
-    const InRightPads& in_right_pads,
-    Number<GemmK1Value>,
-    GemmKBatchType GemmKBatch,
-    GemmKPadType GemmKPad)
-{
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-
-    constexpr auto GemmK1 = Number<GemmK1Value>{};
-
-    const auto N = in_n_hi_wi_c_grid_desc.GetLength(I0);
-    const auto C = in_n_hi_wi_c_grid_desc.GetLength(I3);
-    const auto K = out_n_ho_wo_k_grid_desc.GetLength(I3);
-
-    const auto Hi = in_n_hi_wi_c_grid_desc.GetLength(I1);
-    const auto Wi = in_n_hi_wi_c_grid_desc.GetLength(I2);
-
-    const auto Ho = out_n_ho_wo_k_grid_desc.GetLength(I1);
-    const auto Wo = out_n_ho_wo_k_grid_desc.GetLength(I2);
-
-    const auto Y = wei_k_y_x_c_grid_desc.GetLength(I1);
-    const auto X = wei_k_y_x_c_grid_desc.GetLength(I2);
-
-    const auto ConvStrideH = conv_strides[I0];
-    const auto ConvStrideW = conv_strides[I1];
-
-    const auto ConvDilationH = conv_dilations[I0];
-    const auto ConvDilationW = conv_dilations[I1];
-
-    const auto InLeftPadH = in_left_pads[I0];
-    const auto InLeftPadW = in_left_pads[I1];
-
-    const auto InRightPadH = in_right_pads[I0];
-    const auto InRightPadW = in_right_pads[I1];
-
-    const auto GemmM      = Y * X * C;
-    const auto GemmN      = K;
-    const auto GemmKTotal = N * Ho * Wo;
-    const index_t GemmK0  = GemmKPad / (GemmKBatch * GemmK1);
-
-    // A: input tensor
-    const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor(
-        in_n_hi_wi_c_grid_desc,
-        make_tuple(make_pass_through_transform(N),
-                   make_pad_transform(Hi, InLeftPadH, InRightPadH),
-                   make_pad_transform(Wi, InLeftPadW, InRightPadW),
-                   make_pass_through_transform(C)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
-
-    const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor(
-        in_n_hip_wip_c_grid_desc,
-        make_tuple(make_pass_through_transform(N),
-                   make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
-                   make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)),
-                   make_pass_through_transform(C)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-        make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
-
-    const auto in_gemmktotal_gemmm_grid_desc =
-        transform_tensor_descriptor(in_n_y_ho_x_wo_c_grid_desc,
-                                    make_tuple(make_merge_transform(make_tuple(Y, X, C)),
-                                               make_merge_transform(make_tuple(N, Ho, Wo))),
-                                    make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}),
-                                    make_tuple(Sequence<1>{}, Sequence<0>{}));
-
-    const auto in_gemmkpad_gemmm_grid_desc = transform_tensor_descriptor(
-        in_gemmktotal_gemmm_grid_desc,
-        make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal),
-                   make_pass_through_transform(GemmM)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-    const auto in_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor(
-        in_gemmkpad_gemmm_grid_desc,
-        make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1)),
-                   make_pass_through_transform(GemmM)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}),
-        make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
-
-    // B: output tensor
-    const auto out_gemmktotal_gemmn_grid_desc =
-        make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K));
-
-    const auto out_gemmkpad_gemmn_grid_desc = transform_tensor_descriptor(
-        out_gemmktotal_gemmn_grid_desc,
-        make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal),
-                   make_pass_through_transform(GemmN)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-    const auto out_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
-        out_gemmkpad_gemmn_grid_desc,
-        make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1)),
-                   make_pass_through_transform(GemmN)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}),
-        make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
-
-    // C: weight tensor
-    const auto wei_gemmm_gemmn_grid_desc = transform_tensor_descriptor(
-        make_naive_tensor_descriptor_packed(make_tuple(K, Y * X * C)),
-        make_tuple(make_pass_through_transform(K), make_pass_through_transform(Y * X * C)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}),
-        make_tuple(Sequence<1>{}, Sequence<0>{}));
-
-    return make_tuple(in_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc,
-                      out_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc,
-                      wei_gemmm_gemmn_grid_desc);
-}
-
-} // namespace ck
-#endif
diff --git a/include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp b/include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp
deleted file mode 100644
index 052bab423..000000000
--- a/include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp
+++ /dev/null
@@ -1,135 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#ifndef CK_TRANSFORM_BACKWARD_WEIGHT_CONVOLUTION_INTO_GEMM_V4R4R4_NHWC_KYXC_NHWK_HPP
-#define CK_TRANSFORM_BACKWARD_WEIGHT_CONVOLUTION_INTO_GEMM_V4R4R4_NHWC_KYXC_NHWK_HPP
-
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-
-namespace ck {
-
-// A: in
-// B: wei
-// C: out
-// GemmM = N * Ho * Wo
-// GemmN = K
-// GemmK = Y * X * C
-template <typename... In,
-          typename... Wei,
-          typename... Out,
-          typename ConvStrides,
-          typename ConvDilations,
-          typename InLeftPads,
-          typename InRightPads,
-          index_t GemmK1Value>
-__host__ __device__ constexpr auto
-transform_backward_weight_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk_pad(
-    const TensorDescriptor<In...>& in_n_hi_wi_c_grid_desc,
-    const TensorDescriptor<Wei...>& wei_k_y_x_c_grid_desc,
-    const TensorDescriptor<Out...>& out_n_ho_wo_k_grid_desc,
-    const ConvStrides& conv_strides,
-    const ConvDilations& conv_dilations,
-    const InLeftPads& in_left_pads,
-    const InRightPads& in_right_pads,
-    Number<GemmK1Value>)
-{
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-
-    constexpr auto GemmK1 = Number<GemmK1Value>{};
-
-    const auto N = in_n_hi_wi_c_grid_desc.GetLength(I0);
-    const auto C = in_n_hi_wi_c_grid_desc.GetLength(I3);
-    const auto K = out_n_ho_wo_k_grid_desc.GetLength(I3);
-
-    const auto Hi = in_n_hi_wi_c_grid_desc.GetLength(I1);
-    const auto Wi = in_n_hi_wi_c_grid_desc.GetLength(I2);
-
-    const auto Ho = out_n_ho_wo_k_grid_desc.GetLength(I1);
-    const auto Wo = out_n_ho_wo_k_grid_desc.GetLength(I2);
-
-    const auto Y = wei_k_y_x_c_grid_desc.GetLength(I1);
-    const auto X = wei_k_y_x_c_grid_desc.GetLength(I2);
-
-    const auto ConvStrideH = conv_strides[I0];
-    const auto ConvStrideW = conv_strides[I1];
-
-    const auto ConvDilationH = conv_dilations[I0];
-    const auto ConvDilationW = conv_dilations[I1];
-
-    const auto InLeftPadH = in_left_pads[I0];
-    const auto InLeftPadW = in_left_pads[I1];
-
-    const auto InRightPadH = in_right_pads[I0];
-    const auto InRightPadW = in_right_pads[I1];
-
-    const auto GemmM  = Y * X * C;
-    const auto GemmN  = K;
-    const auto GemmK  = N * Ho * Wo;
-    const auto GemmK0 = GemmK / GemmK1;
-
-    // A: input tensor
-    const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor(
-        in_n_hi_wi_c_grid_desc,
-        make_tuple(make_pass_through_transform(N),
-                   make_pad_transform(Hi, InLeftPadH, InRightPadH),
-                   make_pad_transform(Wi, InLeftPadW, InRightPadW),
-                   make_pass_through_transform(C)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
-
-    const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor(
-        in_n_hip_wip_c_grid_desc,
-        make_tuple(make_pass_through_transform(N),
-                   make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
-                   make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)),
-                   make_pass_through_transform(C)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-        make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
-
-    const auto in_gemmk_gemmm_grid_desc =
-        transform_tensor_descriptor(in_n_y_ho_x_wo_c_grid_desc,
-                                    make_tuple(make_merge_transform(make_tuple(Y, X, C)),
-                                               make_merge_transform(make_tuple(N, Ho, Wo))),
-                                    make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}),
-                                    make_tuple(Sequence<1>{}, Sequence<0>{}));
-
-    const auto in_gemmk0_gemmm_gemmk1_grid_desc =
-        transform_tensor_descriptor(in_gemmk_gemmm_grid_desc,
-                                    make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)),
-                                               make_pass_through_transform(GemmM)),
-                                    make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                    make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-    // B: output tensor
-    const auto out_gemmk_gemmn_grid_desc = transform_tensor_descriptor(
-        make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)),
-        make_tuple(make_pass_through_transform(N * Ho * Wo), make_pass_through_transform(K)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-    const auto out_gemmk0_gemmn_gemmk1_grid_desc =
-        transform_tensor_descriptor(out_gemmk_gemmn_grid_desc,
-                                    make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)),
-                                               make_pass_through_transform(GemmN)),
-                                    make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                    make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-    // C: weight tensor
-    const auto wei_gemmm_gemmn_grid_desc = transform_tensor_descriptor(
-        make_naive_tensor_descriptor_packed(make_tuple(K, Y * X * C)),
-        make_tuple(make_pass_through_transform(K), make_pass_through_transform(Y * X * C)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}),
-        make_tuple(Sequence<1>{}, Sequence<0>{}));
-
-    return make_tuple(in_gemmk0_gemmm_gemmk1_grid_desc,
-                      out_gemmk0_gemmn_gemmk1_grid_desc,
-                      wei_gemmm_gemmn_grid_desc);
-}
-
-} // namespace ck
-#endif
diff --git a/include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r5_nhwc_kyxc_nhwk.hpp b/include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r5_nhwc_kyxc_nhwk.hpp
deleted file mode 100644
index c301a9e0c..000000000
--- a/include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r5_nhwc_kyxc_nhwk.hpp
+++ /dev/null
@@ -1,147 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#ifndef CK_TRANSFORM_BACKWARD_WEIGHT_CONVOLUTION_INTO_GEMM_V4R4R5_NHWC_KYXC_NHWK_HPP
-#define CK_TRANSFORM_BACKWARD_WEIGHT_CONVOLUTION_INTO_GEMM_V4R4R5_NHWC_KYXC_NHWK_HPP
-
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-
-namespace ck {
-
-// A: out
-// B: in
-// C: wei
-// GemmM = K
-// GemmN = Y * X * C
-// GemmKTotal = N * Ho * Wo
-template <typename... In,
-          typename... Wei,
-          typename... Out,
-          typename ConvStrides,
-          typename ConvDilations,
-          typename InLeftPads,
-          typename InRightPads,
-          index_t GemmK1Value,
-          typename GemmKBatchType,
-          typename GemmKPadType>
-__host__ __device__ constexpr auto
-transform_backward_weight_convolution_into_gemm_v4r4r5_nhwc_kyxc_nhwk_pad(
-    const TensorDescriptor<In...>& in_n_hi_wi_c_grid_desc,
-    const TensorDescriptor<Wei...>& wei_k_y_x_c_grid_desc,
-    const TensorDescriptor<Out...>& out_n_ho_wo_k_grid_desc,
-    const ConvStrides& conv_strides,
-    const ConvDilations& conv_dilations,
-    const InLeftPads& in_left_pads,
-    const InRightPads& in_right_pads,
-    Number<GemmK1Value>,
-    GemmKBatchType GemmKBatch,
-    GemmKPadType GemmKPad)
-{
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-
-    constexpr auto GemmK1 = Number<GemmK1Value>{};
-
-    const auto N = in_n_hi_wi_c_grid_desc.GetLength(I0);
-    const auto C = in_n_hi_wi_c_grid_desc.GetLength(I3);
-    const auto K = out_n_ho_wo_k_grid_desc.GetLength(I3);
-
-    const auto Hi = in_n_hi_wi_c_grid_desc.GetLength(I1);
-    const auto Wi = in_n_hi_wi_c_grid_desc.GetLength(I2);
-
-    const auto Ho = out_n_ho_wo_k_grid_desc.GetLength(I1);
-    const auto Wo = out_n_ho_wo_k_grid_desc.GetLength(I2);
-
-    const auto Y = wei_k_y_x_c_grid_desc.GetLength(I1);
-    const auto X = wei_k_y_x_c_grid_desc.GetLength(I2);
-
-    const auto ConvStrideH = conv_strides[I0];
-    const auto ConvStrideW = conv_strides[I1];
-
-    const auto ConvDilationH = conv_dilations[I0];
-    const auto ConvDilationW = conv_dilations[I1];
-
-    const auto InLeftPadH = in_left_pads[I0];
-    const auto InLeftPadW = in_left_pads[I1];
-
-    const auto InRightPadH = in_right_pads[I0];
-    const auto InRightPadW = in_right_pads[I1];
-
-    const auto GemmM      = K;
-    const auto GemmN      = Y * X * C;
-    const auto GemmKTotal = N * Ho * Wo;
-    const index_t GemmK0  = GemmKPad / (GemmKBatch * GemmK1);
-
-    // A: output tensor
-    const auto out_gemmktotal_gemmm_grid_desc =
-        make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K));
-
-    const auto out_gemmkpad_gemmm_grid_desc = transform_tensor_descriptor(
-        out_gemmktotal_gemmm_grid_desc,
-        make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal),
-                   make_pass_through_transform(GemmM)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-    const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor(
-        out_gemmkpad_gemmm_grid_desc,
-        make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1)),
-                   make_pass_through_transform(GemmM)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}),
-        make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
-
-    // B: input tensor
-    const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor(
-        in_n_hi_wi_c_grid_desc,
-        make_tuple(make_pass_through_transform(N),
-                   make_pad_transform(Hi, InLeftPadH, InRightPadH),
-                   make_pad_transform(Wi, InLeftPadW, InRightPadW),
-                   make_pass_through_transform(C)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
-
-    const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor(
-        in_n_hip_wip_c_grid_desc,
-        make_tuple(make_pass_through_transform(N),
-                   make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
-                   make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)),
-                   make_pass_through_transform(C)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-        make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
-
-    const auto in_gemmktotal_gemmn_grid_desc =
-        transform_tensor_descriptor(in_n_y_ho_x_wo_c_grid_desc,
-                                    make_tuple(make_merge_transform(make_tuple(Y, X, C)),
-                                               make_merge_transform(make_tuple(N, Ho, Wo))),
-                                    make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}),
-                                    make_tuple(Sequence<1>{}, Sequence<0>{}));
-
-    const auto in_gemmkpad_gemmn_grid_desc = transform_tensor_descriptor(
-        in_gemmktotal_gemmn_grid_desc,
-        make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal),
-                   make_pass_through_transform(GemmN)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-    const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
-        in_gemmkpad_gemmn_grid_desc,
-        make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1)),
-                   make_pass_through_transform(GemmN)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}),
-        make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
-
-    // C: weight tensor
-    const auto wei_gemmm_gemmn_grid_desc =
-        make_naive_tensor_descriptor_packed(make_tuple(K, Y * X * C));
-
-    return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc,
-                      in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc,
-                      wei_gemmm_gemmn_grid_desc);
-}
-
-} // namespace ck
-#endif
diff --git a/include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp b/include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp
deleted file mode 100644
index 381f9ac9d..000000000
--- a/include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp
+++ /dev/null
@@ -1,260 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#ifndef CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4_NCHW_KCYX_NKHW_HPP
-#define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4_NCHW_KCYX_NKHW_HPP
-
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-
-namespace ck {
-
-// GemmM = K
-// GemmN = N * Ho * Wo
-// GemmK = C * Y * X
-template <typename... Wei,
-          typename... In,
-          typename... Out,
-          typename ConvStrides,
-          typename ConvDilations,
-          typename InLeftPads,
-          typename InRightPads>
-__host__ __device__ constexpr auto transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_pad(
-    const TensorDescriptor<Wei...>& wei_k_c_y_x_global_desc,
-    const TensorDescriptor<In...>& in_n_c_hi_wi_global_desc,
-    const TensorDescriptor<Out...>& out_n_k_ho_wo_global_desc,
-    const ConvStrides& conv_strides,
-    const ConvDilations& conv_dilations,
-    const InLeftPads& in_left_pads,
-    const InRightPads& in_right_pads)
-{
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-
-    const auto N = in_n_c_hi_wi_global_desc.GetLength(I0);
-    const auto C = in_n_c_hi_wi_global_desc.GetLength(I1);
-    const auto K = out_n_k_ho_wo_global_desc.GetLength(I1);
-
-    const auto Hi = in_n_c_hi_wi_global_desc.GetLength(I2);
-    const auto Wi = in_n_c_hi_wi_global_desc.GetLength(I3);
-
-    const auto Ho = out_n_k_ho_wo_global_desc.GetLength(I2);
-    const auto Wo = out_n_k_ho_wo_global_desc.GetLength(I3);
-
-    const auto Y = wei_k_c_y_x_global_desc.GetLength(I2);
-    const auto X = wei_k_c_y_x_global_desc.GetLength(I3);
-
-    const auto ConvStrideH = conv_strides[I0];
-    const auto ConvStrideW = conv_strides[I1];
-
-    const auto ConvDilationH = conv_dilations[I0];
-    const auto ConvDilationW = conv_dilations[I1];
-
-    const auto InLeftPadH = in_left_pads[I0];
-    const auto InLeftPadW = in_left_pads[I1];
-
-    const auto InRightPadH = in_right_pads[I0];
-    const auto InRightPadW = in_right_pads[I1];
-
-    // weight tensor
-    const auto wei_gemmk_gemmm_global_desc = transform_tensor_descriptor(
-        make_naive_tensor_descriptor_packed(make_tuple(K, C * Y * X)),
-        make_tuple(make_pass_through_transform(K), make_pass_through_transform(C * Y * X)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}),
-        make_tuple(Sequence<1>{}, Sequence<0>{}));
-
-    // input tensor
-    const auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor(
-        in_n_c_hi_wi_global_desc,
-        make_tuple(make_pass_through_transform(N),
-                   make_pass_through_transform(C),
-                   make_pad_transform(Hi, InLeftPadH, InRightPadH),
-                   make_pad_transform(Wi, InLeftPadW, InRightPadW)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
-
-    const auto in_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor(
-        in_n_c_hip_wip_global_desc,
-        make_tuple(make_pass_through_transform(N),
-                   make_pass_through_transform(C),
-                   make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
-                   make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW))),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
-
-    const auto in_gemmk_gemmn_global_desc =
-        transform_tensor_descriptor(in_n_c_y_ho_x_wo_global_desc,
-                                    make_tuple(make_merge_transform(make_tuple(C, Y, X)),
-                                               make_merge_transform(make_tuple(N, Ho, Wo))),
-                                    make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
-                                    make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-    // output tensor
-    const auto out_gemmm_gemmn_global_desc = transform_tensor_descriptor(
-        make_naive_tensor_descriptor_packed(make_tuple(N, K, Ho * Wo)),
-        make_tuple(make_pass_through_transform(K), make_merge_transform(make_tuple(N, Ho * Wo))),
-        make_tuple(Sequence<1>{}, Sequence<0, 2>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-    return make_tuple(
-        wei_gemmk_gemmm_global_desc, in_gemmk_gemmn_global_desc, out_gemmm_gemmn_global_desc);
-}
-
-template <typename... Wei,
-          typename... In,
-          typename... Out,
-          typename ConvStrides,
-          typename ConvDilations,
-          typename InLeftPads,
-          typename InRightPads>
-__host__ __device__ constexpr auto
-transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_no_pad(
-    const TensorDescriptor<Wei...>& wei_k_c_y_x_global_desc,
-    const TensorDescriptor<In...>& in_n_c_hi_wi_global_desc,
-    const TensorDescriptor<Out...>& out_n_k_ho_wo_global_desc,
-    const ConvStrides& conv_strides,
-    const ConvDilations& conv_dilations,
-    const InLeftPads& in_left_pads,
-    const InRightPads& in_right_pads)
-{
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-
-    const auto N = in_n_c_hi_wi_global_desc.GetLength(I0);
-    const auto C = in_n_c_hi_wi_global_desc.GetLength(I1);
-    const auto K = out_n_k_ho_wo_global_desc.GetLength(I1);
-
-    const auto Ho = out_n_k_ho_wo_global_desc.GetLength(I2);
-    const auto Wo = out_n_k_ho_wo_global_desc.GetLength(I3);
-
-    const auto Y = wei_k_c_y_x_global_desc.GetLength(I2);
-    const auto X = wei_k_c_y_x_global_desc.GetLength(I3);
-
-    const auto ConvStrideH = conv_strides[I0];
-    const auto ConvStrideW = conv_strides[I1];
-
-    const auto ConvDilationH = conv_dilations[I0];
-    const auto ConvDilationW = conv_dilations[I1];
-
-    const auto InLeftPadH = in_left_pads[I0];
-    const auto InLeftPadW = in_left_pads[I1];
-
-    const auto InRightPadH = in_right_pads[I0];
-    const auto InRightPadW = in_right_pads[I1];
-
-    assert(InLeftPadH == 0 && InLeftPadW == 0 && InRightPadH == 0 && InRightPadW == 0);
-
-    // weight tensor
-    const auto wei_gemmk_gemmm_global_desc = transform_tensor_descriptor(
-        make_naive_tensor_descriptor_packed(make_tuple(K, C * Y * X)),
-        make_tuple(make_pass_through_transform(K), make_pass_through_transform(C * Y * X)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}),
-        make_tuple(Sequence<1>{}, Sequence<0>{}));
-
-    // input tensor
-    const auto in_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor(
-        in_n_c_hi_wi_global_desc,
-        make_tuple(make_pass_through_transform(N),
-                   make_pass_through_transform(C),
-                   make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
-                   make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW))),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
-
-    const auto in_gemmk_gemmn_global_desc =
-        transform_tensor_descriptor(in_n_c_y_ho_x_wo_global_desc,
-                                    make_tuple(make_merge_transform(make_tuple(C, Y, X)),
-                                               make_merge_transform(make_tuple(N, Ho, Wo))),
-                                    make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
-                                    make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-    // output tensor
-    const auto out_gemmm_gemmn_global_desc = transform_tensor_descriptor(
-        make_naive_tensor_descriptor_packed(make_tuple(N, K, Ho * Wo)),
-        make_tuple(make_pass_through_transform(K), make_merge_transform(make_tuple(N, Ho * Wo))),
-        make_tuple(Sequence<1>{}, Sequence<0, 2>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-    return make_tuple(
-        wei_gemmk_gemmm_global_desc, in_gemmk_gemmn_global_desc, out_gemmm_gemmn_global_desc);
-}
-
-template <typename... Wei,
-          typename... In,
-          typename... Out,
-          typename ConvStrides,
-          typename ConvDilations,
-          typename InLeftPads,
-          typename InRightPads>
-__host__ __device__ constexpr auto transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_1x1(
-    const TensorDescriptor<Wei...>& wei_k_c_y_x_global_desc,
-    const TensorDescriptor<In...>& in_n_c_hi_wi_global_desc,
-    const TensorDescriptor<Out...>& out_n_k_ho_wo_global_desc,
-    const ConvStrides& conv_strides,
-    const ConvDilations& conv_dilations,
-    const InLeftPads& in_left_pads,
-    const InRightPads& in_right_pads)
-{
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-
-    const auto N = in_n_c_hi_wi_global_desc.GetLength(I0);
-    const auto C = in_n_c_hi_wi_global_desc.GetLength(I1);
-    const auto K = out_n_k_ho_wo_global_desc.GetLength(I1);
-
-    const auto Ho = out_n_k_ho_wo_global_desc.GetLength(I2);
-    const auto Wo = out_n_k_ho_wo_global_desc.GetLength(I3);
-
-    const auto Y = wei_k_c_y_x_global_desc.GetLength(I2);
-    const auto X = wei_k_c_y_x_global_desc.GetLength(I3);
-
-    const auto ConvStrideH = conv_strides[I0];
-    const auto ConvStrideW = conv_strides[I1];
-
-    const auto ConvDilationH = conv_dilations[I0];
-    const auto ConvDilationW = conv_dilations[I1];
-
-    const auto InLeftPadH = in_left_pads[I0];
-    const auto InLeftPadW = in_left_pads[I1];
-
-    const auto InRightPadH = in_right_pads[I0];
-    const auto InRightPadW = in_right_pads[I1];
-
-    assert(Y == 1 && X == 1 && ConvStrideH == 1 && ConvStrideW == 1 && ConvDilationH == 1 &&
-           ConvDilationW == 1 && InLeftPadH == 0 && InLeftPadW == 0 && InRightPadH == 0 &&
-           InRightPadW == 0);
-
-    // weight tensor
-    const auto wei_gemmk_gemmm_global_desc = transform_tensor_descriptor(
-        make_naive_tensor_descriptor_packed(make_tuple(K, C)),
-        make_tuple(make_pass_through_transform(K), make_pass_through_transform(C)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}),
-        make_tuple(Sequence<1>{}, Sequence<0>{}));
-
-    // input tensor
-    const auto in_gemmk_gemmn_global_desc = transform_tensor_descriptor(
-        in_n_c_hi_wi_global_desc,
-        make_tuple(make_pass_through_transform(C), make_merge_transform(make_tuple(N, Ho, Wo))),
-        make_tuple(Sequence<1>{}, Sequence<0, 2, 3>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-    // output tensor
-    const auto out_gemmm_gemmn_global_desc = transform_tensor_descriptor(
-        make_naive_tensor_descriptor_packed(make_tuple(N, K, Ho * Wo)),
-        make_tuple(make_pass_through_transform(K), make_merge_transform(make_tuple(N, Ho * Wo))),
-        make_tuple(Sequence<1>{}, Sequence<0, 2>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-    return make_tuple(
-        wei_gemmk_gemmm_global_desc, in_gemmk_gemmn_global_desc, out_gemmm_gemmn_global_desc);
-}
-
-} // namespace ck
-#endif
diff --git a/include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4_nhwc_kyxc_nhwk.hpp b/include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4_nhwc_kyxc_nhwk.hpp
deleted file mode 100644
index ebfaabb03..000000000
--- a/include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4_nhwc_kyxc_nhwk.hpp
+++ /dev/null
@@ -1,179 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#ifndef CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4_NHWC_KYXC_NHWK_HPP
-#define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4_NHWC_KYXC_NHWK_HPP
-
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-
-namespace ck {
-
-// GemmM = K
-// GemmN = N * Ho * Wo
-// GemmK = C * Y * X
-template <typename... Wei,
-          typename... In,
-          typename... Out,
-          typename ConvStrides,
-          typename ConvDilations,
-          typename InLeftPads,
-          typename InRightPads>
-__host__ __device__ constexpr auto transform_forward_convolution_into_gemm_v4r4_nhwc_kyxc_nhwk_pad(
-    const TensorDescriptor<Wei...>& wei_k_y_x_c_grid_desc,
-    const TensorDescriptor<In...>& in_n_hi_wi_c_grid_desc,
-    const TensorDescriptor<Out...>& out_n_ho_wo_k_grid_desc,
-    const ConvStrides& conv_strides,
-    const ConvDilations& conv_dilations,
-    const InLeftPads& in_left_pads,
-    const InRightPads& in_right_pads)
-{
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-
-    const auto N = in_n_hi_wi_c_grid_desc.GetLength(I0);
-    const auto C = in_n_hi_wi_c_grid_desc.GetLength(I3);
-    const auto K = out_n_ho_wo_k_grid_desc.GetLength(I3);
-
-    const auto Hi = in_n_hi_wi_c_grid_desc.GetLength(I1);
-    const auto Wi = in_n_hi_wi_c_grid_desc.GetLength(I2);
-
-    const auto Ho = out_n_ho_wo_k_grid_desc.GetLength(I1);
-    const auto Wo = out_n_ho_wo_k_grid_desc.GetLength(I2);
-
-    const auto Y = wei_k_y_x_c_grid_desc.GetLength(I1);
-    const auto X = wei_k_y_x_c_grid_desc.GetLength(I2);
-
-    const auto ConvStrideH = conv_strides[I0];
-    const auto ConvStrideW = conv_strides[I1];
-
-    const auto ConvDilationH = conv_dilations[I0];
-    const auto ConvDilationW = conv_dilations[I1];
-
-    const auto InLeftPadH = in_left_pads[I0];
-    const auto InLeftPadW = in_left_pads[I1];
-
-    const auto InRightPadH = in_right_pads[I0];
-    const auto InRightPadW = in_right_pads[I1];
-
-    // weight tensor
-    const auto wei_gemmk_gemmm_grid_desc = transform_tensor_descriptor(
-        make_naive_tensor_descriptor_packed(make_tuple(K, Y * X * C)),
-        make_tuple(make_pass_through_transform(K), make_pass_through_transform(Y * X * C)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}),
-        make_tuple(Sequence<1>{}, Sequence<0>{}));
-
-    // input tensor
-    const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor(
-        in_n_hi_wi_c_grid_desc,
-        make_tuple(make_pass_through_transform(N),
-                   make_pad_transform(Hi, InLeftPadH, InRightPadH),
-                   make_pad_transform(Wi, InLeftPadW, InRightPadW),
-                   make_pass_through_transform(C)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
-
-    const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor(
-        in_n_hip_wip_c_grid_desc,
-        make_tuple(make_pass_through_transform(N),
-                   make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
-                   make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)),
-                   make_pass_through_transform(C)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-        make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
-
-    const auto in_gemmk_gemmn_grid_desc =
-        transform_tensor_descriptor(in_n_y_ho_x_wo_c_grid_desc,
-                                    make_tuple(make_merge_transform(make_tuple(Y, X, C)),
-                                               make_merge_transform(make_tuple(N, Ho, Wo))),
-                                    make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}),
-                                    make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-    // output tensor
-    const auto out_gemmm_gemmn_grid_desc = transform_tensor_descriptor(
-        make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)),
-        make_tuple(make_pass_through_transform(N * Ho * Wo), make_pass_through_transform(K)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}),
-        make_tuple(Sequence<1>{}, Sequence<0>{}));
-
-    return make_tuple(
-        wei_gemmk_gemmm_grid_desc, in_gemmk_gemmn_grid_desc, out_gemmm_gemmn_grid_desc);
-}
-
-template <typename... Wei,
-          typename... In,
-          typename... Out,
-          typename ConvStrides,
-          typename ConvDilations,
-          typename InLeftPads,
-          typename InRightPads>
-__host__ __device__ constexpr auto transform_forward_convolution_into_gemm_v4r4_nhwc_kyxc_nhwk_1x1(
-    const TensorDescriptor<Wei...>& wei_k_y_x_c_grid_desc,
-    const TensorDescriptor<In...>& in_n_hi_wi_c_grid_desc,
-    const TensorDescriptor<Out...>& out_n_ho_wo_k_grid_desc,
-    const ConvStrides& conv_strides,
-    const ConvDilations& conv_dilations,
-    const InLeftPads& in_left_pads,
-    const InRightPads& in_right_pads)
-{
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-
-    const auto N = in_n_hi_wi_c_grid_desc.GetLength(I0);
-    const auto C = in_n_hi_wi_c_grid_desc.GetLength(I3);
-    const auto K = out_n_ho_wo_k_grid_desc.GetLength(I3);
-
-    const auto Ho = out_n_ho_wo_k_grid_desc.GetLength(I1);
-    const auto Wo = out_n_ho_wo_k_grid_desc.GetLength(I2);
-
-    const auto Y = wei_k_y_x_c_grid_desc.GetLength(I1);
-    const auto X = wei_k_y_x_c_grid_desc.GetLength(I2);
-
-    const auto ConvStrideH = conv_strides[I0];
-    const auto ConvStrideW = conv_strides[I1];
-
-    const auto ConvDilationH = conv_dilations[I0];
-    const auto ConvDilationW = conv_dilations[I1];
-
-    const auto InLeftPadH = in_left_pads[I0];
-    const auto InLeftPadW = in_left_pads[I1];
-
-    const auto InRightPadH = in_right_pads[I0];
-    const auto InRightPadW = in_right_pads[I1];
-
-    assert(Y == 1 && X == 1 && ConvStrideH == 1 && ConvStrideW == 1 && ConvDilationH == 1 &&
-           ConvDilationW == 1 && InLeftPadH == 0 && InLeftPadW == 0 && InRightPadH == 0 &&
-           InRightPadW == 0);
-
-    // weight tensor
-    const auto wei_gemmk_gemmm_grid_desc = transform_tensor_descriptor(
-        make_naive_tensor_descriptor_packed(make_tuple(K, C)),
-        make_tuple(make_pass_through_transform(K), make_pass_through_transform(C)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}),
-        make_tuple(Sequence<1>{}, Sequence<0>{}));
-
-    // input tensor
-    const auto in_gemmk_gemmn_grid_desc = transform_tensor_descriptor(
-        make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, C)),
-        make_tuple(make_pass_through_transform(N * Ho * Wo), make_pass_through_transform(C)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}),
-        make_tuple(Sequence<1>{}, Sequence<0>{}));
-
-    // output tensor
-    const auto out_gemmm_gemmn_grid_desc = transform_tensor_descriptor(
-        make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)),
-        make_tuple(make_pass_through_transform(N * Ho * Wo), make_pass_through_transform(K)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}),
-        make_tuple(Sequence<1>{}, Sequence<0>{}));
-
-    return make_tuple(
-        wei_gemmk_gemmm_grid_desc, in_gemmk_gemmn_grid_desc, out_gemmm_gemmn_grid_desc);
-}
-
-} // namespace ck
-#endif
diff --git a/include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp b/include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp
deleted file mode 100644
index 6e576d69f..000000000
--- a/include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp
+++ /dev/null
@@ -1,132 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#ifndef CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R2_NCHW_KCYX_NKHW_HPP
-#define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R2_NCHW_KCYX_NKHW_HPP
-
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-
-namespace ck {
-
-// GemmM = K
-// GemmN = N * Ho * Wo
-// GemmK = C * Y * X
-template <typename... Wei,
-          typename... In,
-          typename... Out,
-          typename ConvStrides,
-          typename ConvDilations,
-          typename InLeftPads,
-          typename InRightPads,
-          index_t GemmK1Value>
-__host__ __device__ constexpr auto
-transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw_pad(
-    const TensorDescriptor<Wei...>& wei_k_c_y_x_grid_desc,
-    const TensorDescriptor<In...>& in_n_c_hi_wi_grid_desc,
-    const TensorDescriptor<Out...>& out_n_k_ho_wo_grid_desc,
-    const ConvStrides& conv_strides,
-    const ConvDilations& conv_dilations,
-    const InLeftPads& in_left_pads,
-    const InRightPads& in_right_pads,
-    Number<GemmK1Value>)
-{
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-
-    constexpr auto GemmK1 = Number<GemmK1Value>{};
-
-    const auto N = in_n_c_hi_wi_grid_desc.GetLength(I0);
-    const auto C = in_n_c_hi_wi_grid_desc.GetLength(I1);
-    const auto K = out_n_k_ho_wo_grid_desc.GetLength(I1);
-
-    const auto Hi = in_n_c_hi_wi_grid_desc.GetLength(I2);
-    const auto Wi = in_n_c_hi_wi_grid_desc.GetLength(I3);
-
-    const auto Ho = out_n_k_ho_wo_grid_desc.GetLength(I2);
-    const auto Wo = out_n_k_ho_wo_grid_desc.GetLength(I3);
-
-    const auto Y = wei_k_c_y_x_grid_desc.GetLength(I2);
-    const auto X = wei_k_c_y_x_grid_desc.GetLength(I3);
-
-    const auto ConvStrideH = conv_strides[I0];
-    const auto ConvStrideW = conv_strides[I1];
-
-    const auto ConvDilationH = conv_dilations[I0];
-    const auto ConvDilationW = conv_dilations[I1];
-
-    const auto InLeftPadH = in_left_pads[I0];
-    const auto InLeftPadW = in_left_pads[I1];
-
-    const auto InRightPadH = in_right_pads[I0];
-    const auto InRightPadW = in_right_pads[I1];
-
-    const auto GemmM  = K;
-    const auto GemmN  = N * Ho * Wo;
-    const auto GemmK  = C * Y * X;
-    const auto GemmK0 = GemmK / GemmK1;
-
-    // weight tensor
-    const auto wei_gemmk_gemmm_grid_desc = transform_tensor_descriptor(
-        make_naive_tensor_descriptor_packed(make_tuple(K, C * Y * X)),
-        make_tuple(make_pass_through_transform(K), make_pass_through_transform(C * Y * X)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}),
-        make_tuple(Sequence<1>{}, Sequence<0>{}));
-
-    const auto wei_gemmk0_gemmm_gemmk1_grid_desc =
-        transform_tensor_descriptor(wei_gemmk_gemmm_grid_desc,
-                                    make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)),
-                                               make_pass_through_transform(GemmM)),
-                                    make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                    make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-    // input tensor
-    const auto in_n_c_hip_wip_grid_desc = transform_tensor_descriptor(
-        in_n_c_hi_wi_grid_desc,
-        make_tuple(make_pass_through_transform(N),
-                   make_pass_through_transform(C),
-                   make_pad_transform(Hi, InLeftPadH, InRightPadH),
-                   make_pad_transform(Wi, InLeftPadW, InRightPadW)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
-
-    const auto in_n_c_y_ho_x_wo_grid_desc = transform_tensor_descriptor(
-        in_n_c_hip_wip_grid_desc,
-        make_tuple(make_pass_through_transform(N),
-                   make_pass_through_transform(C),
-                   make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
-                   make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW))),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
-
-    const auto in_gemmk_gemmn_grid_desc =
-        transform_tensor_descriptor(in_n_c_y_ho_x_wo_grid_desc,
-                                    make_tuple(make_merge_transform(make_tuple(C, Y, X)),
-                                               make_merge_transform(make_tuple(N, Ho, Wo))),
-                                    make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
-                                    make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-    const auto in_gemmk0_gemmn_gemmk1_grid_desc =
-        transform_tensor_descriptor(in_gemmk_gemmn_grid_desc,
-                                    make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)),
-                                               make_pass_through_transform(GemmN)),
-                                    make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                    make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-    // output tensor
-    const auto out_gemmm_gemmn_grid_desc = transform_tensor_descriptor(
-        make_naive_tensor_descriptor_packed(make_tuple(N, K, Ho * Wo)),
-        make_tuple(make_pass_through_transform(K), make_merge_transform(make_tuple(N, Ho * Wo))),
-        make_tuple(Sequence<1>{}, Sequence<0, 2>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-    return make_tuple(wei_gemmk0_gemmm_gemmk1_grid_desc,
-                      in_gemmk0_gemmn_gemmk1_grid_desc,
-                      out_gemmm_gemmn_grid_desc);
-}
-
-} // namespace ck
-#endif
diff --git a/include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp b/include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp
deleted file mode 100644
index 13e1bf251..000000000
--- a/include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp
+++ /dev/null
@@ -1,132 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#ifndef CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R2_NHWC_KYXC_NHWK_HPP
-#define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R2_NHWC_KYXC_NHWK_HPP
-
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-
-namespace ck {
-
-// GemmM = K
-// GemmN = N * Ho * Wo
-// GemmK = C * Y * X
-template <typename... Wei,
-          typename... In,
-          typename... Out,
-          typename ConvStrides,
-          typename ConvDilations,
-          typename InLeftPads,
-          typename InRightPads,
-          index_t GemmK1Value>
-__host__ __device__ constexpr auto
-transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk_pad(
-    const TensorDescriptor<Wei...>& wei_k_y_x_c_grid_desc,
-    const TensorDescriptor<In...>& in_n_hi_wi_c_grid_desc,
-    const TensorDescriptor<Out...>& out_n_ho_wo_k_grid_desc,
-    const ConvStrides& conv_strides,
-    const ConvDilations& conv_dilations,
-    const InLeftPads& in_left_pads,
-    const InRightPads& in_right_pads,
-    Number<GemmK1Value>)
-{
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-
-    constexpr auto GemmK1 = Number<GemmK1Value>{};
-
-    const auto N = in_n_hi_wi_c_grid_desc.GetLength(I0);
-    const auto C = in_n_hi_wi_c_grid_desc.GetLength(I3);
-    const auto K = out_n_ho_wo_k_grid_desc.GetLength(I3);
-
-    const auto Hi = in_n_hi_wi_c_grid_desc.GetLength(I1);
-    const auto Wi = in_n_hi_wi_c_grid_desc.GetLength(I2);
-
-    const auto Ho = out_n_ho_wo_k_grid_desc.GetLength(I1);
-    const auto Wo = out_n_ho_wo_k_grid_desc.GetLength(I2);
-
-    const auto Y = wei_k_y_x_c_grid_desc.GetLength(I1);
-    const auto X = wei_k_y_x_c_grid_desc.GetLength(I2);
-
-    const auto ConvStrideH = conv_strides[I0];
-    const auto ConvStrideW = conv_strides[I1];
-
-    const auto ConvDilationH = conv_dilations[I0];
-    const auto ConvDilationW = conv_dilations[I1];
-
-    const auto InLeftPadH = in_left_pads[I0];
-    const auto InLeftPadW = in_left_pads[I1];
-
-    const auto InRightPadH = in_right_pads[I0];
-    const auto InRightPadW = in_right_pads[I1];
-
-    const auto GemmM  = K;
-    const auto GemmN  = N * Ho * Wo;
-    const auto GemmK  = C * Y * X;
-    const auto GemmK0 = GemmK / GemmK1;
-
-    // weight tensor
-    const auto wei_gemmk_gemmm_grid_desc = transform_tensor_descriptor(
-        make_naive_tensor_descriptor_packed(make_tuple(K, Y * X * C)),
-        make_tuple(make_pass_through_transform(K), make_pass_through_transform(Y * X * C)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}),
-        make_tuple(Sequence<1>{}, Sequence<0>{}));
-
-    const auto wei_gemmk0_gemmm_gemmk1_grid_desc =
-        transform_tensor_descriptor(wei_gemmk_gemmm_grid_desc,
-                                    make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)),
-                                               make_pass_through_transform(GemmM)),
-                                    make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                    make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-    // input tensor
-    const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor(
-        in_n_hi_wi_c_grid_desc,
-        make_tuple(make_pass_through_transform(N),
-                   make_pad_transform(Hi, InLeftPadH, InRightPadH),
-                   make_pad_transform(Wi, InLeftPadW, InRightPadW),
-                   make_pass_through_transform(C)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
-
-    const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor(
-        in_n_hip_wip_c_grid_desc,
-        make_tuple(make_pass_through_transform(N),
-                   make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
-                   make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)),
-                   make_pass_through_transform(C)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-        make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
-
-    const auto in_gemmk_gemmn_grid_desc =
-        transform_tensor_descriptor(in_n_y_ho_x_wo_c_grid_desc,
-                                    make_tuple(make_merge_transform(make_tuple(Y, X, C)),
-                                               make_merge_transform(make_tuple(N, Ho, Wo))),
-                                    make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}),
-                                    make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-    const auto in_gemmk0_gemmn_gemmk1_grid_desc =
-        transform_tensor_descriptor(in_gemmk_gemmn_grid_desc,
-                                    make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)),
-                                               make_pass_through_transform(GemmN)),
-                                    make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                    make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-    // output tensor
-    const auto out_gemmm_gemmn_grid_desc = transform_tensor_descriptor(
-        make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)),
-        make_tuple(make_pass_through_transform(N * Ho * Wo), make_pass_through_transform(K)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}),
-        make_tuple(Sequence<1>{}, Sequence<0>{}));
-
-    return make_tuple(wei_gemmk0_gemmm_gemmk1_grid_desc,
-                      in_gemmk0_gemmn_gemmk1_grid_desc,
-                      out_gemmm_gemmn_grid_desc);
-}
-
-} // namespace ck
-#endif
diff --git a/include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp b/include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp
deleted file mode 100644
index 088d14b2e..000000000
--- a/include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp
+++ /dev/null
@@ -1,134 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#ifndef CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R4_NHWC_KYXC_NHWK_HPP
-#define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R4_NHWC_KYXC_NHWK_HPP
-
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-
-namespace ck {
-
-// A: in
-// B: wei
-// C: out
-// GemmM = N * Ho * Wo
-// GemmN = K
-// GemmK = Y * X * C
-template <typename... In,
-          typename... Wei,
-          typename... Out,
-          typename ConvStrides,
-          typename ConvDilations,
-          typename InLeftPads,
-          typename InRightPads,
-          index_t GemmK1Value>
-__host__ __device__ constexpr auto transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk(
-    const TensorDescriptor<In...>& in_n_hi_wi_c_grid_desc,
-    const TensorDescriptor<Wei...>& wei_k_y_x_c_grid_desc,
-    const TensorDescriptor<Out...>& out_n_ho_wo_k_grid_desc,
-    const ConvStrides& conv_strides,
-    const ConvDilations& conv_dilations,
-    const InLeftPads& in_left_pads,
-    const InRightPads& in_right_pads,
-    Number<GemmK1Value>)
-{
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-
-    constexpr auto GemmK1 = Number<GemmK1Value>{};
-
-    const auto N = in_n_hi_wi_c_grid_desc.GetLength(I0);
-    const auto C = in_n_hi_wi_c_grid_desc.GetLength(I3);
-    const auto K = out_n_ho_wo_k_grid_desc.GetLength(I3);
-
-    const auto Hi = in_n_hi_wi_c_grid_desc.GetLength(I1);
-    const auto Wi = in_n_hi_wi_c_grid_desc.GetLength(I2);
-
-    const auto Ho = out_n_ho_wo_k_grid_desc.GetLength(I1);
-    const auto Wo = out_n_ho_wo_k_grid_desc.GetLength(I2);
-
-    const auto Y = wei_k_y_x_c_grid_desc.GetLength(I1);
-    const auto X = wei_k_y_x_c_grid_desc.GetLength(I2);
-
-    const auto ConvStrideH = conv_strides[I0];
-    const auto ConvStrideW = conv_strides[I1];
-
-    const auto ConvDilationH = conv_dilations[I0];
-    const auto ConvDilationW = conv_dilations[I1];
-
-    const auto InLeftPadH = in_left_pads[I0];
-    const auto InLeftPadW = in_left_pads[I1];
-
-    const auto InRightPadH = in_right_pads[I0];
-    const auto InRightPadW = in_right_pads[I1];
-
-    const auto GemmM  = N * Ho * Wo;
-    const auto GemmN  = K;
-    const auto GemmK  = Y * X * C;
-    const auto GemmK0 = GemmK / GemmK1;
-
-    // A: input tensor
-    const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor(
-        in_n_hi_wi_c_grid_desc,
-        make_tuple(make_pass_through_transform(N),
-                   make_pad_transform(Hi, InLeftPadH, InRightPadH),
-                   make_pad_transform(Wi, InLeftPadW, InRightPadW),
-                   make_pass_through_transform(C)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
-
-    const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor(
-        in_n_hip_wip_c_grid_desc,
-        make_tuple(make_pass_through_transform(N),
-                   make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
-                   make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)),
-                   make_pass_through_transform(C)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-        make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
-
-    const auto in_gemmk_gemmm_grid_desc =
-        transform_tensor_descriptor(in_n_y_ho_x_wo_c_grid_desc,
-                                    make_tuple(make_merge_transform(make_tuple(Y, X, C)),
-                                               make_merge_transform(make_tuple(N, Ho, Wo))),
-                                    make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}),
-                                    make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-    const auto in_gemmk0_gemmm_gemmk1_grid_desc =
-        transform_tensor_descriptor(in_gemmk_gemmm_grid_desc,
-                                    make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)),
-                                               make_pass_through_transform(GemmM)),
-                                    make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                    make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-    // B: weight tensor
-    const auto wei_gemmk_gemmn_grid_desc = transform_tensor_descriptor(
-        make_naive_tensor_descriptor_packed(make_tuple(K, Y * X * C)),
-        make_tuple(make_pass_through_transform(K), make_pass_through_transform(Y * X * C)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}),
-        make_tuple(Sequence<1>{}, Sequence<0>{}));
-
-    const auto wei_gemmk0_gemmn_gemmk1_grid_desc =
-        transform_tensor_descriptor(wei_gemmk_gemmn_grid_desc,
-                                    make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)),
-                                               make_pass_through_transform(GemmN)),
-                                    make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                    make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-    // C: output tensor
-    const auto out_gemmm_gemmn_grid_desc = transform_tensor_descriptor(
-        make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)),
-        make_tuple(make_pass_through_transform(N * Ho * Wo), make_pass_through_transform(K)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-    return make_tuple(in_gemmk0_gemmm_gemmk1_grid_desc,
-                      wei_gemmk0_gemmn_gemmk1_grid_desc,
-                      out_gemmm_gemmn_grid_desc);
-}
-
-} // namespace ck
-#endif
diff --git a/include/ck/problem_transform/transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp b/include/ck/problem_transform/transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp
deleted file mode 100644
index a6785d56d..000000000
--- a/include/ck/problem_transform/transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp
+++ /dev/null
@@ -1,135 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#ifndef CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_CONTRACTION_V6R1_NCHW_KCYX_NKHW_HPP
-#define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_CONTRACTION_V6R1_NCHW_KCYX_NKHW_HPP
-
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-
-namespace ck {
-
-// GemmM0 = 1
-// GemmM1 = K
-// GemmN0 = N0
-// GemmN1 = (N / N0) * Ho * Wo
-// GemmK0 = (C / C0) * Y * X
-// GemmK1 = C0
-template <typename... Wei,
-          typename... In,
-          typename... Out,
-          typename ConvStrides,
-          typename ConvDilations,
-          typename InLeftPads,
-          typename InRightPads,
-          typename N0Type,
-          typename C0Type>
-__host__ __device__ constexpr auto
-transform_forward_convolution_into_contraction_v6r1_nchw_kcyx_nkhw_pad(
-    const TensorDescriptor<Wei...>& wei_k_c_y_x_grid_desc,
-    const TensorDescriptor<In...>& in_n_c_hi_wi_grid_desc,
-    const TensorDescriptor<Out...>& out_n_k_ho_wo_grid_desc,
-    const ConvStrides& conv_strides,
-    const ConvDilations& conv_dilations,
-    const InLeftPads& in_left_pads,
-    const InRightPads& in_right_pads,
-    const N0Type& N0,
-    const C0Type& C0)
-{
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-
-    const auto N = in_n_c_hi_wi_grid_desc.GetLength(I0);
-    const auto C = in_n_c_hi_wi_grid_desc.GetLength(I1);
-    const auto K = out_n_k_ho_wo_grid_desc.GetLength(I1);
-
-    const auto Hi = in_n_c_hi_wi_grid_desc.GetLength(I2);
-    const auto Wi = in_n_c_hi_wi_grid_desc.GetLength(I3);
-
-    const auto Ho = out_n_k_ho_wo_grid_desc.GetLength(I2);
-    const auto Wo = out_n_k_ho_wo_grid_desc.GetLength(I3);
-
-    const auto Y = wei_k_c_y_x_grid_desc.GetLength(I2);
-    const auto X = wei_k_c_y_x_grid_desc.GetLength(I3);
-
-    const auto ConvStrideH = conv_strides[I0];
-    const auto ConvStrideW = conv_strides[I1];
-
-    const auto ConvDilationH = conv_dilations[I0];
-    const auto ConvDilationW = conv_dilations[I1];
-
-    const auto InLeftPadH = in_left_pads[I0];
-    const auto InLeftPadW = in_left_pads[I1];
-
-    const auto InRightPadH = in_right_pads[I0];
-    const auto InRightPadW = in_right_pads[I1];
-
-    const auto N1 = N / N0;
-    const auto C1 = C / C0;
-
-    // weight tensor
-    const auto wei_gk0_gm0_gm1_gk1_grid_desc =
-        transform_tensor_descriptor(make_naive_tensor_descriptor_packed(make_tuple(K, C * Y * X)),
-                                    make_tuple(make_unmerge_transform(make_tuple(I1, K)),
-                                               make_unmerge_transform(make_tuple(C0, C1 * Y * X))),
-                                    make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                    make_tuple(Sequence<1, 2>{}, Sequence<3, 0>{}));
-
-    // input tensor
-    const auto in_n_c_hip_wip_grid_desc = transform_tensor_descriptor(
-        in_n_c_hi_wi_grid_desc,
-        make_tuple(make_pass_through_transform(N),
-                   make_pass_through_transform(C),
-                   make_pad_transform(Hi, InLeftPadH, InRightPadH),
-                   make_pad_transform(Wi, InLeftPadW, InRightPadW)),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
-
-    const auto in_n0_n1_c0_c1_y_ho_x_wo_grid_desc = transform_tensor_descriptor(
-        in_n_c_hip_wip_grid_desc,
-        make_tuple(make_unmerge_transform(make_tuple(N0, N1)),
-                   make_unmerge_transform(make_tuple(C0, C1)),
-                   make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
-                   make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW))),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-        make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}, Sequence<6, 7>{}));
-
-    const auto in_gk0_gn0_gn1_gk1_grid_desc = transform_tensor_descriptor(
-        in_n0_n1_c0_c1_y_ho_x_wo_grid_desc,
-        make_tuple(make_merge_transform(make_tuple(C1, Y, X)),
-                   make_pass_through_transform(N0),
-                   make_merge_transform(make_tuple(N1, Ho, Wo)),
-                   make_pass_through_transform(C0)),
-        make_tuple(Sequence<3, 4, 6>{}, Sequence<0>{}, Sequence<1, 5, 7>{}, Sequence<2>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
-
-    // output tensor
-    const auto out_n_k_howo_grid_desc =
-        make_naive_tensor_descriptor_packed(make_tuple(N, K, Ho * Wo));
-
-    const auto out_n0_n1_1_k_howo_grid_desc =
-        transform_tensor_descriptor(out_n_k_howo_grid_desc,
-                                    make_tuple(make_unmerge_transform(make_tuple(N0, N1)),
-                                               make_unmerge_transform(make_tuple(I1, K)),
-                                               make_pass_through_transform(Ho * Wo)),
-                                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
-                                    make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}, Sequence<4>{}));
-
-    const auto out_gm0_gm1_gn0_gn1_grid_desc = transform_tensor_descriptor(
-        out_n0_n1_1_k_howo_grid_desc,
-        make_tuple(make_pass_through_transform(I1),
-                   make_pass_through_transform(K),
-                   make_pass_through_transform(N0),
-                   make_merge_transform_v2_magic_division(make_tuple(N1, Ho * Wo))),
-        make_tuple(Sequence<2>{}, Sequence<3>{}, Sequence<0>{}, Sequence<1, 4>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
-
-    return make_tuple(
-        wei_gk0_gm0_gm1_gk1_grid_desc, in_gk0_gn0_gn1_gk1_grid_desc, out_gm0_gm1_gn0_gn1_grid_desc);
-}
-
-} // namespace ck
-#endif
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_bias_e_permute_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_bias_e_permute_xdl.hpp
deleted file mode 100644
index 9f9fe0f1c..000000000
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_bias_e_permute_xdl.hpp
+++ /dev/null
@@ -1,586 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <iostream>
-#include <sstream>
-
-#include "ck/utility/common_header.hpp"
-#include "ck/tensor_description/tensor_descriptor.hpp"
-#include "ck/tensor_description/tensor_descriptor_helper.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/device_gemm_bias_e_permute.hpp"
-#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
-#include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp"
-#include "ck/host_utility/device_prop.hpp"
-#include "ck/host_utility/kernel_launch.hpp"
-
-namespace ck {
-
-template <typename GridwiseGemm,
-          typename FloatAB,
-          typename FloatDsPointer,
-          typename FloatE,
-          typename AElementwiseOperation,
-          typename BElementwiseOperation,
-          typename CDEElementwiseOperation,
-          typename AGridDesc_AK0_M_AK1,
-          typename BGridDesc_BK0_N_BK1,
-          typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
-          typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
-          typename Block2ETileMap,
-          bool HasMainKBlockLoop>
-__global__ void
-#if CK_USE_LAUNCH_BOUNDS
-    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
-#endif
-        kernel_gemm_bias_e_permute(const FloatAB* __restrict__ p_a_grid,
-                                   const FloatAB* __restrict__ p_b_grid,
-                                   FloatDsPointer p_ds_grid,
-                                   FloatE* __restrict__ p_e_grid,
-                                   const AElementwiseOperation a_element_op,
-                                   const BElementwiseOperation b_element_op,
-                                   const CDEElementwiseOperation cde_element_op,
-                                   const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
-                                   const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
-                                   const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-                                       ds_grid_desc_mblock_mperblock_nblock_nperblock,
-                                   const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-                                       e_grid_desc_mblock_mperblock_nblock_nperblock,
-                                   const Block2ETileMap block_2_etile_map)
-{
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__))
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-
-    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
-                                                  p_b_grid,
-                                                  p_ds_grid,
-                                                  p_e_grid,
-                                                  p_shared,
-                                                  a_element_op,
-                                                  b_element_op,
-                                                  cde_element_op,
-                                                  a_grid_desc_ak0_m_ak1,
-                                                  b_grid_desc_bk0_n_bk1,
-                                                  ds_grid_desc_mblock_mperblock_nblock_nperblock,
-                                                  e_grid_desc_mblock_mperblock_nblock_nperblock,
-                                                  block_2_etile_map);
-#else
-    ignore = p_a_grid;
-    ignore = p_b_grid;
-    ignore = p_ds_grid;
-    ignore = p_e_grid;
-    ignore = a_element_op;
-    ignore = b_element_op;
-    ignore = cde_element_op;
-    ignore = a_grid_desc_ak0_m_ak1;
-    ignore = b_grid_desc_bk0_n_bk1;
-    ignore = ds_grid_desc_mblock_mperblock_nblock_nperblock;
-    ignore = e_grid_desc_mblock_mperblock_nblock_nperblock;
-    ignore = block_2_etile_map;
-#endif
-}
-
-} // namespace ck
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-
-// input : A[M, K], or A[K, N]
-// input : B[K, N], or A[N, K]
-// input : D0[M, N], D1[M, N], ...
-// output : E[M, N]
-// C = a_op(A) * b_op(B)
-// E = cde_op(C, D0, D1, ...)
-template <typename ALayout,
-          typename BLayout,
-          typename CDELayout,
-          typename ADataType,
-          typename BDataType,
-          typename AccDataType,
-          typename CShuffleDataType,
-          typename DDataType,
-          typename EDataType,
-          typename AElementwiseOperation,
-          typename BElementwiseOperation,
-          typename CDEElementwiseOperation,
-          GemmSpecialization GemmSpec,
-          index_t NumGemmKPrefetchStage,
-          index_t BlockSize,
-          index_t MPerBlock,
-          index_t NPerBlock,
-          index_t KPerBlock,
-          index_t AK1,
-          index_t BK1,
-          index_t MPerXDL,
-          index_t NPerXDL,
-          index_t MXdlPerWave,
-          index_t NXdlPerWave,
-          typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
-          typename ABlockTransferThreadClusterArrangeOrder,
-          typename ABlockTransferSrcAccessOrder,
-          index_t ABlockTransferSrcVectorDim,
-          index_t ABlockTransferSrcScalarPerVector,
-          index_t ABlockTransferDstScalarPerVector_AK1,
-          index_t ABlockLdsExtraM,
-          typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
-          typename BBlockTransferThreadClusterArrangeOrder,
-          typename BBlockTransferSrcAccessOrder,
-          index_t BBlockTransferSrcVectorDim,
-          index_t BBlockTransferSrcScalarPerVector,
-          index_t BBlockTransferDstScalarPerVector_BK1,
-          index_t BBlockLdsExtraN,
-          index_t CShuffleMXdlPerWavePerShuffle,
-          index_t CShuffleNXdlPerWavePerShuffle,
-          typename CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
-          index_t CDEBlockTransferScalarPerVector_NPerBlock,
-          LoopScheduler LoopSched = make_default_loop_scheduler()>
-struct DeviceGemmBiasEPermute_Xdl : public DeviceGemmBiasCPermute<AElementwiseOperation,
-                                                                  BElementwiseOperation,
-                                                                  CDEElementwiseOperation>
-{
-    using DeviceOp = DeviceGemmBiasEPermute_Xdl;
-
-    static constexpr auto I0 = Number<0>{};
-    static constexpr auto I1 = Number<1>{};
-    static constexpr auto I2 = Number<2>{};
-    static constexpr auto I3 = Number<3>{};
-
-    static constexpr auto matrix_padder =
-        MatrixPadder<GemmSpec, index_t, index_t, index_t>{MPerBlock, NPerBlock, KPerBlock};
-
-    static constexpr index_t NumDTensor = 1;
-
-    static auto MakeAGridDescriptor_M_K(index_t MRaw, index_t KRaw, index_t StrideA)
-    {
-        const auto a_grid_desc_mraw_kraw = [&]() {
-            if constexpr(is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
-            {
-                return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
-                                                    make_tuple(StrideA, I1));
-            }
-            else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
-            {
-                return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
-                                                    make_tuple(I1, StrideA));
-            }
-        }();
-
-        return matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw);
-    }
-
-    static auto MakeBGridDescriptor_N_K(index_t KRaw, index_t NRaw, index_t StrideB)
-    {
-        const auto b_grid_desc_nraw_kraw = [&]() {
-            if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
-                                                    make_tuple(I1, StrideB));
-            }
-            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
-                                                    make_tuple(StrideB, I1));
-            }
-        }();
-
-        return matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw);
-    }
-
-    static auto MakeEGridDescriptor_M_N(DEGridDesc_M0_M1_M2_N0_N1 d_e_grid_desc)
-    {
-        index_t M0 = d_e_grid_desc.M0_;
-        index_t M1 = d_e_grid_desc.M1_;
-        index_t M2 = d_e_grid_desc.M2_;
-        index_t N0 = d_e_grid_desc.N0_;
-        index_t N1 = d_e_grid_desc.N1_;
-
-        index_t stride_M0 = d_e_grid_desc.stride_M0_;
-        index_t stride_M1 = d_e_grid_desc.stride_M1_;
-        index_t stride_M2 = d_e_grid_desc.stride_M2_;
-        index_t stride_N0 = d_e_grid_desc.stride_N0_;
-        index_t stride_N1 = d_e_grid_desc.stride_N1_;
-
-        const auto e_grid_desc_mraw_nraw = [&]() {
-            const auto e_grid_desc_m0_m1_m2_n0_n1 = make_naive_tensor_descriptor(
-                make_tuple(M0, M1, M2, N0, N1),
-                make_tuple(stride_M0, stride_M1, stride_M2, stride_N0, stride_N1));
-
-            return transform_tensor_descriptor(
-                e_grid_desc_m0_m1_m2_n0_n1,
-                make_tuple(make_merge_transform(make_tuple(M0, M1, M2)),
-                           make_merge_transform(make_tuple(N0, N1))),
-                make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }();
-
-        return matrix_padder.PadCDescriptor_M_N(e_grid_desc_mraw_nraw);
-    }
-
-    using AGridDesc_M_K = decltype(MakeAGridDescriptor_M_K(1, 1, 1));
-    using BGridDesc_N_K = decltype(MakeBGridDescriptor_N_K(1, 1, 1));
-    using EGridDesc_M_N = decltype(MakeEGridDescriptor_M_N(DEGridDesc_M0_M1_M2_N0_N1{}));
-
-    using DsGridDesc_M_N = Tuple<EGridDesc_M_N>;
-
-    // GridwiseGemm
-    using GridwiseGemm = GridwiseGemmMultipleD_xdl_cshuffle<
-        ADataType, // TODO: distinguish A/B datatype
-        AccDataType,
-        CShuffleDataType,
-        ck::Tuple<DDataType>,
-        EDataType,
-        AElementwiseOperation,
-        BElementwiseOperation,
-        CDEElementwiseOperation,
-        InMemoryDataOperationEnum::Set,
-        NumGemmKPrefetchStage,
-        BlockSize,
-        MPerBlock,
-        NPerBlock,
-        KPerBlock,
-        AK1,
-        BK1,
-        MPerXDL,
-        NPerXDL,
-        MXdlPerWave,
-        NXdlPerWave,
-        ABlockTransferThreadClusterLengths_AK0_M_AK1,
-        ABlockTransferThreadClusterArrangeOrder,
-        ABlockTransferSrcAccessOrder,
-        ABlockTransferSrcVectorDim,
-        ABlockTransferSrcScalarPerVector,
-        ABlockTransferDstScalarPerVector_AK1,
-        false,
-        ABlockLdsExtraM,
-        BBlockTransferThreadClusterLengths_BK0_N_BK1,
-        BBlockTransferThreadClusterArrangeOrder,
-        BBlockTransferSrcAccessOrder,
-        BBlockTransferSrcVectorDim,
-        BBlockTransferSrcScalarPerVector,
-        BBlockTransferDstScalarPerVector_BK1,
-        false,
-        BBlockLdsExtraN,
-        CShuffleMXdlPerWavePerShuffle,
-        CShuffleNXdlPerWavePerShuffle,
-        CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
-        CDEBlockTransferScalarPerVector_NPerBlock,
-        LoopSched>;
-
-    using AGridDesc_AK0_M_AK1 = remove_cvref_t<decltype(
-        GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(AGridDesc_M_K{}))>;
-    using BGridDesc_BK0_N_BK1 = remove_cvref_t<decltype(
-        GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(BGridDesc_N_K{}))>;
-
-    using Block2ETileMap = typename GridwiseGemm::DefaultBlock2ETileMap;
-
-    // Argument
-    struct Argument : public BaseArgument
-    {
-        Argument(const void* p_a_grid,
-                 const void* p_b_grid,
-                 const void* p_d_grid,
-                 void* p_e_grid,
-                 index_t MRaw,
-                 index_t NRaw,
-                 index_t KRaw,
-                 index_t StrideA,
-                 index_t StrideB,
-                 DEGridDesc_M0_M1_M2_N0_N1 d_grid_desc,
-                 DEGridDesc_M0_M1_M2_N0_N1 e_grid_desc,
-                 AElementwiseOperation a_element_op,
-                 BElementwiseOperation b_element_op,
-                 CDEElementwiseOperation cde_element_op)
-            : p_a_grid_{static_cast<const ADataType*>(p_a_grid)},
-              p_b_grid_{static_cast<const BDataType*>(p_b_grid)},
-              p_ds_grid_{},
-              p_e_grid_{static_cast<EDataType*>(p_e_grid)},
-              a_grid_desc_m_k_{DeviceOp::MakeAGridDescriptor_M_K(MRaw, KRaw, StrideA)},
-              b_grid_desc_n_k_{DeviceOp::MakeBGridDescriptor_N_K(KRaw, NRaw, StrideB)},
-              ds_grid_desc_m_n_{},
-              e_grid_desc_m_n_{DeviceOp::MakeEGridDescriptor_M_N(e_grid_desc)},
-              a_grid_desc_ak0_m_ak1_{
-                  GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k_)},
-              b_grid_desc_bk0_n_bk1_{
-                  GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k_)},
-              ds_grid_desc_mblock_mperblock_nblock_nperblock_{},
-              e_grid_desc_mblock_mperblock_nblock_nperblock_{},
-              block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)},
-              a_element_op_{a_element_op},
-              b_element_op_{b_element_op},
-              cde_element_op_{cde_element_op}
-        {
-
-            if(MRaw != d_grid_desc.M0_ * d_grid_desc.M1_ * d_grid_desc.M2_)
-            {
-                throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
-            }
-
-            if(NRaw != d_grid_desc.N0_ * d_grid_desc.N1_)
-            {
-                throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
-            }
-
-            // populate pointer, desc for Ds
-            // D pointer
-            p_ds_grid_(I0) = static_cast<const DDataType*>(p_d_grid);
-
-            // D desc
-            ds_grid_desc_m_n_(I0) = DeviceOp::MakeEGridDescriptor_M_N(d_grid_desc);
-
-            if(GridwiseGemm::CheckValidity(a_grid_desc_m_k_,
-                                           b_grid_desc_n_k_,
-                                           ds_grid_desc_m_n_,
-                                           e_grid_desc_m_n_,
-                                           block_2_etile_map_))
-            {
-                e_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                    GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                        e_grid_desc_m_n_);
-
-                ds_grid_desc_mblock_mperblock_nblock_nperblock_(I0) =
-                    GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                        ds_grid_desc_m_n_[I0]);
-            }
-        }
-
-        //  private:
-        // pointers
-        const ADataType* p_a_grid_;
-        const BDataType* p_b_grid_;
-        typename GridwiseGemm::DsGridPointer p_ds_grid_;
-        EDataType* p_e_grid_;
-
-        // tensor descriptors for problem definiton
-        AGridDesc_M_K a_grid_desc_m_k_;
-        BGridDesc_N_K b_grid_desc_n_k_;
-        DsGridDesc_M_N ds_grid_desc_m_n_;
-        EGridDesc_M_N e_grid_desc_m_n_;
-
-        // tensor descriptors for block/thread-wise copy
-        AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_;
-        BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_;
-        typename GridwiseGemm::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-            ds_grid_desc_mblock_mperblock_nblock_nperblock_;
-        typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-            e_grid_desc_mblock_mperblock_nblock_nperblock_;
-
-        // block-to-e-tile map
-        Block2ETileMap block_2_etile_map_;
-
-        // element-wise op
-        AElementwiseOperation a_element_op_;
-        BElementwiseOperation b_element_op_;
-        CDEElementwiseOperation cde_element_op_;
-    };
-
-    // Invoker
-    struct Invoker : public BaseInvoker
-    {
-        using Argument = DeviceOp::Argument;
-
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
-        {
-            if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_,
-                                            arg.b_grid_desc_n_k_,
-                                            arg.ds_grid_desc_m_n_,
-                                            arg.e_grid_desc_m_n_,
-                                            arg.block_2_etile_map_))
-            {
-                throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
-            }
-
-            const index_t grid_size =
-                arg.block_2_etile_map_.CalculateGridSize(arg.e_grid_desc_m_n_);
-
-            const auto K =
-                arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2);
-
-            auto launch_kernel = [&](auto has_main_k_block_loop) {
-                constexpr bool has_main_loop = has_main_k_block_loop.value;
-
-                const auto kernel = kernel_gemm_bias_e_permute<
-                    GridwiseGemm,
-                    ADataType, // TODO: distiguish A/B datatype
-                    typename GridwiseGemm::DsGridPointer,
-                    EDataType,
-                    AElementwiseOperation,
-                    BElementwiseOperation,
-                    CDEElementwiseOperation,
-                    DeviceOp::AGridDesc_AK0_M_AK1,
-                    DeviceOp::BGridDesc_BK0_N_BK1,
-                    typename GridwiseGemm::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
-                    typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
-                    typename GridwiseGemm::DefaultBlock2ETileMap,
-                    has_main_loop>;
-
-                return launch_and_time_kernel(stream_config,
-                                              kernel,
-                                              dim3(grid_size),
-                                              dim3(BlockSize),
-                                              0,
-                                              arg.p_a_grid_,
-                                              arg.p_b_grid_,
-                                              arg.p_ds_grid_,
-                                              arg.p_e_grid_,
-                                              arg.a_element_op_,
-                                              arg.b_element_op_,
-                                              arg.cde_element_op_,
-                                              arg.a_grid_desc_ak0_m_ak1_,
-                                              arg.b_grid_desc_bk0_n_bk1_,
-                                              arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                              arg.e_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                              arg.block_2_etile_map_);
-            };
-
-            if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
-            {
-                return launch_kernel(integral_constant<bool, true>{});
-            }
-            else
-            {
-                return launch_kernel(integral_constant<bool, false>{});
-            }
-        }
-
-        // polymorphic
-        float Run(const BaseArgument* p_arg,
-                  const StreamConfig& stream_config = StreamConfig{}) override
-        {
-            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
-        }
-    };
-
-    static bool IsSupportedArgument(const Argument& arg)
-    {
-        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
-             ck::get_device_name() == "gfx940"))
-        {
-            return false;
-        }
-
-        return GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_,
-                                           arg.b_grid_desc_n_k_,
-                                           arg.ds_grid_desc_m_n_,
-                                           arg.e_grid_desc_m_n_,
-                                           arg.block_2_etile_map_);
-    }
-
-    // polymorphic
-    bool IsSupportedArgument(const BaseArgument* p_arg) override
-    {
-        return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
-    }
-
-    static auto MakeArgument(const void* p_a,
-                             const void* p_b,
-                             const void* p_d,
-                             void* p_e,
-                             index_t MRaw,
-                             index_t NRaw,
-                             index_t KRaw,
-                             index_t StrideA,
-                             index_t StrideB,
-                             DEGridDesc_M0_M1_M2_N0_N1 d_grid_desc,
-                             DEGridDesc_M0_M1_M2_N0_N1 e_grid_desc,
-                             AElementwiseOperation a_element_op,
-                             BElementwiseOperation b_element_op,
-                             CDEElementwiseOperation cde_element_op)
-    {
-        return Argument{p_a,
-                        p_b,
-                        p_d,
-                        p_e,
-                        MRaw,
-                        NRaw,
-                        KRaw,
-                        StrideA,
-                        StrideB,
-                        d_grid_desc,
-                        e_grid_desc,
-                        a_element_op,
-                        b_element_op,
-                        cde_element_op};
-    }
-
-    static auto MakeInvoker() { return Invoker{}; }
-
-    // polymorphic
-    std::unique_ptr<BaseArgument>
-    MakeArgumentPointer(const void* p_a,
-                        const void* p_b,
-                        const void* p_d,
-                        void* p_e,
-                        index_t MRaw,
-                        index_t NRaw,
-                        index_t KRaw,
-                        index_t StrideA,
-                        index_t StrideB,
-                        DEGridDesc_M0_M1_M2_N0_N1 d_grid_desc,
-                        DEGridDesc_M0_M1_M2_N0_N1 e_grid_desc,
-                        AElementwiseOperation a_element_op,
-                        BElementwiseOperation b_element_op,
-                        CDEElementwiseOperation cde_element_op) override
-    {
-        return std::make_unique<Argument>(p_a,
-                                          p_b,
-                                          p_d,
-                                          p_e,
-                                          MRaw,
-                                          NRaw,
-                                          KRaw,
-                                          StrideA,
-                                          StrideB,
-                                          d_grid_desc,
-                                          e_grid_desc,
-                                          a_element_op,
-                                          b_element_op,
-                                          cde_element_op);
-    }
-
-    // polymorphic
-    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
-    {
-        return std::make_unique<Invoker>(Invoker{});
-    }
-
-    // polymorphic
-    std::string GetTypeString() const override
-    {
-        auto str = std::stringstream();
-
-        // clang-format off
-        str << "DeviceGemmBiasEPermute_Xdl"
-            << "<"
-            << BlockSize << ", "
-            << MPerBlock << ", "
-            << NPerBlock << ", "
-            << KPerBlock << ", "
-            << AK1 << ", "
-            << BK1 << ", "
-            << K1 << ", "
-            << MPerXDL << ", "
-            << NPerXDL << ", "
-            << MXdlPerWave << ", "
-            << NXdlPerWave << ", "
-            << ABlockTransferSrcScalarPerVector << ", "
-            << ABlockTransferDstScalarPerVector_K1 << ", "
-            << BBlockTransferSrcScalarPerVector << ", "
-            << BBlockTransferDstScalarPerVector_K1 << ", "
-            << CShuffleMXdlPerWavePerShuffle << ", "
-            << CShuffleNXdlPerWavePerShuffle << ", "
-            << CBlockTransferScalarPerVector_NWaveNPerXdl
-            << ">";
-        // clang-format on
-
-        return str.str();
-    }
-};
-
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_contraction_dlops_v1r2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_contraction_dlops_v1r2.hpp
deleted file mode 100644
index 2369f5179..000000000
--- a/include/ck/tensor_operation/gpu/grid/gridwise_contraction_dlops_v1r2.hpp
+++ /dev/null
@@ -1,662 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#ifndef CK_GRIDWISE_CONTRACTION_DLOPS_V1R2_HPP
-#define CK_GRIDWISE_CONTRACTION_DLOPS_V1R2_HPP
-
-#include "common_header.hpp"
-#include "multi_index_transform_helper.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-#include "blockwise_gemm_dlops_v2r3.hpp"
-#include "blockwise_tensor_slice_transfer_v2.hpp"
-#include "threadwise_tensor_slice_transfer.hpp"
-#include "threadwise_tensor_slice_set.hpp"
-
-namespace ck {
-
-template <typename GridwiseContraction,
-          typename FloatAB,
-          typename FloatC,
-          typename AGridDesc_GK0_GM0_GM10_GM11_GK1,
-          typename BGridDesc_GK0_GN0_GN10_GN11_GK1,
-          typename CGridDesc_GM10_BM0_BM1_GN10_BN0_BN1,
-          typename CGridBlockCluster_BlockId_To_GM10_GN10,
-          bool HasMainKBlockLoop,
-          bool HasDoubleTailKBlockLoop>
-__global__ void
-#if CK_USE_LAUNCH_BOUNDS
-    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
-#endif
-        kernel_contraction_dlops_v1r2(
-            const FloatAB* __restrict__ p_a_grid,
-            const FloatAB* __restrict__ p_b_grid,
-            FloatC* __restrict__ p_c_grid,
-            const AGridDesc_GK0_GM0_GM10_GM11_GK1 a_grid_desc_gk0_gm0_gm10_gm11_gk1,
-            const BGridDesc_GK0_GN0_GN10_GN11_GK1 b_grid_desc_gk0_gn0_gn10_gn11_gk1,
-            const CGridDesc_GM10_BM0_BM1_GN10_BN0_BN1 c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1,
-            const CGridBlockCluster_BlockId_To_GM10_GN10 c_grid_block_cluster_blockid_to_gm10_gn10)
-{
-    constexpr index_t shared_block_size =
-        GridwiseContraction::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
-
-    __shared__ FloatAB p_shared_block[shared_block_size];
-
-    GridwiseContraction::Run(p_a_grid,
-                             p_b_grid,
-                             p_c_grid,
-                             p_shared_block,
-                             a_grid_desc_gk0_gm0_gm10_gm11_gk1,
-                             b_grid_desc_gk0_gn0_gn10_gn11_gk1,
-                             c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1,
-                             c_grid_block_cluster_blockid_to_gm10_gn10,
-                             integral_constant<bool, HasMainKBlockLoop>{},
-                             integral_constant<bool, HasDoubleTailKBlockLoop>{});
-}
-
-template <index_t BlockSize,
-          typename FloatAB,
-          typename FloatAcc,
-          typename FloatC,
-          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
-          typename AGridDesc_GK0_GM0_GM1_GK1,
-          typename BGridDesc_GK0_GN0_GN1_GK1,
-          typename CGridDesc_GM0_GM1_GN0_GN1,
-          index_t GM1PerBlockGM11,
-          index_t GN1PerBlockGN11,
-          index_t GK0PerBlock,
-          index_t BM1PerThreadBM11,
-          index_t BN1PerThreadBN11,
-          index_t BK0PerThread,
-          typename BM10BN10ThreadClusterBM10Xs,
-          typename BM10BN10ThreadClusterBN10Xs,
-          typename ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1,
-          typename ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1,
-          typename ABlockTransferThreadClusterArrangeOrder,
-          typename ABlockTransferSrcAccessOrder,
-          typename ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
-          typename ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
-          typename ABlockTransferSrcVectorTensorContiguousDimOrder,
-          typename BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1,
-          typename BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1,
-          typename BBlockTransferThreadClusterArrangeOrder,
-          typename BBlockTransferSrcAccessOrder,
-          typename BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
-          typename BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
-          typename BBlockTransferSrcVectorTensorContiguousDimOrder,
-          typename CThreadTransferSrcDstAccessOrder,
-          index_t CThreadTransferSrcDstVectorDim,
-          index_t CThreadTransferDstScalarPerVector,
-          typename AGridStepHacks,
-          typename BGridStepHacks,
-          typename CGridStepHacks,
-          typename AGridMoveSliceWindowStepHacks,
-          typename BGridMoveSliceWindowStepHacks>
-struct GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN0_GN1
-{
-    static constexpr auto I0 = Number<0>{};
-    static constexpr auto I1 = Number<1>{};
-    static constexpr auto I2 = Number<2>{};
-    static constexpr auto I3 = Number<3>{};
-
-    // GM0 and GN0 need to known at compile-time
-    static constexpr auto GM0 = CGridDesc_GM0_GM1_GN0_GN1{}.GetLength(I0);
-    static constexpr auto GN0 = CGridDesc_GM0_GM1_GN0_GN1{}.GetLength(I2);
-    static constexpr auto GK1 = AGridDesc_GK0_GM0_GM1_GK1{}.GetLength(I3);
-
-    __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
-    {
-        // lds max alignment
-        // TODO: part of them should be moved into blockwise-gemm
-        // TODO: change this. I think it needs multi-dimensional alignment
-        constexpr auto max_lds_align = GK1;
-
-        // A matrix in LDS memory, dst of blockwise copy
-        //   be careful of LDS alignment
-        constexpr auto a_block_desc_gk0_gm0_gm10_gm11_gk1 = make_naive_tensor_descriptor_aligned(
-            make_tuple(Number<GK0PerBlock>{}, GM0, I1, Number<GM1PerBlockGM11>{}, GK1),
-            max_lds_align);
-
-        // B matrix in LDS memory, dst of blockwise copy
-        //   be careful of LDS alignment
-        constexpr auto b_block_desc_gk0_gn0_gn10_gn11_gk1 = make_naive_tensor_descriptor_aligned(
-            make_tuple(Number<GK0PerBlock>{}, GN0, I1, Number<GN1PerBlockGN11>{}, GK1),
-            max_lds_align);
-
-        // LDS allocation for A and B: be careful of alignment
-        constexpr auto a_block_aligned_space_size = math::integer_least_multiple(
-            a_block_desc_gk0_gm0_gm10_gm11_gk1.GetElementSpaceSize(), max_lds_align);
-
-        constexpr auto b_block_aligned_space_size = math::integer_least_multiple(
-            b_block_desc_gk0_gn0_gn10_gn11_gk1.GetElementSpaceSize(), max_lds_align);
-
-        return 2 * (a_block_aligned_space_size + b_block_aligned_space_size) * sizeof(FloatAB);
-    }
-
-    __host__ __device__ static constexpr bool
-    CheckValidity(const AGridDesc_GK0_GM0_GM1_GK1& a_grid_desc_gk0_gm0_gm1_gk1,
-                  const BGridDesc_GK0_GN0_GN1_GK1& b_grid_desc_gk0_gn0_gn1_gk1,
-                  const CGridDesc_GM0_GM1_GN0_GN1& c_grid_desc_gm0_gm1_gn0_gn1)
-    {
-        static_assert(is_known_at_compile_time<remove_cv_t<decltype(GM0)>>::value &&
-                          is_known_at_compile_time<remove_cv_t<decltype(GN0)>>::value,
-                      "wrong! GM0 and GN0 need to be known at compile-time");
-
-        const auto GM1 = a_grid_desc_gk0_gm0_gm1_gk1.GetLength(I2);
-        const auto GN1 = b_grid_desc_gk0_gn0_gn1_gk1.GetLength(I2);
-        const auto GK0 = a_grid_desc_gk0_gm0_gm1_gk1.GetLength(I0);
-
-        // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
-
-        return (
-            (GM0 == c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I0) &&
-             GM1 == c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I1) &&
-             GN0 == c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I2) &&
-             GN1 == c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I3) &&
-             GM0 == a_grid_desc_gk0_gm0_gm1_gk1.GetLength(I1) &&
-             GM1 == a_grid_desc_gk0_gm0_gm1_gk1.GetLength(I2) &&
-             GN0 == b_grid_desc_gk0_gn0_gn1_gk1.GetLength(I1) &&
-             GN1 == b_grid_desc_gk0_gn0_gn1_gk1.GetLength(I2) &&
-             GK0 == b_grid_desc_gk0_gn0_gn1_gk1.GetLength(I0) &&
-             GK1 == b_grid_desc_gk0_gn0_gn1_gk1.GetLength(I3)) &&
-            (GM1 % GM1PerBlockGM11 == 0 && GN1 % GN1PerBlockGN11 == 0 && GK0 % GK0PerBlock == 0));
-    }
-
-    __host__ __device__ static constexpr index_t
-    CalculateGridSize(const CGridDesc_GM0_GM1_GN0_GN1& c_grid_desc_gm0_gm1_gn0_gn1)
-    {
-        const auto GM1 = c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I1);
-        const auto GN1 = c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I3);
-
-        constexpr index_t GM11 = GM1PerBlockGM11;
-        constexpr index_t GN11 = GN1PerBlockGN11;
-
-        const index_t GM10 = GM1 / GM11;
-        const index_t GN10 = GN1 / GN11;
-
-        const index_t grid_size = GM10 * GN10;
-
-        return grid_size;
-    }
-
-    __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t GK0)
-    {
-        const bool has_main_k_block_loop = (GK0 + GK0PerBlock) / (2 * GK0PerBlock) > 1;
-
-        return has_main_k_block_loop;
-    }
-
-    __host__ __device__ static constexpr bool CalculateHasDoubleTailKBlockLoop(index_t GK0)
-    {
-        const bool has_double_tail_k_block_loop = (GK0 / GK0PerBlock) % 2 == 0;
-
-        return has_double_tail_k_block_loop;
-    }
-
-    __host__ __device__ static constexpr auto MakeAGridDescriptor_GK0_GM0_GM10_GM11_GK1(
-        const AGridDesc_GK0_GM0_GM1_GK1& a_grid_desc_gk0_gm0_gm1_gk1)
-    {
-        const auto GK0 = a_grid_desc_gk0_gm0_gm1_gk1.GetLength(I0);
-        const auto GM1 = a_grid_desc_gk0_gm0_gm1_gk1.GetLength(I2);
-
-        const auto GM11 = Number<GM1PerBlockGM11>{};
-        const auto GM10 = GM1 / GM11;
-
-        const auto a_grid_desc_gk0_gm0_gm10_gm11_gk1 = transform_tensor_descriptor(
-            a_grid_desc_gk0_gm0_gm1_gk1,
-            make_tuple(make_pass_through_transform(GK0),
-                       make_pass_through_transform(GM0),
-                       make_unmerge_transform(make_tuple(GM10, GM11)),
-                       make_pass_through_transform(GK1)),
-            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{}));
-
-        return a_grid_desc_gk0_gm0_gm10_gm11_gk1;
-    }
-
-    __host__ __device__ static constexpr auto MakeBGridDescriptor_GK0_GN0_GN10_GN11_GK1(
-        const BGridDesc_GK0_GN0_GN1_GK1& b_grid_desc_gk0_gn0_gn1_gk1)
-    {
-        const auto GK0 = b_grid_desc_gk0_gn0_gn1_gk1.GetLength(I0);
-        const auto GN1 = b_grid_desc_gk0_gn0_gn1_gk1.GetLength(I2);
-
-        const auto GN11 = Number<GN1PerBlockGN11>{};
-        const auto GN10 = GN1 / GN11;
-
-        const auto b_grid_desc_gk0_gn0_gn10_gn11_gk1 = transform_tensor_descriptor(
-            b_grid_desc_gk0_gn0_gn1_gk1,
-            make_tuple(make_pass_through_transform(GK0),
-                       make_pass_through_transform(GN0),
-                       make_unmerge_transform(make_tuple(GN10, GN11)),
-                       make_pass_through_transform(GK1)),
-            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{}));
-
-        return b_grid_desc_gk0_gn0_gn10_gn11_gk1;
-    }
-
-    __host__ __device__ static constexpr auto MakeCGridDescriptor_GM10_BM0_BM1_GN10_BN0_BN1(
-        const CGridDesc_GM0_GM1_GN0_GN1& c_grid_desc_gm0_gm1_gn0_gn1)
-    {
-        const auto GM1 = c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I1);
-        const auto GN1 = c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I3);
-
-        constexpr auto GM11 = Number<GM1PerBlockGM11>{};
-        constexpr auto GN11 = Number<GN1PerBlockGN11>{};
-
-        const auto GM10 = GM1 / GM11;
-        const auto GN10 = GN1 / GN11;
-
-        constexpr auto BM = GM0 * GM11;
-        constexpr auto BN = GN0 * GN11;
-
-        constexpr auto BM1 =
-            Number<container_reduce(BM10BN10ThreadClusterBM10Xs{}, math::multiplies{}, I1) *
-                   BM1PerThreadBM11>{};
-        constexpr auto BN1 =
-            Number<container_reduce(BM10BN10ThreadClusterBN10Xs{}, math::multiplies{}, I1) *
-                   BN1PerThreadBN11>{};
-
-        constexpr auto BM0 = BM / BM1;
-        constexpr auto BN0 = BN / BN1;
-
-        const auto c_gm0_gm10_gm11_gn0_gn10_gn11_grid_desc = transform_tensor_descriptor(
-            c_grid_desc_gm0_gm1_gn0_gn1,
-            make_tuple(make_pass_through_transform(GM0),
-                       make_unmerge_transform(make_tuple(GM10, GM11)),
-                       make_pass_through_transform(GN0),
-                       make_unmerge_transform(make_tuple(GN10, GN11))),
-            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-            make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}, Sequence<4, 5>{}));
-
-        const auto c_gm10_bm_gn10_bn_grid_desc = transform_tensor_descriptor(
-            c_gm0_gm10_gm11_gn0_gn10_gn11_grid_desc,
-            make_tuple(make_pass_through_transform(GM10),
-                       make_merge_transform(make_tuple(GM0, GM11)),
-                       make_pass_through_transform(GN10),
-                       make_merge_transform(make_tuple(GN0, GN11))),
-            make_tuple(Sequence<1>{}, Sequence<0, 2>{}, Sequence<4>{}, Sequence<3, 5>{}),
-            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
-
-        const auto c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1 = transform_tensor_descriptor(
-            c_gm10_bm_gn10_bn_grid_desc,
-            make_tuple(make_pass_through_transform(GM10),
-                       make_unmerge_transform(make_tuple(BM0, BM1)),
-                       make_pass_through_transform(GN10),
-                       make_unmerge_transform(make_tuple(BN0, BN1))),
-            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-            make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}, Sequence<4, 5>{}));
-
-        return c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1;
-    }
-
-    __host__ __device__ static constexpr auto MakeCGridBlockCluster_BlockId_To_GM10_GN10(
-        const CGridDesc_GM0_GM1_GN0_GN1& c_grid_desc_gm0_gm1_gn0_gn1)
-    {
-        const auto GM1 = c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I1);
-        const auto GN1 = c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I3);
-
-        constexpr auto GM11 = Number<GM1PerBlockGM11>{};
-        constexpr auto GN11 = Number<GN1PerBlockGN11>{};
-
-        const auto GM10 = GM1 / GM11;
-        const auto GN10 = GN1 / GN11;
-
-        const auto c_grid_block_cluster_blockid_to_gm10_gn10 = make_single_stage_tensor_adaptor(
-            make_tuple(make_merge_transform(make_tuple(GM10, GN10))),
-            make_tuple(Sequence<0, 1>{}),
-            make_tuple(Sequence<0>{}));
-
-        return c_grid_block_cluster_blockid_to_gm10_gn10;
-    }
-
-    using AGridDesc_GK0_GM0_GM10_GM11_GK1 =
-        decltype(MakeAGridDescriptor_GK0_GM0_GM10_GM11_GK1(AGridDesc_GK0_GM0_GM1_GK1{}));
-    using BGridDesc_GK0_GN0_GN10_GN11_GK1 =
-        decltype(MakeBGridDescriptor_GK0_GN0_GN10_GN11_GK1(BGridDesc_GK0_GN0_GN1_GK1{}));
-    using CGridDesc_GM10_BM0_BM1_GN10_BN0_BN1 =
-        decltype(MakeCGridDescriptor_GM10_BM0_BM1_GN10_BN0_BN1(CGridDesc_GM0_GM1_GN0_GN1{}));
-    using CGridBlockCluster_BlockId_To_GM10_GN10 =
-        decltype(MakeCGridBlockCluster_BlockId_To_GM10_GN10(CGridDesc_GM0_GM1_GN0_GN1{}));
-
-    template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
-    __device__ static void
-    Run(const FloatAB* __restrict__ p_a_grid,
-        const FloatAB* __restrict__ p_b_grid,
-        FloatC* __restrict__ p_c_grid,
-        FloatAB* __restrict__ p_shared_block,
-        const AGridDesc_GK0_GM0_GM10_GM11_GK1& a_grid_desc_gk0_gm0_gm10_gm11_gk1,
-        const BGridDesc_GK0_GN0_GN10_GN11_GK1& b_grid_desc_gk0_gn0_gn10_gn11_gk1,
-        const CGridDesc_GM10_BM0_BM1_GN10_BN0_BN1& c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1,
-        const CGridBlockCluster_BlockId_To_GM10_GN10& c_grid_block_cluster_blockid_to_gm10_gn10,
-        integral_constant<bool, HasMainKBlockLoop>,
-        integral_constant<bool, HasDoubleTailKBlockLoop>)
-    {
-        const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_a_grid, a_grid_desc_gk0_gm0_gm10_gm11_gk1.GetElementSpaceSize());
-        const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_b_grid, b_grid_desc_gk0_gn0_gn10_gn11_gk1.GetElementSpaceSize());
-        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_c_grid, c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1.GetElementSpaceSize());
-
-        const auto GK0 = a_grid_desc_gk0_gm0_gm10_gm11_gk1.GetLength(I0);
-
-        // divide block work by [GM10, GN10]
-        const auto c_gm10_gn10_block_cluster_idx =
-            c_grid_block_cluster_blockid_to_gm10_gn10.CalculateBottomIndex(
-                make_multi_index(get_block_1d_id()));
-
-        // HACK: this force index data into SGPR
-        const index_t igm10 = __builtin_amdgcn_readfirstlane(c_gm10_gn10_block_cluster_idx[I0]);
-        const index_t ign10 = __builtin_amdgcn_readfirstlane(c_gm10_gn10_block_cluster_idx[I1]);
-
-        // lds max alignment
-        // TODO: part of them should be moved into blockwise-gemm
-        // TODO: change this. I think it needs multi-dimensional alignment
-        constexpr auto max_lds_align = GK1;
-
-        // A matrix in LDS memory, dst of blockwise copy
-        //   be careful of LDS alignment
-        constexpr auto a_block_desc_gk0_gm0_gm10_gm11_gk1 = make_naive_tensor_descriptor_aligned(
-            make_tuple(Number<GK0PerBlock>{}, GM0, I1, Number<GM1PerBlockGM11>{}, GK1),
-            max_lds_align);
-
-        // B matrix in LDS memory, dst of blockwise copy
-        //   be careful of LDS alignment
-        constexpr auto b_block_desc_gk0_gn0_gn10_gn11_gk1 = make_naive_tensor_descriptor_aligned(
-            make_tuple(Number<GK0PerBlock>{}, GN0, I1, Number<GN1PerBlockGN11>{}, GK1),
-            max_lds_align);
-
-        // A matrix in LDS memory for blockwise GEMM
-        //   be careful of LDS alignment
-        constexpr auto a_block_desc_gk0_bm_gk1 = make_naive_tensor_descriptor_aligned(
-            make_tuple(Number<GK0PerBlock>{}, GM0 * Number<GM1PerBlockGM11>{}, GK1), max_lds_align);
-
-        // B matrix in LDS memory for blockwise GEMM
-        //   be careful of LDS alignment
-        constexpr auto b_block_desc_gk0_bn_gk1 = make_naive_tensor_descriptor_aligned(
-            make_tuple(Number<GK0PerBlock>{}, GN0 * Number<GN1PerBlockGN11>{}, GK1), max_lds_align);
-
-        static_assert(a_block_desc_gk0_gm0_gm10_gm11_gk1.GetElementSpaceSize() ==
-                              a_block_desc_gk0_bm_gk1.GetElementSpaceSize() &&
-                          b_block_desc_gk0_gn0_gn10_gn11_gk1.GetElementSpaceSize() ==
-                              b_block_desc_gk0_bn_gk1.GetElementSpaceSize(),
-                      "wrong!");
-
-        // A matrix blockwise copy
-        auto a_blockwise_copy = BlockwiseTensorSliceTransfer_v5r1<
-            BlockSize,
-            InMemoryDataOperationEnum::Set,
-            Sequence<GK0PerBlock, GM0, 1, GM1PerBlockGM11, GK1.value>,
-            ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1,
-            ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1,
-            ABlockTransferThreadClusterArrangeOrder,
-            FloatAB,
-            FloatAB,
-            decltype(a_grid_desc_gk0_gm0_gm10_gm11_gk1),
-            decltype(a_block_desc_gk0_gm0_gm10_gm11_gk1),
-            ABlockTransferSrcAccessOrder,
-            Sequence<0, 1, 2, 3, 4>,
-            ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1, // SrcVectorTensorLengths
-            ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1, // DstVectorTensorLengths
-            ABlockTransferSrcVectorTensorContiguousDimOrder, // SrcVectorTensorContiguousDimOrder
-            Sequence<0, 1, 2, 3, 4>,                         // DstVectorTensorContiguousDimOrder
-            false,
-            true>(a_grid_desc_gk0_gm0_gm10_gm11_gk1,
-                  make_multi_index(0, 0, igm10, 0, 0),
-                  a_block_desc_gk0_gm0_gm10_gm11_gk1,
-                  make_multi_index(0, 0, 0, 0, 0));
-
-        // B matrix blockwise copy
-        auto b_blockwise_copy = BlockwiseTensorSliceTransfer_v5r1<
-            BlockSize,
-            InMemoryDataOperationEnum::Set,
-            Sequence<GK0PerBlock, GN0, 1, GN1PerBlockGN11, GK1.value>,
-            BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1,
-            BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1,
-            BBlockTransferThreadClusterArrangeOrder,
-            FloatAB,
-            FloatAB,
-            decltype(b_grid_desc_gk0_gn0_gn10_gn11_gk1),
-            decltype(b_block_desc_gk0_gn0_gn10_gn11_gk1),
-            BBlockTransferSrcAccessOrder,
-            Sequence<0, 1, 2, 3, 4>,
-            BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1, // SrcVectorTensorLengths
-            BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1, // DstVectorTensorLengths
-            BBlockTransferSrcVectorTensorContiguousDimOrder, // SrcVectorTensorContiguousDimOrder
-            Sequence<0, 1, 2, 3, 4>,                         // DstVectorTensorContiguousDimOrder
-            false,
-            true>(b_grid_desc_gk0_gn0_gn10_gn11_gk1,
-                  make_multi_index(0, 0, ign10, 0, 0),
-                  b_block_desc_gk0_gn0_gn10_gn11_gk1,
-                  make_multi_index(0, 0, 0, 0, 0));
-
-        // GEMM definition
-        //   c_mtx += transpose(a_mtx) * b_mtx
-        //     a_mtx[GK0PerBlock, GM1PerBlockGM11] is in LDS
-        //     b_mtx[KPerBlocl, GN1PerBlockGN11] is in LDS
-        //     c_mtx[GM1PerBlockGM11, GN1PerBlockGN11] is distributed among threads, and saved in
-        //       register
-        const auto blockwise_gemm =
-            BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2<
-                BlockSize,
-                FloatAB,
-                FloatAB,
-                FloatAcc,
-                decltype(a_block_desc_gk0_bm_gk1),
-                decltype(b_block_desc_gk0_bn_gk1),
-                BM1PerThreadBM11,
-                BN1PerThreadBN11,
-                BK0PerThread,
-                BM10BN10ThreadClusterBM10Xs,
-                BM10BN10ThreadClusterBN10Xs,
-                BM1PerThreadBM11,
-                BN1PerThreadBN11>{};
-
-        constexpr auto c_thread_tensor_lengths_bm0_bm1_bn0_bn1 =
-            decltype(blockwise_gemm)::GetCThreadTensorLengths_BM0_BM1_BN0_BN1();
-
-        constexpr auto c_thread_desc_bm0_bm1_bn0_bn1 = make_naive_tensor_descriptor_packed(
-            sequence_to_tuple_of_number(c_thread_tensor_lengths_bm0_bm1_bn0_bn1));
-
-        // LDS allocation for A and B: be careful of alignment
-        constexpr auto a_block_aligned_space_size = math::integer_least_multiple(
-            a_block_desc_gk0_gm0_gm10_gm11_gk1.GetElementSpaceSize(), max_lds_align);
-
-        constexpr auto b_block_aligned_space_size = math::integer_least_multiple(
-            b_block_desc_gk0_gn0_gn10_gn11_gk1.GetElementSpaceSize(), max_lds_align);
-
-        FloatAB* p_a_block_double = p_shared_block;
-        FloatAB* p_b_block_double = p_shared_block + 2 * a_block_aligned_space_size;
-
-        // register allocation for output
-        auto c_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatAcc>(
-            c_thread_desc_bm0_bm1_bn0_bn1.GetElementSpaceSize());
-
-        ThreadwiseTensorSliceSet_v1<FloatAcc,
-                                    decltype(c_thread_desc_bm0_bm1_bn0_bn1),
-                                    decltype(c_thread_tensor_lengths_bm0_bm1_bn0_bn1)>{}
-            .Run(c_thread_desc_bm0_bm1_bn0_bn1,
-                 make_tuple(I0, I0, I0, I0),
-                 c_thread_buf,
-                 FloatAcc{0});
-
-        constexpr auto a_block_slice_copy_step = make_multi_index(GK0PerBlock, 0, 0, 0, 0);
-        constexpr auto b_block_slice_copy_step = make_multi_index(GK0PerBlock, 0, 0, 0, 0);
-
-        auto a_block_even_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            p_a_block_double, a_block_desc_gk0_gm0_gm10_gm11_gk1.GetElementSpaceSize());
-        auto b_block_even_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            p_b_block_double, b_block_desc_gk0_gn0_gn10_gn11_gk1.GetElementSpaceSize());
-
-        auto a_block_odd_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            p_a_block_double + a_block_aligned_space_size,
-            a_block_desc_gk0_gm0_gm10_gm11_gk1.GetElementSpaceSize());
-        auto b_block_odd_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            p_b_block_double + b_block_aligned_space_size,
-            b_block_desc_gk0_gn0_gn10_gn11_gk1.GetElementSpaceSize());
-
-        // LDS double buffer: preload data into LDS
-        {
-            a_blockwise_copy.RunRead(
-                a_grid_desc_gk0_gm0_gm10_gm11_gk1, a_global_buf, AGridStepHacks{});
-            b_blockwise_copy.RunRead(
-                b_grid_desc_gk0_gn0_gn10_gn11_gk1, b_global_buf, BGridStepHacks{});
-
-            a_blockwise_copy.RunWrite(a_block_desc_gk0_gm0_gm10_gm11_gk1, a_block_even_buf);
-            b_blockwise_copy.RunWrite(b_block_desc_gk0_gn0_gn10_gn11_gk1, b_block_even_buf);
-        }
-
-        if constexpr(HasMainKBlockLoop)
-        {
-            index_t gk0_block_on_grid = 0;
-
-            // LDS double buffer: main body
-            // use Do-While loop instead of For loop to simplify control flow
-            do
-            {
-                // even iteration
-                a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc_gk0_gm0_gm10_gm11_gk1,
-                                                    a_block_slice_copy_step,
-                                                    AGridMoveSliceWindowStepHacks{});
-                b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc_gk0_gn0_gn10_gn11_gk1,
-                                                    b_block_slice_copy_step,
-                                                    BGridMoveSliceWindowStepHacks{});
-
-                __syncthreads();
-
-                // LDS doubel buffer: load next data from device mem
-                a_blockwise_copy.RunRead(
-                    a_grid_desc_gk0_gm0_gm10_gm11_gk1, a_global_buf, AGridStepHacks{});
-                b_blockwise_copy.RunRead(
-                    b_grid_desc_gk0_gn0_gn10_gn11_gk1, b_global_buf, BGridStepHacks{});
-
-                // LDS double buffer: GEMM on current data
-                blockwise_gemm.Run(c_thread_desc_bm0_bm1_bn0_bn1,
-                                   a_block_even_buf,
-                                   b_block_even_buf,
-                                   c_thread_buf);
-
-                // LDS double buffer: store next data to LDS
-                a_blockwise_copy.RunWrite(a_block_desc_gk0_gm0_gm10_gm11_gk1, a_block_odd_buf);
-                b_blockwise_copy.RunWrite(b_block_desc_gk0_gn0_gn10_gn11_gk1, b_block_odd_buf);
-
-                // odd iteration
-                a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc_gk0_gm0_gm10_gm11_gk1,
-                                                    a_block_slice_copy_step,
-                                                    AGridMoveSliceWindowStepHacks{});
-                b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc_gk0_gn0_gn10_gn11_gk1,
-                                                    b_block_slice_copy_step,
-                                                    BGridMoveSliceWindowStepHacks{});
-
-                __syncthreads();
-
-                // LDS doubel buffer: load next data from device mem
-                a_blockwise_copy.RunRead(
-                    a_grid_desc_gk0_gm0_gm10_gm11_gk1, a_global_buf, AGridStepHacks{});
-                b_blockwise_copy.RunRead(
-                    b_grid_desc_gk0_gn0_gn10_gn11_gk1, b_global_buf, BGridStepHacks{});
-
-                // LDS double buffer: GEMM on current data
-                blockwise_gemm.Run(
-                    c_thread_desc_bm0_bm1_bn0_bn1, a_block_odd_buf, b_block_odd_buf, c_thread_buf);
-
-                // LDS double buffer: store next data to LDS
-                a_blockwise_copy.RunWrite(a_block_desc_gk0_gm0_gm10_gm11_gk1, a_block_even_buf);
-                b_blockwise_copy.RunWrite(b_block_desc_gk0_gn0_gn10_gn11_gk1, b_block_even_buf);
-
-                gk0_block_on_grid += 2 * GK0PerBlock;
-            } while(gk0_block_on_grid < GK0 - 2 * GK0PerBlock);
-        }
-
-        // LDS double buffer: tail
-        if constexpr(HasDoubleTailKBlockLoop) // if has 2 iteration left
-        {
-            a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc_gk0_gm0_gm10_gm11_gk1,
-                                                a_block_slice_copy_step,
-                                                AGridMoveSliceWindowStepHacks{});
-            b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc_gk0_gn0_gn10_gn11_gk1,
-                                                b_block_slice_copy_step,
-                                                BGridMoveSliceWindowStepHacks{});
-
-            __syncthreads();
-
-            // LDS double buffer: load last data from device mem
-            a_blockwise_copy.RunRead(
-                a_grid_desc_gk0_gm0_gm10_gm11_gk1, a_global_buf, AGridStepHacks{});
-            b_blockwise_copy.RunRead(
-                b_grid_desc_gk0_gn0_gn10_gn11_gk1, b_global_buf, BGridStepHacks{});
-
-            // LDS double buffer: GEMM on 2nd-last data
-            blockwise_gemm.Run(
-                c_thread_desc_bm0_bm1_bn0_bn1, a_block_even_buf, b_block_even_buf, c_thread_buf);
-
-            // LDS double buffer: store last data to LDS
-            a_blockwise_copy.RunWrite(a_block_desc_gk0_gm0_gm10_gm11_gk1, a_block_odd_buf);
-            b_blockwise_copy.RunWrite(b_block_desc_gk0_gn0_gn10_gn11_gk1, b_block_odd_buf);
-
-            __syncthreads();
-
-            // LDS double buffer: GEMM on last data
-            blockwise_gemm.Run(
-                c_thread_desc_bm0_bm1_bn0_bn1, a_block_odd_buf, b_block_odd_buf, c_thread_buf);
-        }
-        else // if has 1 iteration left
-        {
-            __syncthreads();
-
-            // LDS double buffer: GEMM on last data
-            blockwise_gemm.Run(
-                c_thread_desc_bm0_bm1_bn0_bn1, a_block_even_buf, b_block_even_buf, c_thread_buf);
-        }
-
-        // output: register to global memory
-        {
-            constexpr auto c_thread_desc_gm10_bm0_bm1_gn10_bn0_bn1 =
-                make_naive_tensor_descriptor_packed(
-                    make_tuple(I1,
-                               Number<c_thread_tensor_lengths_bm0_bm1_bn0_bn1[I0]>{},
-                               Number<c_thread_tensor_lengths_bm0_bm1_bn0_bn1[I1]>{},
-                               I1,
-                               Number<c_thread_tensor_lengths_bm0_bm1_bn0_bn1[I2]>{},
-                               Number<c_thread_tensor_lengths_bm0_bm1_bn0_bn1[I3]>{}));
-
-            const auto c_thread_origin_on_block_bm0_bm1_bn0_bn1 =
-                blockwise_gemm.CalculateCThreadOriginOnBlock_BM0_BM1_BN0_BN1(
-                    get_thread_local_1d_id());
-
-            ThreadwiseTensorSliceTransfer_v1r3<
-                FloatAcc,
-                FloatC,
-                decltype(c_thread_desc_gm10_bm0_bm1_gn10_bn0_bn1),
-                decltype(c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1),
-                Sequence<1,
-                         c_thread_tensor_lengths_bm0_bm1_bn0_bn1[I0],
-                         c_thread_tensor_lengths_bm0_bm1_bn0_bn1[I1],
-                         1,
-                         c_thread_tensor_lengths_bm0_bm1_bn0_bn1[I2],
-                         c_thread_tensor_lengths_bm0_bm1_bn0_bn1[I3]>,
-                CThreadTransferSrcDstAccessOrder,
-                CThreadTransferSrcDstVectorDim,
-                CThreadTransferDstScalarPerVector,
-                CGlobalMemoryDataOperation,
-                1,
-                false>{c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1,
-                       make_multi_index(igm10,
-                                        c_thread_origin_on_block_bm0_bm1_bn0_bn1[I0],
-                                        c_thread_origin_on_block_bm0_bm1_bn0_bn1[I1],
-                                        ign10,
-                                        c_thread_origin_on_block_bm0_bm1_bn0_bn1[I2],
-                                        c_thread_origin_on_block_bm0_bm1_bn0_bn1[I3])}
-                .Run(c_thread_desc_gm10_bm0_bm1_gn10_bn0_bn1,
-                     make_tuple(I0, I0, I0, I0, I0, I0),
-                     c_thread_buf,
-                     c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1,
-                     c_grid_buf,
-                     CGridStepHacks{});
-        }
-    }
-};
-
-} // namespace ck
-#endif
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v1r2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v1r2.hpp
deleted file mode 100644
index 84e033e1e..000000000
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v1r2.hpp
+++ /dev/null
@@ -1,608 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#ifndef CK_GRIDWISE_GEMM_DLOPS_V1R2_HPP
-#define CK_GRIDWISE_GEMM_DLOPS_V1R2_HPP
-
-#include "common_header.hpp"
-#include "multi_index_transform_helper.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-#include "blockwise_gemm_dlops_v2r2.hpp"
-#include "blockwise_tensor_slice_transfer.hpp"
-#include "threadwise_tensor_slice_transfer.hpp"
-#include "threadwise_tensor_slice_set.hpp"
-
-namespace ck {
-
-template <typename GridwiseGemm,
-          typename FloatAB,
-          typename FloatC,
-          typename AKM0M1GridDesc,
-          typename BKN0N1GridDesc,
-          typename CM0M10M11N0N10N11GridDesc,
-          typename CBlockIdToM0N0BlockClusterAdaptor,
-          bool HasMainKBlockLoop,
-          bool HasDoubleTailKBlockLoop>
-__global__ void
-#if CK_USE_LAUNCH_BOUNDS
-    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
-#endif
-        kernel_gemm_dlops_v1r2(
-            const FloatAB* __restrict__ p_a_grid,
-            const FloatAB* __restrict__ p_b_grid,
-            FloatC* __restrict__ p_c_grid,
-            const AKM0M1GridDesc a_k_m0_m1_grid_desc,
-            const BKN0N1GridDesc b_k_n0_n1_grid_desc,
-            const CM0M10M11N0N10N11GridDesc c_m0_m10_m11_n0_n10_n11_grid_desc,
-            const CBlockIdToM0N0BlockClusterAdaptor cblockid_to_m0_n0_block_cluster_adaptor)
-{
-    constexpr index_t shared_block_size =
-        GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
-
-    __shared__ FloatAB p_shared_block[shared_block_size];
-
-    GridwiseGemm::Run(p_a_grid,
-                      p_b_grid,
-                      p_c_grid,
-                      p_shared_block,
-                      a_k_m0_m1_grid_desc,
-                      b_k_n0_n1_grid_desc,
-                      c_m0_m10_m11_n0_n10_n11_grid_desc,
-                      cblockid_to_m0_n0_block_cluster_adaptor,
-                      integral_constant<bool, HasMainKBlockLoop>{},
-                      integral_constant<bool, HasDoubleTailKBlockLoop>{});
-}
-
-template <index_t BlockSize,
-          typename FloatAB,
-          typename FloatAcc,
-          typename FloatC,
-          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
-          typename AKMGridDesc,
-          typename BKNGridDesc,
-          typename CMNGridDesc,
-          index_t MPerBlockM1,
-          index_t NPerBlockN1,
-          index_t KPerBlock,
-          index_t M1PerThreadM111,
-          index_t N1PerThreadN111,
-          index_t KPerThread,
-          index_t M11N11ThreadClusterM1100,
-          index_t M11N11ThreadClusterN1100,
-          index_t M11N11ThreadClusterM1101,
-          index_t M11N11ThreadClusterN1101,
-          typename ABlockTransferThreadSliceLengths_K_M0_M1,
-          typename ABlockTransferThreadClusterLengths_K_M0_M1,
-          typename ABlockTransferThreadClusterArrangeOrder,
-          typename ABlockTransferSrcAccessOrder,
-          index_t ABlockTransferSrcVectorDim,
-          index_t ABlockTransferSrcScalarPerVector,
-          index_t ABlockTransferDstScalarPerVector_M1,
-          bool AThreadTransferSrcResetCoordinateAfterRun,
-          typename BBlockTransferThreadSliceLengths_K_N0_N1,
-          typename BBlockTransferThreadClusterLengths_K_N0_N1,
-          typename BBlockTransferThreadClusterArrangeOrder,
-          typename BBlockTransferSrcAccessOrder,
-          index_t BBlockTransferSrcVectorDim,
-          index_t BBlockTransferSrcScalarPerVector,
-          index_t BBlockTransferDstScalarPerVector_N1,
-          bool BThreadTransferSrcResetCoordinateAfterRun,
-          typename CThreadTransferSrcDstAccessOrder,
-          index_t CThreadTransferSrcDstVectorDim,
-          index_t CThreadTransferDstScalarPerVector,
-          typename AGridStepHacks,
-          typename BGridStepHacks,
-          typename CGridStepHacks,
-          typename AGridMoveSliceWindowStepHacks,
-          typename BGridMoveSliceWindowStepHacks>
-struct GridwiseGemmDlops_km_kn_mn_v1r2
-{
-    static constexpr auto I0 = Number<0>{};
-    static constexpr auto I1 = Number<1>{};
-    static constexpr auto I2 = Number<2>{};
-    static constexpr auto I3 = Number<3>{};
-
-    __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
-    {
-        constexpr auto max_lds_align = math::lcm(Number<ABlockTransferDstScalarPerVector_M1>{},
-                                                 Number<BBlockTransferDstScalarPerVector_N1>{},
-                                                 Number<M1PerThreadM111>{},
-                                                 Number<N1PerThreadN111>{});
-
-        // A matrix in LDS memory, dst of blockwise copy
-        //   be careful of LDS alignment
-        constexpr auto a_k_m_block_desc = make_naive_tensor_descriptor_aligned(
-            make_tuple(Number<KPerBlock>{}, Number<MPerBlockM1>{}), max_lds_align);
-
-        // B matrix in LDS memory, dst of blockwise copy
-        //   be careful of LDS alignment
-        constexpr auto b_k_n_block_desc = make_naive_tensor_descriptor_aligned(
-            make_tuple(Number<KPerBlock>{}, Number<NPerBlockN1>{}), max_lds_align);
-
-        // LDS allocation for A and B: be careful of alignment
-        constexpr auto a_block_aligned_space_size =
-            math::integer_least_multiple(a_k_m_block_desc.GetElementSpaceSize(), max_lds_align);
-
-        constexpr auto b_block_aligned_space_size =
-            math::integer_least_multiple(b_k_n_block_desc.GetElementSpaceSize(), max_lds_align);
-
-        return 2 * (a_block_aligned_space_size + b_block_aligned_space_size) * sizeof(FloatAB);
-    }
-
-    __host__ __device__ static constexpr bool CheckValidity(const AKMGridDesc& a_k_m_grid_desc,
-                                                            const BKNGridDesc& b_k_n_grid_desc,
-                                                            const CMNGridDesc& c_m_n_grid_desc)
-    {
-        const auto M = a_k_m_grid_desc.GetLength(I1);
-        const auto N = b_k_n_grid_desc.GetLength(I1);
-        const auto K = a_k_m_grid_desc.GetLength(I0);
-
-        // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
-
-        return (M == c_m_n_grid_desc.GetLength(I0) && N == c_m_n_grid_desc.GetLength(I1) &&
-                K == b_k_n_grid_desc.GetLength(I0)) &&
-               (M % MPerBlockM1 == 0 && N % NPerBlockN1 == 0 && K % KPerBlock == 0);
-    }
-
-    __host__ __device__ static constexpr index_t CalculateGridSize(index_t M, index_t N)
-    {
-        const index_t grid_size = (M / MPerBlockM1) * (N / NPerBlockN1);
-
-        return grid_size;
-    }
-
-    __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K)
-    {
-        const bool has_main_k_block_loop = (K + KPerBlock) / (2 * KPerBlock) > 1;
-
-        return has_main_k_block_loop;
-    }
-
-    __host__ __device__ static constexpr bool CalculateHasDoubleTailKBlockLoop(index_t K)
-    {
-        const bool has_double_tail_k_block_loop = (K / KPerBlock) % 2 == 0;
-
-        return has_double_tail_k_block_loop;
-    }
-
-    __host__ __device__ static constexpr auto
-    MakeAKM0M1GridDescriptor(const AKMGridDesc& a_k_m_grid_desc)
-    {
-        const auto K = a_k_m_grid_desc.GetLength(I0);
-        const auto M = a_k_m_grid_desc.GetLength(I1);
-
-        const auto M1 = Number<MPerBlockM1>{};
-        const auto M0 = M / M1;
-
-        const auto a_k_m0_m1_grid_desc = transform_tensor_descriptor(
-            a_k_m_grid_desc,
-            make_tuple(make_pass_through_transform(K), make_unmerge_transform(make_tuple(M0, M1))),
-            make_tuple(Sequence<0>{}, Sequence<1>{}),
-            make_tuple(Sequence<0>{}, Sequence<1, 2>{}));
-
-        return a_k_m0_m1_grid_desc;
-    }
-
-    __host__ __device__ static constexpr auto
-    MakeBKN0N1GridDescriptor(const BKNGridDesc& b_k_n_grid_desc)
-    {
-        const auto K = b_k_n_grid_desc.GetLength(I0);
-        const auto N = b_k_n_grid_desc.GetLength(I1);
-
-        const auto N1 = Number<NPerBlockN1>{};
-        const auto N0 = N / N1;
-
-        const auto b_k_n0_n1_grid_desc = transform_tensor_descriptor(
-            b_k_n_grid_desc,
-            make_tuple(make_pass_through_transform(K), make_unmerge_transform(make_tuple(N0, N1))),
-            make_tuple(Sequence<0>{}, Sequence<1>{}),
-            make_tuple(Sequence<0>{}, Sequence<1, 2>{}));
-
-        return b_k_n0_n1_grid_desc;
-    }
-
-    __host__ __device__ static constexpr auto
-    MakeCM0M10M11N0N10N11GridDescriptor(const CMNGridDesc& c_m_n_grid_desc)
-    {
-        const auto M = c_m_n_grid_desc.GetLength(I0);
-        const auto N = c_m_n_grid_desc.GetLength(I1);
-
-        constexpr auto M1 = Number<MPerBlockM1>{};
-        constexpr auto N1 = Number<NPerBlockN1>{};
-
-        const auto M0 = M / M1;
-        const auto N0 = N / N1;
-
-        constexpr auto M11 =
-            Number<M11N11ThreadClusterM1100 * M11N11ThreadClusterM1101 * M1PerThreadM111>{};
-        constexpr auto N11 =
-            Number<M11N11ThreadClusterN1100 * M11N11ThreadClusterN1101 * N1PerThreadN111>{};
-
-        constexpr auto M10 = M1 / M11;
-        constexpr auto N10 = N1 / N11;
-
-        const auto c_m0_m10_m11_n0_n10_n11_grid_desc = transform_tensor_descriptor(
-            c_m_n_grid_desc,
-            make_tuple(make_unmerge_transform(make_tuple(M0, M10, M11)),
-                       make_unmerge_transform(make_tuple(N0, N10, N11))),
-            make_tuple(Sequence<0>{}, Sequence<1>{}),
-            make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5>{}));
-
-        return c_m0_m10_m11_n0_n10_n11_grid_desc;
-    }
-
-    __host__ __device__ static constexpr auto
-    MakeCBlockIdToM0N0BlockClusterAdaptor(const CMNGridDesc& c_m_n_grid_desc)
-    {
-        const auto M = c_m_n_grid_desc.GetLength(I0);
-        const auto N = c_m_n_grid_desc.GetLength(I1);
-
-        constexpr auto M1 = Number<MPerBlockM1>{};
-        constexpr auto N1 = Number<NPerBlockN1>{};
-
-        const auto M0 = M / M1;
-        const auto N0 = N / N1;
-
-        const auto cblockid_to_m0_n0_block_cluster_adaptor =
-            make_single_stage_tensor_adaptor(make_tuple(make_merge_transform(make_tuple(M0, N0))),
-                                             make_tuple(Sequence<0, 1>{}),
-                                             make_tuple(Sequence<0>{}));
-
-        return cblockid_to_m0_n0_block_cluster_adaptor;
-    }
-
-    using AKM0M1GridDesc            = decltype(MakeAKM0M1GridDescriptor(AKMGridDesc{}));
-    using BKN0N1GridDesc            = decltype(MakeBKN0N1GridDescriptor(BKNGridDesc{}));
-    using CM0M10M11N0N10N11GridDesc = decltype(MakeCM0M10M11N0N10N11GridDescriptor(CMNGridDesc{}));
-    using CBlockIdToM0N0BlockClusterAdaptor =
-        decltype(MakeCBlockIdToM0N0BlockClusterAdaptor(CMNGridDesc{}));
-
-    template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
-    __device__ static void
-    Run(const FloatAB* __restrict__ p_a_grid,
-        const FloatAB* __restrict__ p_b_grid,
-        FloatC* __restrict__ p_c_grid,
-        FloatAB* __restrict__ p_shared_block,
-        const AKM0M1GridDesc& a_k_m0_m1_grid_desc,
-        const BKN0N1GridDesc& b_k_n0_n1_grid_desc,
-        const CM0M10M11N0N10N11GridDesc& c_m0_m10_m11_n0_n10_n11_grid_desc,
-        const CBlockIdToM0N0BlockClusterAdaptor& cblockid_to_m0_n0_block_cluster_adaptor,
-        integral_constant<bool, HasMainKBlockLoop>,
-        integral_constant<bool, HasDoubleTailKBlockLoop>)
-    {
-        const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_a_grid, a_k_m0_m1_grid_desc.GetElementSpaceSize());
-        const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_b_grid, b_k_n0_n1_grid_desc.GetElementSpaceSize());
-        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_c_grid, c_m0_m10_m11_n0_n10_n11_grid_desc.GetElementSpaceSize());
-
-        const auto K = a_k_m0_m1_grid_desc.GetLength(I0);
-
-        // divide block work by [M, N]
-        const auto c_m0_n0_block_cluster_idx =
-            cblockid_to_m0_n0_block_cluster_adaptor.CalculateBottomIndex(
-                make_multi_index(get_block_1d_id()));
-
-        // HACK: this force index data into SGPR
-        const index_t im0 = __builtin_amdgcn_readfirstlane(c_m0_n0_block_cluster_idx[I0]);
-        const index_t in0 = __builtin_amdgcn_readfirstlane(c_m0_n0_block_cluster_idx[I1]);
-
-        // lds max alignment
-        constexpr auto max_lds_align = math::lcm(Number<ABlockTransferDstScalarPerVector_M1>{},
-                                                 Number<BBlockTransferDstScalarPerVector_N1>{},
-                                                 Number<M1PerThreadM111>{},
-                                                 Number<N1PerThreadN111>{});
-
-        // A matrix in LDS memory, dst of blockwise copy
-        //   be careful of LDS alignment
-        constexpr auto a_k_m_block_desc = make_naive_tensor_descriptor_aligned(
-            make_tuple(Number<KPerBlock>{}, Number<MPerBlockM1>{}), max_lds_align);
-
-        // B matrix in LDS memory, dst of blockwise copy
-        //   be careful of LDS alignment
-        constexpr auto b_k_n_block_desc = make_naive_tensor_descriptor_aligned(
-            make_tuple(Number<KPerBlock>{}, Number<NPerBlockN1>{}), max_lds_align);
-
-        // A matrix in LDS memory, dst of blockwise copy
-        //   be careful of LDS alignment
-        constexpr auto a_k_m0_m1_block_desc = make_naive_tensor_descriptor_aligned(
-            make_tuple(Number<KPerBlock>{}, I1, Number<MPerBlockM1>{}), max_lds_align);
-
-        // B matrix in LDS memory, dst of blockwise copy
-        //   be careful of LDS alignment
-        constexpr auto b_k_n0_n1_block_desc = make_naive_tensor_descriptor_aligned(
-            make_tuple(Number<KPerBlock>{}, I1, Number<NPerBlockN1>{}), max_lds_align);
-
-        // A matrix blockwise copy
-        auto a_blockwise_copy =
-            BlockwiseTensorSliceTransfer_v4<BlockSize,
-                                            InMemoryDataOperationEnum::Set,
-                                            Sequence<KPerBlock, 1, MPerBlockM1>,
-                                            ABlockTransferThreadSliceLengths_K_M0_M1,
-                                            ABlockTransferThreadClusterLengths_K_M0_M1,
-                                            ABlockTransferThreadClusterArrangeOrder,
-                                            FloatAB,
-                                            FloatAB,
-                                            decltype(a_k_m0_m1_grid_desc),
-                                            decltype(a_k_m0_m1_block_desc),
-                                            ABlockTransferSrcAccessOrder,
-                                            Sequence<0, 1, 2>,
-                                            ABlockTransferSrcVectorDim,
-                                            2,
-                                            ABlockTransferSrcScalarPerVector,
-                                            ABlockTransferDstScalarPerVector_M1,
-                                            1,
-                                            1,
-                                            AThreadTransferSrcResetCoordinateAfterRun,
-                                            true>(a_k_m0_m1_grid_desc,
-                                                  make_multi_index(0, im0, 0),
-                                                  a_k_m0_m1_block_desc,
-                                                  make_multi_index(0, 0, 0));
-
-        // B matrix blockwise copy
-        auto b_blockwise_copy =
-            BlockwiseTensorSliceTransfer_v4<BlockSize,
-                                            InMemoryDataOperationEnum::Set,
-                                            Sequence<KPerBlock, 1, NPerBlockN1>,
-                                            BBlockTransferThreadSliceLengths_K_N0_N1,
-                                            BBlockTransferThreadClusterLengths_K_N0_N1,
-                                            BBlockTransferThreadClusterArrangeOrder,
-                                            FloatAB,
-                                            FloatAB,
-                                            decltype(b_k_n0_n1_grid_desc),
-                                            decltype(b_k_n0_n1_block_desc),
-                                            BBlockTransferSrcAccessOrder,
-                                            Sequence<0, 1, 2>,
-                                            BBlockTransferSrcVectorDim,
-                                            2,
-                                            BBlockTransferSrcScalarPerVector,
-                                            BBlockTransferDstScalarPerVector_N1,
-                                            1,
-                                            1,
-                                            BThreadTransferSrcResetCoordinateAfterRun,
-                                            true>(b_k_n0_n1_grid_desc,
-                                                  make_multi_index(0, in0, 0),
-                                                  b_k_n0_n1_block_desc,
-                                                  make_multi_index(0, 0, 0));
-
-        // GEMM definition
-        //   c_mtx += transpose(a_mtx) * b_mtx
-        //     a_mtx[KPerBlock, MPerBlockM1] is in LDS
-        //     b_mtx[KPerBlocl, NPerBlockN1] is in LDS
-        //     c_mtx[MPerBlockM1, NPerBlockN1] is distributed among threads, and saved in
-        //       register
-        const auto blockwise_gemm =
-            BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2<BlockSize,
-                                                                FloatAB,
-                                                                FloatAB,
-                                                                FloatAcc,
-                                                                decltype(a_k_m_block_desc),
-                                                                decltype(b_k_n_block_desc),
-                                                                M1PerThreadM111,
-                                                                N1PerThreadN111,
-                                                                KPerThread,
-                                                                M11N11ThreadClusterM1100,
-                                                                M11N11ThreadClusterN1100,
-                                                                M11N11ThreadClusterM1101,
-                                                                M11N11ThreadClusterN1101,
-                                                                M1PerThreadM111,
-                                                                N1PerThreadN111>{};
-        constexpr auto c_m10_m11_n10_n11_thread_tensor_lengths =
-            decltype(blockwise_gemm)::GetCM0M1N0N1ThreadTensorLengths();
-
-        constexpr auto c_m10_m11_n10_n11_thread_desc = make_naive_tensor_descriptor_packed(
-            sequence_to_tuple_of_number(c_m10_m11_n10_n11_thread_tensor_lengths));
-
-        // LDS allocation for A and B: be careful of alignment
-        constexpr auto a_block_aligned_space_size =
-            math::integer_least_multiple(a_k_m0_m1_block_desc.GetElementSpaceSize(), max_lds_align);
-
-        constexpr auto b_block_aligned_space_size =
-            math::integer_least_multiple(b_k_n0_n1_block_desc.GetElementSpaceSize(), max_lds_align);
-
-        FloatAB* p_a_block_double = p_shared_block;
-        FloatAB* p_b_block_double = p_shared_block + 2 * a_block_aligned_space_size;
-
-        // register allocation for output
-        auto c_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatAcc>(
-            c_m10_m11_n10_n11_thread_desc.GetElementSpaceSize());
-
-        ThreadwiseTensorSliceSet_v1<FloatAcc,
-                                    decltype(c_m10_m11_n10_n11_thread_desc),
-                                    decltype(c_m10_m11_n10_n11_thread_tensor_lengths)>{}
-            .Run(c_m10_m11_n10_n11_thread_desc,
-                 make_tuple(I0, I0, I0, I0),
-                 c_thread_buf,
-                 FloatAcc{0});
-
-        constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock, 0, 0);
-        constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock, 0, 0);
-
-        // hack to control index calculation when iterating over A and B matrix for threadwise copy
-        constexpr auto a_k_m0_m1_global_step_hacks = AGridStepHacks{};
-        constexpr auto b_k_n0_n1_global_step_hacks = BGridStepHacks{};
-
-        // hack to control index calculation when move slice window for A and B matrix for
-        // threadwise copy
-        constexpr auto a_k_m0_m1_global_move_slice_window_step_hack =
-            AGridMoveSliceWindowStepHacks{};
-        constexpr auto b_k_n0_n1_global_move_slice_window_step_hack =
-            BGridMoveSliceWindowStepHacks{};
-
-        auto a_block_even_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            p_a_block_double, a_k_m0_m1_block_desc.GetElementSpaceSize());
-        auto b_block_even_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            p_b_block_double, b_k_n0_n1_block_desc.GetElementSpaceSize());
-
-        auto a_block_odd_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            p_a_block_double + a_block_aligned_space_size,
-            a_k_m0_m1_block_desc.GetElementSpaceSize());
-        auto b_block_odd_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            p_b_block_double + b_block_aligned_space_size,
-            b_k_n0_n1_block_desc.GetElementSpaceSize());
-
-        // LDS double buffer: preload data into LDS
-        {
-            a_blockwise_copy.RunRead(
-                a_k_m0_m1_grid_desc, a_global_buf, a_k_m0_m1_global_step_hacks);
-            b_blockwise_copy.RunRead(
-                b_k_n0_n1_grid_desc, b_global_buf, b_k_n0_n1_global_step_hacks);
-
-            a_blockwise_copy.RunWrite(a_k_m0_m1_block_desc, a_block_even_buf);
-            b_blockwise_copy.RunWrite(b_k_n0_n1_block_desc, b_block_even_buf);
-        }
-
-        if constexpr(HasMainKBlockLoop)
-        {
-            index_t k_block_data_begin = 0;
-
-            // LDS double buffer: main body
-            // use Do-While loop instead of For loop to simplify control flow
-            do
-            {
-                // even iteration
-                a_blockwise_copy.MoveSrcSliceWindow(a_k_m0_m1_grid_desc,
-                                                    a_block_slice_copy_step,
-                                                    a_k_m0_m1_global_move_slice_window_step_hack);
-                b_blockwise_copy.MoveSrcSliceWindow(b_k_n0_n1_grid_desc,
-                                                    b_block_slice_copy_step,
-                                                    b_k_n0_n1_global_move_slice_window_step_hack);
-
-                __syncthreads();
-
-                // LDS doubel buffer: load next data from device mem
-                a_blockwise_copy.RunRead(
-                    a_k_m0_m1_grid_desc, a_global_buf, a_k_m0_m1_global_step_hacks);
-                b_blockwise_copy.RunRead(
-                    b_k_n0_n1_grid_desc, b_global_buf, b_k_n0_n1_global_step_hacks);
-
-                // LDS double buffer: GEMM on current data
-                blockwise_gemm.Run(c_m10_m11_n10_n11_thread_desc,
-                                   a_block_even_buf,
-                                   b_block_even_buf,
-                                   c_thread_buf);
-
-                // LDS double buffer: store next data to LDS
-                a_blockwise_copy.RunWrite(a_k_m0_m1_block_desc, a_block_odd_buf);
-                b_blockwise_copy.RunWrite(b_k_n0_n1_block_desc, b_block_odd_buf);
-
-                // odd iteration
-                a_blockwise_copy.MoveSrcSliceWindow(a_k_m0_m1_grid_desc,
-                                                    a_block_slice_copy_step,
-                                                    a_k_m0_m1_global_move_slice_window_step_hack);
-                b_blockwise_copy.MoveSrcSliceWindow(b_k_n0_n1_grid_desc,
-                                                    b_block_slice_copy_step,
-                                                    b_k_n0_n1_global_move_slice_window_step_hack);
-
-                __syncthreads();
-
-                // LDS doubel buffer: load next data from device mem
-                a_blockwise_copy.RunRead(
-                    a_k_m0_m1_grid_desc, a_global_buf, a_k_m0_m1_global_step_hacks);
-                b_blockwise_copy.RunRead(
-                    b_k_n0_n1_grid_desc, b_global_buf, b_k_n0_n1_global_step_hacks);
-
-                // LDS double buffer: GEMM on current data
-                blockwise_gemm.Run(
-                    c_m10_m11_n10_n11_thread_desc, a_block_odd_buf, b_block_odd_buf, c_thread_buf);
-
-                // LDS double buffer: store next data to LDS
-                a_blockwise_copy.RunWrite(a_k_m0_m1_block_desc, a_block_even_buf);
-                b_blockwise_copy.RunWrite(b_k_n0_n1_block_desc, b_block_even_buf);
-
-                k_block_data_begin += 2 * KPerBlock;
-            } while(k_block_data_begin < K - 2 * KPerBlock);
-        }
-
-        // LDS double buffer: tail
-        if constexpr(HasDoubleTailKBlockLoop) // if has 2 iteration left
-        {
-            a_blockwise_copy.MoveSrcSliceWindow(a_k_m0_m1_grid_desc,
-                                                a_block_slice_copy_step,
-                                                a_k_m0_m1_global_move_slice_window_step_hack);
-            b_blockwise_copy.MoveSrcSliceWindow(b_k_n0_n1_grid_desc,
-                                                b_block_slice_copy_step,
-                                                b_k_n0_n1_global_move_slice_window_step_hack);
-
-            __syncthreads();
-
-            // LDS double buffer: load last data from device mem
-            a_blockwise_copy.RunRead(
-                a_k_m0_m1_grid_desc, a_global_buf, a_k_m0_m1_global_step_hacks);
-            b_blockwise_copy.RunRead(
-                b_k_n0_n1_grid_desc, b_global_buf, b_k_n0_n1_global_step_hacks);
-
-            // LDS double buffer: GEMM on 2nd-last data
-            blockwise_gemm.Run(
-                c_m10_m11_n10_n11_thread_desc, a_block_even_buf, b_block_even_buf, c_thread_buf);
-
-            // LDS double buffer: store last data to LDS
-            a_blockwise_copy.RunWrite(a_k_m0_m1_block_desc, a_block_odd_buf);
-            b_blockwise_copy.RunWrite(b_k_n0_n1_block_desc, b_block_odd_buf);
-
-            __syncthreads();
-
-            // LDS double buffer: GEMM on last data
-            blockwise_gemm.Run(
-                c_m10_m11_n10_n11_thread_desc, a_block_odd_buf, b_block_odd_buf, c_thread_buf);
-        }
-        else // if has 1 iteration left
-        {
-            __syncthreads();
-
-            // LDS double buffer: GEMM on last data
-            blockwise_gemm.Run(
-                c_m10_m11_n10_n11_thread_desc, a_block_even_buf, b_block_even_buf, c_thread_buf);
-        }
-
-        // output: register to global memory
-        {
-            constexpr auto c_m0_m10_m11_n0_n10_n11_thread_desc =
-                make_naive_tensor_descriptor_packed(
-                    make_tuple(I1,
-                               Number<c_m10_m11_n10_n11_thread_tensor_lengths[I0]>{},
-                               Number<c_m10_m11_n10_n11_thread_tensor_lengths[I1]>{},
-                               I1,
-                               Number<c_m10_m11_n10_n11_thread_tensor_lengths[I2]>{},
-                               Number<c_m10_m11_n10_n11_thread_tensor_lengths[I3]>{}));
-
-            const auto c_m10_m11_n10_n11_thread_origin_idx_on_block =
-                blockwise_gemm.CalculateCM0M1N0N1ThreadOriginOnBlock(get_thread_local_1d_id());
-
-            ThreadwiseTensorSliceTransfer_v1r3<
-                FloatAcc,
-                FloatC,
-                decltype(c_m0_m10_m11_n0_n10_n11_thread_desc),
-                decltype(c_m0_m10_m11_n0_n10_n11_grid_desc),
-                Sequence<1,
-                         c_m10_m11_n10_n11_thread_tensor_lengths[I0],
-                         c_m10_m11_n10_n11_thread_tensor_lengths[I1],
-                         1,
-                         c_m10_m11_n10_n11_thread_tensor_lengths[I2],
-                         c_m10_m11_n10_n11_thread_tensor_lengths[I3]>,
-                CThreadTransferSrcDstAccessOrder,
-                CThreadTransferSrcDstVectorDim,
-                CThreadTransferDstScalarPerVector,
-                CGlobalMemoryDataOperation,
-                1,
-                true>{c_m0_m10_m11_n0_n10_n11_grid_desc,
-                      make_multi_index(im0,
-                                       c_m10_m11_n10_n11_thread_origin_idx_on_block[I0],
-                                       c_m10_m11_n10_n11_thread_origin_idx_on_block[I1],
-                                       in0,
-                                       c_m10_m11_n10_n11_thread_origin_idx_on_block[I2],
-                                       c_m10_m11_n10_n11_thread_origin_idx_on_block[I3])}
-                .Run(c_m0_m10_m11_n0_n10_n11_thread_desc,
-                     make_tuple(I0, I0, I0, I0, I0, I0),
-                     c_thread_buf,
-                     c_m0_m10_m11_n0_n10_n11_grid_desc,
-                     c_grid_buf,
-                     CGridStepHacks{});
-        }
-    }
-};
-
-} // namespace ck
-#endif
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v2.hpp
deleted file mode 100644
index b1dfb0c73..000000000
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v2.hpp
+++ /dev/null
@@ -1,461 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#ifndef CK_GRIDWISE_GEMM_V2_HPP
-#define CK_GRIDWISE_GEMM_V2_HPP
-
-#include "common_header.hpp"
-#include "multi_index_transform_helper.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-#include "blockwise_tensor_slice_transfer.hpp"
-#include "threadwise_tensor_slice_transfer.hpp"
-#include "blockwise_gemm_dlops_v3.hpp"
-
-namespace ck {
-
-template <index_t BlockSize,
-          typename FloatAB,
-          typename FloatAcc,
-          typename FloatC,
-          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
-          typename AGlobalDesc,
-          typename BGlobalDesc,
-          typename CGlobalDesc,
-          index_t KPerBlock,
-          index_t HoPerBlock,
-          index_t WoPerBlock,
-          index_t EPerBlock,
-          index_t KPerThread,
-          index_t HoPerThread,
-          index_t WoPerThread,
-          index_t EPerThread,
-          typename ABlockTransferThreadSliceLengths_E_K,
-          typename ABlockTransferThreadClusterLengths_E_K,
-          typename ABlockTransferThreadClusterArrangeOrder,
-          typename ABlockTransferSrcAccessOrder,
-          index_t ABlockTransferSrcVectorDim,
-          index_t ABlockTransferSrcScalarPerVector,
-          index_t ABlockTransferDstScalarPerVector_K,
-          bool AThreadTransferSrcResetCoordinateAfterRun,
-          typename BBlockTransferSrcAccessOrder,
-          index_t BBlockTransferSrcVectorDim,
-          index_t BBlockTransferSrcScalarPerVector,
-          bool BThreadTransferSrcResetCoordinateAfterRun,
-          typename CThreadTransferSrcDstAccessOrder,
-          index_t CThreadTransferSrcDstVectorDim,
-          index_t CThreadTransferDstScalarPerVector,
-          typename AGlobalStepHacks,
-          typename BGlobalStepHacks,
-          typename CGlobalStepHacks,
-          typename AGlobalMoveSliceWindowStepHacks,
-          typename BGlobalMoveSliceWindowStepHacks>
-struct GridwiseGemmDlops_km_kn_mn_v3
-{
-    __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
-    {
-        constexpr auto E = EPerBlock * 3 * 3;
-
-        constexpr auto max_lds_align =
-            math::lcm(Number<ABlockTransferDstScalarPerVector_K>{}, Number<KPerBlock>{});
-
-        // A matrix in LDS memory, dst of blockwise copy
-        //   be careful of LDS alignment
-        constexpr auto a_e_k_desc = make_naive_tensor_descriptor_aligned(
-            make_tuple(Number<E>{}, Number<KPerBlock>{}), max_lds_align);
-
-        // LDS allocation for A and B: be careful of alignment
-        constexpr auto a_block_space_size =
-            math::integer_least_multiple(a_e_k_desc.GetElementSpaceSize(), max_lds_align);
-
-        return a_block_space_size * sizeof(FloatAB);
-    }
-
-    template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
-    __device__ void Run(const AGlobalDesc& a_e_k_global_desc,
-                        const FloatAB* __restrict__ p_a_global,
-                        const BGlobalDesc& b_e_n_ho_wo_global_desc,
-                        const FloatAB* __restrict__ p_b_global,
-                        const CGlobalDesc& c_k_n_ho_wo_global_desc,
-                        FloatC* __restrict__ p_c_global,
-                        FloatAB* __restrict__ p_shared_block,
-                        integral_constant<bool, HasMainKBlockLoop>,
-                        integral_constant<bool, HasDoubleTailKBlockLoop>) const
-    {
-        constexpr auto I0 = Number<0>{};
-        constexpr auto I1 = Number<1>{};
-        constexpr auto I2 = Number<2>{};
-        constexpr auto I3 = Number<3>{};
-
-        const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_a_global, a_e_k_global_desc.GetElementSpaceSize());
-        const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_b_global, b_e_n_ho_wo_global_desc.GetElementSpaceSize());
-        auto c_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_c_global, c_k_n_ho_wo_global_desc.GetElementSpaceSize());
-
-        constexpr auto E = EPerBlock * 3 * 3;
-
-        // const auto E = a_e_k_global_desc.GetLength(I0);
-        const auto K = a_e_k_global_desc.GetLength(I1);
-
-        const auto N  = b_e_n_ho_wo_global_desc.GetLength(I1);
-        const auto Ho = b_e_n_ho_wo_global_desc.GetLength(I2);
-        const auto Wo = b_e_n_ho_wo_global_desc.GetLength(I3);
-
-// divide block work by [M, N]
-#if 0
-        const auto ho_block_work_num  = Ho / Number<HoPerBlock>{};
-        const auto wo_block_work_num  = Wo / Number<WoPerBlock>{};
-        const auto hwo_block_work_num = ho_block_work_num * wo_block_work_num;
-
-        const index_t k_block_work_id   = get_block_1d_id() / hwo_block_work_num;
-        const index_t hwo_block_work_id = get_block_1d_id() - k_block_work_id * hwo_block_work_num;
-
-        const index_t ho_block_work_id = hwo_block_work_id / wo_block_work_num;
-        const index_t wo_block_work_id = hwo_block_work_id - ho_block_work_id * wo_block_work_num;
-#else
-        // Hack: this force result into SGPR
-        const index_t ho_block_work_num  = __builtin_amdgcn_readfirstlane(Ho / HoPerBlock);
-        const index_t wo_block_work_num  = __builtin_amdgcn_readfirstlane(Wo / WoPerBlock);
-        const index_t hwo_block_work_num = ho_block_work_num * wo_block_work_num;
-
-        const index_t k_block_work_id =
-            __builtin_amdgcn_readfirstlane(get_block_1d_id() / hwo_block_work_num);
-        const index_t hwo_block_work_id = get_block_1d_id() - k_block_work_id * hwo_block_work_num;
-
-        const index_t ho_block_work_id =
-            __builtin_amdgcn_readfirstlane(hwo_block_work_id / wo_block_work_num);
-        const index_t wo_block_work_id = hwo_block_work_id - ho_block_work_id * wo_block_work_num;
-#endif
-
-        // lds max alignment
-        constexpr auto max_lds_align =
-            math::lcm(Number<ABlockTransferDstScalarPerVector_K>{}, Number<KPerBlock>{});
-
-        // A matrix in LDS memory, dst of blockwise copy
-        //   be careful of LDS alignment
-        constexpr auto a_e_k_block_desc = make_naive_tensor_descriptor_aligned(
-            make_tuple(Number<EPerBlock>{}, Number<KPerBlock>{}), max_lds_align);
-
-        constexpr auto a_e_k_desc = make_naive_tensor_descriptor_aligned(
-            make_tuple(Number<E>{}, Number<KPerBlock>{}), max_lds_align);
-
-        // B matrix in LDS memory, dst of blockwise copy
-        //   be careful of LDS alignment
-        constexpr auto b_e_n_ho_wo_block_desc = make_naive_tensor_descriptor_packed(make_tuple(
-            Number<EPerBlock>{}, Number<1>{}, Number<HoPerBlock>{}, Number<WoPerBlock>{}));
-
-        // c_thread_mtx definition: this is a mess
-        // TODO:: more elegent way of defining c_thread_mtx
-        constexpr auto c_k_n_ho_wo_thread_desc = make_naive_tensor_descriptor_packed(make_tuple(
-            Number<KPerThread>{}, Number<1>{}, Number<HoPerThread>{}, Number<WoPerThread>{}));
-
-        auto blockwise_gemm =
-            BlockwiseGemmDlops_km_kn_m0m1n0n1_v3<BlockSize,
-                                                 FloatAB,
-                                                 FloatAB,
-                                                 FloatAcc,
-                                                 decltype(a_e_k_block_desc),
-                                                 decltype(b_e_n_ho_wo_block_desc),
-                                                 decltype(c_k_n_ho_wo_thread_desc),
-                                                 KPerThread,
-                                                 HoPerThread,
-                                                 WoPerThread,
-                                                 EPerThread,
-                                                 ABlockTransferSrcScalarPerVector,
-                                                 ABlockTransferDstScalarPerVector_K>{};
-
-        auto c_thread_mtx_index = blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());
-
-        const auto k_thread_id  = c_thread_mtx_index.k;
-        const auto ho_thread_id = c_thread_mtx_index.h;
-        const auto wo_thread_id = c_thread_mtx_index.w;
-
-        const index_t k_block_data_on_global  = k_block_work_id * KPerBlock;
-        const index_t ho_block_data_on_global = ho_block_work_id * HoPerBlock;
-        const index_t wo_block_data_on_global = wo_block_work_id * WoPerBlock;
-
-        const index_t ho_thread_data_on_global =
-            ho_block_data_on_global + ho_thread_id * HoPerThread;
-        const index_t wo_thread_data_on_global =
-            wo_block_data_on_global + wo_thread_id * WoPerThread;
-
-        // A matrix blockwise copy
-        auto a_blockwise_copy =
-            BlockwiseTensorSliceTransfer_v4<BlockSize,
-                                            InMemoryDataOperationEnum::Set,
-                                            Sequence<E, KPerBlock>,
-                                            ABlockTransferThreadSliceLengths_E_K,
-                                            ABlockTransferThreadClusterLengths_E_K,
-                                            ABlockTransferThreadClusterArrangeOrder,
-                                            FloatAB,
-                                            FloatAB,
-                                            decltype(a_e_k_global_desc),
-                                            decltype(a_e_k_desc),
-                                            ABlockTransferSrcAccessOrder,
-                                            Sequence<0, 1>,
-                                            ABlockTransferSrcVectorDim,
-                                            1,
-                                            ABlockTransferSrcScalarPerVector,
-                                            ABlockTransferDstScalarPerVector_K,
-                                            1,
-                                            1,
-                                            AThreadTransferSrcResetCoordinateAfterRun,
-                                            true>(a_e_k_global_desc,
-                                                  make_multi_index(0, k_block_data_on_global),
-                                                  a_e_k_desc,
-                                                  make_multi_index(0, 0));
-
-        constexpr auto b_e_n_ho_wo_thread_desc = make_naive_tensor_descriptor_packed(make_tuple(
-            Number<EPerBlock>{}, Number<1>{}, Number<HoPerThread>{}, Number<WoPerThread>{}));
-
-        auto b_threadwise_transfer =
-            ThreadwiseTensorSliceTransfer_v2<FloatAB,
-                                             FloatAB,
-                                             decltype(b_e_n_ho_wo_global_desc),
-                                             decltype(b_e_n_ho_wo_thread_desc),
-                                             Sequence<EPerBlock, 1, HoPerThread, WoPerThread>,
-                                             BBlockTransferSrcAccessOrder,
-                                             BBlockTransferSrcVectorDim,
-                                             BBlockTransferSrcScalarPerVector,
-                                             1,
-                                             true>(
-                b_e_n_ho_wo_global_desc,
-                make_multi_index(0, 0, ho_thread_data_on_global, wo_thread_data_on_global));
-
-        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            p_shared_block, a_e_k_desc.GetElementSpaceSize());
-
-        // register allocation for output
-        StaticBuffer<AddressSpaceEnum::Vgpr,
-                     FloatAcc,
-                     c_k_n_ho_wo_thread_desc.GetElementSpaceSize(),
-                     true>
-            c_thread_buf;
-
-        // initialize output thread tensor
-        ThreadwiseTensorSliceSet_v1<FloatAcc,
-                                    decltype(c_k_n_ho_wo_thread_desc),
-                                    Sequence<KPerThread, 1, HoPerThread, WoPerThread>>{}
-            .Run(c_k_n_ho_wo_thread_desc, make_tuple(I0, I0, I0, I0), c_thread_buf, FloatAcc{0});
-
-        constexpr auto b_thread_slice_copy_step = make_multi_index(EPerBlock, 0, 0, 0);
-
-        // hack to control index calculation when iterating over A and B matrix for threadwise copy
-        constexpr auto a_e_k_global_step_hacks       = AGlobalStepHacks{};
-        constexpr auto b_e_n_ho_wo_global_step_hacks = BGlobalStepHacks{};
-
-        // hack to control index calculation when move slice window for A and B matrix for
-        // threadwise copy
-        constexpr auto a_e_k_global_move_slice_window_step_hack = AGlobalMoveSliceWindowStepHacks{};
-        constexpr auto b_e_n_ho_wo_global_move_slice_window_step_hack =
-            BGlobalMoveSliceWindowStepHacks{};
-
-        // double regsiter buffer for b
-        StaticBuffer<AddressSpaceEnum::Vgpr,
-                     FloatAB,
-                     b_e_n_ho_wo_thread_desc.GetElementSpaceSize(),
-                     true>
-            b_thread_even_buf, b_thread_odd_buf;
-
-        // LDS double buffer: preload data
-        {
-            a_blockwise_copy.RunRead(a_e_k_global_desc, a_global_buf, a_e_k_global_step_hacks);
-
-            b_threadwise_transfer.Run(b_e_n_ho_wo_global_desc,
-                                      b_global_buf,
-                                      b_e_n_ho_wo_thread_desc,
-                                      make_tuple(I0, I0, I0, I0),
-                                      b_thread_even_buf,
-                                      b_e_n_ho_wo_global_step_hacks);
-
-            a_blockwise_copy.RunWrite(a_e_k_desc, a_block_buf);
-        }
-
-        __syncthreads();
-
-        if constexpr(HasMainKBlockLoop)
-        {
-            index_t e_block_data_begin = 0;
-
-            // LDS double buffer: main body
-            // use Do-While loop instead of For loop to simplify control flow
-            do
-            {
-                // even iteration
-                b_threadwise_transfer.MoveSrcSliceWindow(b_e_n_ho_wo_global_desc,
-                                                         b_thread_slice_copy_step);
-
-                b_threadwise_transfer.Run(b_e_n_ho_wo_global_desc,
-                                          b_global_buf,
-                                          b_e_n_ho_wo_thread_desc,
-                                          make_tuple(I0, I0, I0, I0),
-                                          b_thread_odd_buf,
-                                          b_e_n_ho_wo_global_step_hacks);
-
-                // LDS double buffer: GEMM on current data
-                // TODO: @Zhang Jing: blockwise gemm should be able to move slice window
-                blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf);
-
-                blockwise_gemm.MoveASliceWindow(a_e_k_block_desc, make_tuple(EPerBlock, 0));
-
-                b_threadwise_transfer.MoveSrcSliceWindow(b_e_n_ho_wo_global_desc,
-                                                         b_thread_slice_copy_step);
-
-                b_threadwise_transfer.Run(b_e_n_ho_wo_global_desc,
-                                          b_global_buf,
-                                          b_e_n_ho_wo_thread_desc,
-                                          make_tuple(I0, I0, I0, I0),
-                                          b_thread_even_buf,
-                                          b_e_n_ho_wo_global_step_hacks);
-
-                // LDS double buffer: GEMM on current data
-                blockwise_gemm.Run(a_block_buf, b_thread_odd_buf, c_thread_buf);
-
-                blockwise_gemm.MoveASliceWindow(a_e_k_block_desc, make_tuple(EPerBlock, 0));
-
-                e_block_data_begin += 2 * EPerBlock;
-
-            } while(e_block_data_begin < E - 2 * EPerBlock);
-        }
-
-        // LDS double buffer: tail
-        if constexpr(HasDoubleTailKBlockLoop) // if has 2 iteration left
-        {
-            b_threadwise_transfer.MoveSrcSliceWindow(b_e_n_ho_wo_global_desc,
-                                                     b_thread_slice_copy_step);
-
-            b_threadwise_transfer.Run(b_e_n_ho_wo_global_desc,
-                                      b_global_buf,
-                                      b_e_n_ho_wo_thread_desc,
-                                      make_tuple(I0, I0, I0, I0),
-                                      b_thread_odd_buf,
-                                      b_e_n_ho_wo_global_step_hacks);
-
-            // LDS double buffer: GEMM on 2nd-last data
-            blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf);
-
-            blockwise_gemm.MoveASliceWindow(a_e_k_block_desc, make_tuple(EPerBlock, 0));
-
-            // LDS double buffer: GEMM on last data
-            blockwise_gemm.Run(a_block_buf, b_thread_odd_buf, c_thread_buf);
-        }
-        else // if has 1 iteration left
-        {
-            // LDS double buffer: GEMM on last data
-            blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf);
-        }
-
-        // output: register to global memory
-        {
-            // hack to control index calculation when iterating over c_k_n_ho_wo_global tensor
-            constexpr auto c_k_n_ho_wo_global_tensor_step_hacks = CGlobalStepHacks{};
-
-            const index_t k_thread_data_on_global =
-                k_block_data_on_global + k_thread_id * KPerThread;
-
-            ThreadwiseTensorSliceTransfer_v1r3<FloatAcc,
-                                               FloatC,
-                                               decltype(c_k_n_ho_wo_thread_desc),
-                                               decltype(c_k_n_ho_wo_global_desc),
-                                               Sequence<KPerThread, 1, HoPerThread, WoPerThread>,
-                                               CThreadTransferSrcDstAccessOrder,
-                                               CThreadTransferSrcDstVectorDim,
-                                               CThreadTransferDstScalarPerVector,
-                                               CGlobalMemoryDataOperation,
-                                               1,
-                                               true>(
-                c_k_n_ho_wo_global_desc,
-                make_multi_index(
-                    k_thread_data_on_global, 0, ho_thread_data_on_global, wo_thread_data_on_global))
-                .Run(c_k_n_ho_wo_thread_desc,
-                     make_tuple(I0, I0, I0, I0),
-                     c_thread_buf,
-                     c_k_n_ho_wo_global_desc,
-                     c_global_buf,
-                     c_k_n_ho_wo_global_tensor_step_hacks);
-        }
-    }
-
-    // pass tensor descriptor by reference
-    template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
-    __device__ void Run(const AGlobalDesc& a_e_k_global_desc,
-                        const FloatAB* __restrict__ p_a_global,
-                        const BGlobalDesc& b_e_n_ho_wo_global_desc,
-                        const FloatAB* __restrict__ p_b_global,
-                        const CGlobalDesc& c_k_n_ho_wo_global_desc,
-                        FloatC* __restrict__ p_c_global,
-                        integral_constant<bool, HasMainKBlockLoop>,
-                        integral_constant<bool, HasDoubleTailKBlockLoop>) const
-    {
-        constexpr index_t shared_block_size = GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
-
-        __shared__ FloatAB p_shared_block[shared_block_size];
-
-        Run(a_e_k_global_desc,
-            p_a_global,
-            b_e_n_ho_wo_global_desc,
-            p_b_global,
-            c_k_n_ho_wo_global_desc,
-            p_c_global,
-            p_shared_block,
-            integral_constant<bool, HasMainKBlockLoop>{},
-            integral_constant<bool, HasDoubleTailKBlockLoop>{});
-    }
-
-    // pass tensor descriptors by their pointers
-    template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
-    __device__ void Run(const AGlobalDesc* p_a_e_k_global_desc,
-                        const FloatAB* __restrict__ p_a_global,
-                        const BGlobalDesc* p_b_e_n_ho_wo_global_desc,
-                        const FloatAB* __restrict__ p_b_global,
-                        const CGlobalDesc* p_c_k_n_ho_wo_global_desc,
-                        FloatC* __restrict__ p_c_global,
-                        integral_constant<bool, HasMainKBlockLoop>,
-                        integral_constant<bool, HasDoubleTailKBlockLoop>) const
-    {
-        const auto a_e_k_global_desc       = *p_a_e_k_global_desc;
-        const auto b_e_n_ho_wo_global_desc = *p_b_e_n_ho_wo_global_desc;
-        const auto c_k_n_ho_wo_global_desc = *p_c_k_n_ho_wo_global_desc;
-
-        Run(a_e_k_global_desc,
-            p_a_global,
-            b_e_n_ho_wo_global_desc,
-            p_b_global,
-            c_k_n_ho_wo_global_desc,
-            p_c_global,
-            integral_constant<bool, HasMainKBlockLoop>{},
-            integral_constant<bool, HasDoubleTailKBlockLoop>{});
-    }
-
-    // pass tensor descriptors by void*
-    template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
-    __device__ void Run(const void* p_a_e_k_global_desc,
-                        const FloatAB* __restrict__ p_a_global,
-                        const void* p_b_e_n_ho_wo_global_desc,
-                        const FloatAB* __restrict__ p_b_global,
-                        const void* p_c_k_n_ho_wo_global_desc,
-                        FloatC* __restrict__ p_c_global,
-                        integral_constant<bool, HasMainKBlockLoop>,
-                        integral_constant<bool, HasDoubleTailKBlockLoop>) const
-    {
-        const auto a_e_k_global_desc = *reinterpret_cast<const AGlobalDesc*>(p_a_e_k_global_desc);
-        const auto b_e_n_ho_wo_global_desc =
-            *reinterpret_cast<const BGlobalDesc*>(p_b_e_n_ho_wo_global_desc);
-        const auto c_k_n_ho_wo_global_desc =
-            *reinterpret_cast<const CGlobalDesc*>(p_c_k_n_ho_wo_global_desc);
-
-        Run(a_e_k_global_desc,
-            p_a_global,
-            b_e_n_ho_wo_global_desc,
-            p_b_global,
-            c_k_n_ho_wo_global_desc,
-            p_c_global,
-            integral_constant<bool, HasMainKBlockLoop>{},
-            integral_constant<bool, HasDoubleTailKBlockLoop>{});
-    }
-};
-
-} // namespace ck
-#endif
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v3.hpp
deleted file mode 100644
index ace844338..000000000
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v3.hpp
+++ /dev/null
@@ -1,1597 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#ifndef CK_GRIDWISE_GEMM_V3_HPP
-#define CK_GRIDWISE_GEMM_V3_HPP
-
-#include "common_header.hpp"
-#include "multi_index_transform_helper.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-#include "blockwise_tensor_slice_transfer.hpp"
-#include "threadwise_tensor_slice_transfer.hpp"
-#include "threadwise_tensor_slice_set.hpp"
-#include "blockwise_gemm_dlops_v3.hpp"
-
-namespace ck {
-
-template <typename GridwiseGemm,
-          typename FloatAB,
-          typename FloatC,
-          typename AGridDesc_E0_E1_K0_K1_E2,
-          typename BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2,
-          typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
-          typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
-          bool HasMainE0BlockLoop,
-          ActivTypeEnum ActivType>
-__global__ void
-#if CK_USE_LAUNCH_BOUNDS
-    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
-#endif
-        kernel_gemm_dlops_v3(
-            const FloatAB* __restrict__ p_a_grid,
-            const FloatAB* __restrict__ p_b_grid,
-            const FloatC* __restrict__ p_bias_grid,
-            FloatC* __restrict__ p_c_grid,
-            const AGridDesc_E0_E1_K0_K1_E2 a_e0_e1_k0_k1_e2_grid_desc,
-            const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2 b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
-            const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2 c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
-            const CBlockIdToBlockClusterAdaptor_K_N_H_W cblockid_to_k_n_h_w_block_cluster_adaptor)
-{
-    constexpr index_t shared_block_size =
-        GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
-
-    __shared__ FloatAB p_shared_block[shared_block_size];
-
-    GridwiseGemm::ConvBiasActiv(p_a_grid,
-                                p_b_grid,
-                                p_bias_grid,
-                                p_c_grid,
-                                p_shared_block,
-                                a_e0_e1_k0_k1_e2_grid_desc,
-                                b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
-                                c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
-                                cblockid_to_k_n_h_w_block_cluster_adaptor,
-                                integral_constant<bool, HasMainE0BlockLoop>{},
-                                integral_constant<ActivTypeEnum, ActivType>{});
-}
-
-template <typename GridwiseGemm,
-          typename FloatAB,
-          typename FloatC,
-          typename AGridDesc_E0_E1_K0_K1_E2,
-          typename BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2,
-          typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
-          typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx,
-          typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
-          bool HasMainE0BlockLoop,
-          ActivTypeEnum ActivType>
-__global__ void
-#if CK_USE_LAUNCH_BOUNDS
-    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
-#endif
-        kernel_gemm_dlops_v3_resize_add(
-            const FloatAB* __restrict__ p_a_grid,
-            const FloatAB* __restrict__ p_b_grid,
-            const FloatC* __restrict__ p_bias_grid,
-            FloatC* __restrict__ p_d_grid,
-            const AGridDesc_E0_E1_K0_K1_E2 a_e0_e1_k0_k1_e2_grid_desc,
-            const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2 b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
-            const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2 c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
-            const DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
-            const CBlockIdToBlockClusterAdaptor_K_N_H_W cblockid_to_k_n_h_w_block_cluster_adaptor)
-{
-    constexpr index_t shared_block_size =
-        GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
-
-    __shared__ FloatAB p_shared_block[shared_block_size];
-
-    GridwiseGemm::ConvBiasActivResizeAdd(p_a_grid,
-                                         p_b_grid,
-                                         p_bias_grid,
-                                         p_d_grid,
-                                         p_shared_block,
-                                         a_e0_e1_k0_k1_e2_grid_desc,
-                                         b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
-                                         c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
-                                         d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
-                                         cblockid_to_k_n_h_w_block_cluster_adaptor,
-                                         integral_constant<bool, HasMainE0BlockLoop>{},
-                                         integral_constant<ActivTypeEnum, ActivType>{});
-}
-
-template <typename GridwiseGemm,
-          typename FloatAB,
-          typename FloatC,
-          typename AGridDesc_E0_E1_K0_K1_E2,
-          typename BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2,
-          typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
-          typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx,
-          typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
-          bool HasMainE0BlockLoop,
-          ActivTypeEnum ActivType>
-__global__ void
-#if CK_USE_LAUNCH_BOUNDS
-    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
-#endif
-        kernel_gemm_dlops_v3_maxpool(
-            const FloatAB* __restrict__ p_a_grid,
-            const FloatAB* __restrict__ p_b_grid,
-            const FloatC* __restrict__ p_bias_grid,
-            FloatC* __restrict__ p_c_grid,
-            FloatC* __restrict__ p_d_grid,
-            const AGridDesc_E0_E1_K0_K1_E2 a_e0_e1_k0_k1_e2_grid_desc,
-            const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2 b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
-            const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2 c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
-            const DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
-            const CBlockIdToBlockClusterAdaptor_K_N_H_W cblockid_to_k_n_h_w_block_cluster_adaptor)
-{
-    constexpr index_t shared_block_size =
-        GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
-
-    __shared__ FloatAB p_shared_block[shared_block_size];
-
-    GridwiseGemm::ConvBiasActivMaxpool(p_a_grid,
-                                       p_b_grid,
-                                       p_bias_grid,
-                                       p_c_grid,
-                                       p_d_grid,
-                                       p_shared_block,
-                                       a_e0_e1_k0_k1_e2_grid_desc,
-                                       b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
-                                       c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
-                                       d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
-                                       cblockid_to_k_n_h_w_block_cluster_adaptor,
-                                       integral_constant<bool, HasMainE0BlockLoop>{},
-                                       integral_constant<ActivTypeEnum, ActivType>{});
-}
-
-template <index_t BlockSize,
-          typename FloatAB,
-          typename FloatAcc,
-          typename FloatC,
-          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
-          typename AGridDesc_E0_E1_K_E2,
-          typename BGridDesc_E0_E1_N_Ho_Wo_E2,
-          typename CGridDesc_K_N_Ho_Wo,
-          typename DGridDesc_K_N_Hx_Wx,
-          index_t E1_,
-          index_t E2_,
-          index_t K2_,
-          index_t KPerBlock,
-          index_t HoPerBlock,
-          index_t WoPerBlock,
-          index_t E1PerBlock,
-          index_t KPerThread,
-          index_t HoPerThread,
-          index_t WoPerThread,
-          index_t EPerThread,
-          typename ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
-          typename ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
-          typename ABlockTransferThreadClusterArrangeOrder,
-          typename ABlockTransferSrcAccessOrder,
-          index_t ABlockTransferSrcVectorDim,
-          index_t ABlockTransferSrcScalarPerVector,
-          index_t ABlockTransferDstScalarPerVector_E2,
-          bool AThreadTransferSrcResetCoordinateAfterRun,
-          typename BBlockTransferSrcAccessOrder,
-          index_t BBlockTransferSrcVectorDim,
-          index_t BBlockTransferSrcScalarPerVector,
-          bool BThreadTransferSrcResetCoordinateAfterRun,
-          typename CThreadTransferSrcDstAccessOrder,
-          index_t CThreadTransferSrcDstVectorDim,
-          index_t CThreadTransferDstScalarPerVector,
-          typename AGlobalStepHacks,
-          typename BGlobalStepHacks,
-          typename CGlobalStepHacks,
-          typename DGlobalStepHacks,
-          typename AGlobalMoveSliceWindowStepHacks,
-          typename BGlobalMoveSliceWindowStepHacks>
-struct GridwiseGemmDlops_km_kn_mn_v3
-{
-
-    static constexpr auto I0 = Number<0>{};
-    static constexpr auto I1 = Number<1>{};
-    static constexpr auto I2 = Number<2>{};
-    static constexpr auto I3 = Number<3>{};
-    static constexpr auto I4 = Number<4>{};
-    static constexpr auto I5 = Number<5>{};
-
-    static constexpr auto E1 = Number<E1_>{};
-    static constexpr auto E2 = Number<E2_>{};
-    static constexpr auto K2 = Number<K2_>{};
-
-    static constexpr auto NPerBlock = I1;
-
-    static constexpr FloatAcc alpha = 0.3;
-
-    __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
-    {
-        constexpr auto max_lds_align = Number<ABlockTransferDstScalarPerVector_E2>{};
-
-        // A matrix in LDS memory, dst of blockwise copy
-        //   be careful of LDS alignment
-        constexpr auto a_e0_e1_k1_e2_block_desc = make_naive_tensor_descriptor_aligned(
-            make_tuple(I1, Number<E1>{}, Number<KPerBlock>{}, Number<E2>{}), max_lds_align);
-
-        // LDS allocation for A and B: be careful of alignment
-        constexpr auto a_block_space_size = math::integer_least_multiple(
-            a_e0_e1_k1_e2_block_desc.GetElementSpaceSize(), max_lds_align);
-
-        return a_block_space_size * sizeof(FloatAB);
-    }
-
-    __host__ __device__ static constexpr index_t
-    CalculateGridSize(const CGridDesc_K_N_Ho_Wo& c_k_n_ho_wo_grid_desc)
-    {
-        const auto K  = c_k_n_ho_wo_grid_desc.GetLength(I0);
-        const auto N  = c_k_n_ho_wo_grid_desc.GetLength(I1);
-        const auto Ho = c_k_n_ho_wo_grid_desc.GetLength(I2);
-        const auto Wo = c_k_n_ho_wo_grid_desc.GetLength(I3);
-
-        const auto K0 = K / KPerBlock;
-        const auto N0 = N / NPerBlock;
-        const auto H0 = Ho / HoPerBlock;
-        const auto W0 = Wo / WoPerBlock;
-
-        const index_t grid_size = K0 * N0 * H0 * W0;
-
-        return grid_size;
-    }
-
-    __host__ __device__ static constexpr bool CalculateHasMainE0BlockLoop(const index_t E0)
-    {
-        const bool has_main_e0_block_loop = E0 > 1;
-
-        return has_main_e0_block_loop;
-    }
-
-    __host__ __device__ static constexpr bool CalculateHasMainE1BlockLoop()
-    {
-        const bool has_main_e1_block_loop = ((E1 + E1PerBlock) / (2 * E1PerBlock)) > 1;
-
-        return has_main_e1_block_loop;
-    }
-
-    __host__ __device__ static constexpr bool CalculateHasDoubleTailE1BlockLoop()
-    {
-        const bool has_double_tail_e1_block_loop = (E1 / E1PerBlock) % 2 == 0;
-
-        return has_double_tail_e1_block_loop;
-    }
-
-    __host__ __device__ static constexpr auto
-    MakeAE0E1K0K1E2GridDescriptor(const AGridDesc_E0_E1_K_E2& a_e0_e1_k_e2_grid_desc)
-    {
-        const auto E0 = a_e0_e1_k_e2_grid_desc.GetLength(I0);
-        const auto K  = a_e0_e1_k_e2_grid_desc.GetLength(I2);
-
-        const auto K1 = Number<KPerBlock>{};
-        const auto K0 = K / K1;
-
-        const auto a_e0_e1_k0_k1_e2_grid_desc = transform_tensor_descriptor(
-            a_e0_e1_k_e2_grid_desc,
-            make_tuple(make_pass_through_transform(E0),
-                       make_pass_through_transform(E1),
-                       make_unmerge_transform(make_tuple(K0, K1)),
-                       make_pass_through_transform(E2)),
-            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{}));
-
-        return a_e0_e1_k0_k1_e2_grid_desc;
-    }
-
-    __host__ __device__ static constexpr auto MakeBE0E1NH0H1H2W0W1W2E2GridDescriptor(
-        const BGridDesc_E0_E1_N_Ho_Wo_E2& b_e0_e1_n_ho_wo_e2_grid_desc)
-    {
-        const auto E0 = b_e0_e1_n_ho_wo_e2_grid_desc.GetLength(I0);
-        // const auto E1 = b_e0_e1_n_ho_wo_e2_grid_desc.GetLength(I1);
-        const auto N  = b_e0_e1_n_ho_wo_e2_grid_desc.GetLength(I2);
-        const auto Ho = b_e0_e1_n_ho_wo_e2_grid_desc.GetLength(I3);
-        const auto Wo = b_e0_e1_n_ho_wo_e2_grid_desc.GetLength(I4);
-        // const auto E2 = b_e0_e1_n_ho_wo_e2_grid_desc.GetLength(I5);
-
-        const auto H2 = Number<HoPerThread>{};
-        const auto H1 = Number<HoPerBlock / HoPerThread>{};
-        const auto H0 = Ho / (H1 * H2);
-
-        const auto W2 = Number<WoPerThread>{};
-        const auto W1 = Number<WoPerBlock / WoPerThread>{};
-        const auto W0 = Wo / (W1 * W2);
-
-        const auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc =
-            transform_tensor_descriptor(b_e0_e1_n_ho_wo_e2_grid_desc,
-                                        make_tuple(make_pass_through_transform(E0),
-                                                   make_pass_through_transform(E1),
-                                                   make_pass_through_transform(N),
-                                                   make_unmerge_transform(make_tuple(H0, H1, H2)),
-                                                   make_unmerge_transform(make_tuple(W0, W1, W2)),
-                                                   make_pass_through_transform(E2)),
-                                        make_tuple(Sequence<0>{},
-                                                   Sequence<1>{},
-                                                   Sequence<2>{},
-                                                   Sequence<3>{},
-                                                   Sequence<4>{},
-                                                   Sequence<5>{}),
-                                        make_tuple(Sequence<0>{},
-                                                   Sequence<1>{},
-                                                   Sequence<2>{},
-                                                   Sequence<3, 4, 5>{},
-                                                   Sequence<6, 7, 8>{},
-                                                   Sequence<9>{}));
-
-        return b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc;
-    }
-
-    __host__ __device__ static constexpr auto
-    MakeCK0K1NH0H1H2W0W1W2GridDescriptor(const CGridDesc_K_N_Ho_Wo& c_k_n_ho_wo_grid_desc)
-    {
-        const auto K  = c_k_n_ho_wo_grid_desc.GetLength(I0);
-        const auto N  = c_k_n_ho_wo_grid_desc.GetLength(I1);
-        const auto Ho = c_k_n_ho_wo_grid_desc.GetLength(I2);
-        const auto Wo = c_k_n_ho_wo_grid_desc.GetLength(I3);
-
-        const auto K1 = Number<KPerBlock>{};
-        const auto K0 = K / K1;
-
-        const auto H2 = Number<HoPerThread>{};
-        const auto H1 = Number<HoPerBlock / HoPerThread>{};
-        const auto H0 = Ho / (H1 * H2);
-
-        const auto W2 = Number<WoPerThread>{};
-        const auto W1 = Number<WoPerBlock / WoPerThread>{};
-        const auto W0 = Wo / (W1 * W2);
-
-        const auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc = transform_tensor_descriptor(
-            c_k_n_ho_wo_grid_desc,
-            make_tuple(make_unmerge_transform(make_tuple(K0, K1)),
-                       make_pass_through_transform(N),
-                       make_unmerge_transform(make_tuple(H0, H1, H2)),
-                       make_unmerge_transform(make_tuple(W0, W1, W2))),
-            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-            make_tuple(Sequence<0, 1>{}, Sequence<2>{}, Sequence<3, 4, 5>{}, Sequence<6, 7, 8>{}));
-
-        return c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc;
-    }
-
-    __host__ __device__ static constexpr auto
-    MakeDK0K1NH0H1HxW0W1WxGridDescriptorMaxPool(const DGridDesc_K_N_Hx_Wx& d_k_n_hx_wx_grid_desc)
-    {
-        const auto K  = d_k_n_hx_wx_grid_desc.GetLength(I0);
-        const auto N  = d_k_n_hx_wx_grid_desc.GetLength(I1);
-        const auto Hx = d_k_n_hx_wx_grid_desc.GetLength(I2);
-        const auto Wx = d_k_n_hx_wx_grid_desc.GetLength(I3);
-
-        const auto K1 = Number<KPerBlock>{};
-        const auto K0 = K / K1;
-
-#if CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
-        const auto H2 = Number<HoPerThread / 2>{};
-        const auto H1 = Number<HoPerBlock / HoPerThread>{};
-        const auto H0 = Number<Hx / (H1 * H2)>{};
-
-        const auto W2 = Number<WoPerThread / 2>{};
-        const auto W1 = Number<WoPerBlock / WoPerThread>{};
-        const auto W0 = Number<Wx / (W1 * W2)>{};
-#else
-        const auto H2 = HoPerThread / 2;
-        const auto H1 = HoPerBlock / HoPerThread;
-        const auto H0 = Hx / (H1 * H2);
-
-        const auto W2 = WoPerThread / 2;
-        const auto W1 = WoPerBlock / WoPerThread;
-        const auto W0 = Wx / (W1 * W2);
-#endif
-
-        const auto d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc = transform_tensor_descriptor(
-            d_k_n_hx_wx_grid_desc,
-            make_tuple(make_unmerge_transform(make_tuple(K0, K1)),
-                       make_pass_through_transform(N),
-                       make_unmerge_transform(make_tuple(H0, H1, H2)),
-                       make_unmerge_transform(make_tuple(W0, W1, W2))),
-            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-            make_tuple(Sequence<0, 1>{}, Sequence<2>{}, Sequence<3, 4, 5>{}, Sequence<6, 7, 8>{}));
-
-        return d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc;
-    }
-
-    __host__ __device__ static constexpr auto
-    MakeDK0K1NH0H1HxW0W1WxGridDescriptorResizeAdd(const DGridDesc_K_N_Hx_Wx& d_k_n_hx_wx_grid_desc)
-    {
-        const auto K  = d_k_n_hx_wx_grid_desc.GetLength(I0);
-        const auto N  = d_k_n_hx_wx_grid_desc.GetLength(I1);
-        const auto Hx = d_k_n_hx_wx_grid_desc.GetLength(I2);
-        const auto Wx = d_k_n_hx_wx_grid_desc.GetLength(I3);
-
-        const auto K1 = Number<KPerBlock>{};
-        const auto K0 = K / K1;
-
-        const auto H2 = Number<HoPerThread * 2>{};
-        const auto H1 = Number<HoPerBlock / HoPerThread>{};
-
-        const auto W2 = Number<WoPerThread * 2>{};
-        const auto W1 = Number<WoPerBlock / WoPerThread>{};
-
-#if CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
-        const auto H0 = Number<Hx / (H1 * H2)>{};
-        const auto W0 = Number<Wx / (W1 * W2)>{};
-#else
-        const auto H0 = Hx / (H1 * H2);
-        const auto W0 = Wx / (W1 * W2);
-#endif
-
-        const auto d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc = transform_tensor_descriptor(
-            d_k_n_hx_wx_grid_desc,
-            make_tuple(make_unmerge_transform(make_tuple(K0, K1)),
-                       make_pass_through_transform(N),
-                       make_unmerge_transform(make_tuple(H0, H1, H2)),
-                       make_unmerge_transform(make_tuple(W0, W1, W2))),
-            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-            make_tuple(Sequence<0, 1>{}, Sequence<2>{}, Sequence<3, 4, 5>{}, Sequence<6, 7, 8>{}));
-
-        return d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc;
-    }
-
-    __host__ __device__ static constexpr auto
-    MakeCBlockIdToKNHoWoBlockClusterAdaptor(const CGridDesc_K_N_Ho_Wo& c_k_n_ho_wo_grid_desc)
-    {
-        const auto K  = c_k_n_ho_wo_grid_desc.GetLength(I0);
-        const auto N  = c_k_n_ho_wo_grid_desc.GetLength(I1);
-        const auto Ho = c_k_n_ho_wo_grid_desc.GetLength(I2);
-        const auto Wo = c_k_n_ho_wo_grid_desc.GetLength(I3);
-
-#if CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
-        const auto K0 = Number<K / KPerBlock>{};
-        const auto N0 = Number<N / NPerBlock>{};
-        const auto H0 = Number<Ho / HoPerBlock>{};
-        const auto W0 = Number<Wo / WoPerBlock>{};
-#else
-        const auto K0 = K / KPerBlock;
-        const auto N0 = N / NPerBlock;
-        const auto H0 = Ho / HoPerBlock;
-        const auto W0 = Wo / WoPerBlock;
-#endif
-
-        const auto cblockid_to_k_n_ho_wo_block_cluster_adaptor = make_single_stage_tensor_adaptor(
-            make_tuple(make_merge_transform(make_tuple(K0, N0, H0, W0))),
-            make_tuple(Sequence<0, 1, 2, 3>{}),
-            make_tuple(Sequence<0>{}));
-
-        return cblockid_to_k_n_ho_wo_block_cluster_adaptor;
-    }
-
-    // using AGridDesc_E0_E1_K0_K1_E2 =
-    // decltype(MakeAE0E1K0K1E2GridDescriptor(AGridDesc_E0_E1_K_E2{}));
-    // using BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2 =
-    // decltype(MakeBE0E1NH0H1H2W0W1W2E2GridDescriptor(BGridDesc_E0_E1_N_Ho_Wo_E2{}));
-    // using CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2 =
-    // decltype(MakeCK0K1NH0H1H2W0W1W2GridDescriptor(CGridDesc_K_N_Ho_Wo{}));
-    // using DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx =
-    // decltype(MakeDK0K1NH0H1HxW0W1WxGridDescriptor(DGridDesc_K_N_Hx_Wx{}));
-
-    using CBlockIdToBlockClusterAdaptor_K_N_H_W =
-        decltype(MakeCBlockIdToKNHoWoBlockClusterAdaptor(CGridDesc_K_N_Ho_Wo{}));
-
-    template <typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>
-    __host__ __device__ static constexpr auto MakeBiasK0K1GridDescriptor(
-        const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2& c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc)
-    {
-        const auto K0 = c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.GetLength(I0);
-        const auto K1 = c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.GetLength(I1);
-
-        return make_naive_tensor_descriptor_packed(make_tuple(K0, K1));
-    }
-
-    __host__ __device__ static constexpr auto MakeCK1NH2W2ThreadDescriptor()
-    {
-        constexpr auto c_k1_n_h2_w2_thread_gemm_desc = make_naive_tensor_descriptor_packed(
-            make_tuple(Number<KPerThread>{}, I1, Number<HoPerThread>{}, Number<WoPerThread>{}));
-        return c_k1_n_h2_w2_thread_gemm_desc;
-    }
-
-    // using CThreadDesc_K1_N_H2_W2 = decltype(MakeCK1NH2W2ThreadDescriptor());
-
-    __host__ __device__ static constexpr auto GetBlockWiseGemm()
-    {
-        constexpr auto max_lds_align = Number<ABlockTransferDstScalarPerVector_E2>{};
-
-        constexpr auto a_e1_k1_e2_block_gemm_desc = make_naive_tensor_descriptor_aligned(
-            make_tuple(Number<E1PerBlock>{}, Number<KPerBlock>{}, Number<E2>{}), max_lds_align);
-
-        constexpr auto b_e1_n_h_w_e2_block_gemm_desc =
-            make_naive_tensor_descriptor_packed(make_tuple(Number<E1PerBlock>{},
-                                                           I1,
-                                                           Number<HoPerBlock>{},
-                                                           Number<WoPerBlock>{},
-                                                           Number<E2>{}));
-
-        constexpr auto c_k1_n_h2_w2_thread_gemm_desc = MakeCK1NH2W2ThreadDescriptor();
-
-        auto blockwise_gemm =
-            BlockwiseGemmDlops_km_kn_m0m1n0n1_v3<BlockSize,
-                                                 FloatAB,
-                                                 FloatAB,
-                                                 FloatAcc,
-                                                 decltype(a_e1_k1_e2_block_gemm_desc),
-                                                 decltype(b_e1_n_h_w_e2_block_gemm_desc),
-                                                 decltype(c_k1_n_h2_w2_thread_gemm_desc),
-                                                 EPerThread,
-                                                 K2>{};
-
-        return blockwise_gemm;
-    }
-
-    __device__ static constexpr auto GetCThreadIndex()
-    {
-        auto blockwise_gemm = GetBlockWiseGemm();
-        auto c_thread_mtx_index =
-            blockwise_gemm.GetBeginOfCThreadDesc_K_N_Ho_Wo(get_thread_local_1d_id());
-
-        return c_thread_mtx_index;
-    };
-
-    __device__ static constexpr auto GetCBlockIndex(
-        const CBlockIdToBlockClusterAdaptor_K_N_H_W& cblockid_to_k_n_h_w_block_cluster_adaptor)
-    {
-        const auto c_k_n_h_w_block_cluster_idx =
-            cblockid_to_k_n_h_w_block_cluster_adaptor.CalculateBottomIndex(
-                make_multi_index(get_block_1d_id()));
-        return c_k_n_h_w_block_cluster_idx;
-    }
-
-    template <typename BiasGlobalBuff,
-              typename CThreadBuff,
-              typename CBlockIndex,
-              typename CThreadIndex,
-              typename BiasGridDesc_K0_K1,
-              typename CThreadDesc_K1_N_H2_W2>
-    __device__ static void BiasOp(BiasGlobalBuff& bias_global_buf,
-                                  CThreadBuff& c_thread_buf,
-                                  const CBlockIndex& c_block_idx,
-                                  const CThreadIndex& c_thread_idx,
-                                  const BiasGridDesc_K0_K1& bias_k0_k1_grid_desc,
-                                  const CThreadDesc_K1_N_H2_W2&)
-
-    {
-        const index_t k_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I0]);
-
-        const auto k_thread_id = c_thread_idx[I0];
-
-        constexpr auto c_k1_n_h2_w2_thread_gemm_desc = CThreadDesc_K1_N_H2_W2{};
-
-        constexpr auto bias_k0_k1_thread_desc =
-            make_naive_tensor_descriptor_packed(make_tuple(I1, Number<KPerThread>{}));
-
-        StaticBuffer<AddressSpaceEnum::Vgpr,
-                     FloatC,
-                     bias_k0_k1_thread_desc.GetElementSpaceSize(),
-                     true>
-            bias_thread_buf;
-
-        const index_t k_thread_data_on_global = k_thread_id * KPerThread;
-
-        auto bias_threadwise_transfer =
-            ThreadwiseTensorSliceTransfer_v2<FloatC,
-                                             FloatC,
-                                             decltype(bias_k0_k1_grid_desc),
-                                             decltype(bias_k0_k1_thread_desc),
-                                             Sequence<I1, Number<KPerThread>{}>,
-                                             Sequence<0, 1>,
-                                             1,
-                                             CThreadTransferDstScalarPerVector,
-                                             false,
-                                             true>(
-                bias_k0_k1_grid_desc, make_multi_index(k_block_work_id, k_thread_data_on_global));
-
-        constexpr auto bias_k0_k1_global_tensor_step_hacks = make_tuple(
-            make_tuple(Sequence<0>{}, Sequence<0>{}), make_tuple(Sequence<0>{}, Sequence<0>{}));
-
-        bias_threadwise_transfer.Run(bias_k0_k1_grid_desc,
-                                     bias_global_buf,
-                                     bias_k0_k1_thread_desc,
-                                     make_tuple(I0, I0),
-                                     bias_thread_buf,
-                                     bias_k0_k1_global_tensor_step_hacks);
-
-        static_for<0, KPerThread, 1>{}([&](auto ki) {
-            static_for<0, HoPerThread, 1>{}([&](auto hi) {
-                static_for<0, WoPerThread, 1>{}([&](auto wi) {
-                    constexpr index_t c_offset =
-                        c_k1_n_h2_w2_thread_gemm_desc.CalculateOffset(make_tuple(ki, 0, hi, wi));
-                    c_thread_buf(Number<c_offset>{}) =
-                        c_thread_buf[Number<c_offset>{}] + bias_thread_buf[ki];
-                });
-            });
-        });
-    }
-
-    template <typename CThreadBuff, typename CThreadDesc_K1_N_H2_W2, ActivTypeEnum activ_type_>
-    __device__ static void Activation(CThreadBuff& c_thread_buf,
-                                      const CThreadDesc_K1_N_H2_W2&,
-                                      integral_constant<ActivTypeEnum, activ_type_>)
-    {
-        constexpr auto c_k1_n_h2_w2_thread_gemm_desc = CThreadDesc_K1_N_H2_W2{};
-
-        static_for<0, c_k1_n_h2_w2_thread_gemm_desc.GetElementSpaceSize(), 1>{}([&](auto i) {
-            if constexpr(activ_type_ == 1)
-            {
-                c_thread_buf(i) = c_thread_buf[i] >= 0 ? c_thread_buf[i] : alpha * c_thread_buf[i];
-            }
-            else if constexpr(activ_type_ == 2)
-            {
-                FloatAcc x = 1.0 + exp(-c_thread_buf[i]);
-
-                asm volatile("\n \
-                        v_rcp_f32 %0, %1 \n"
-                             : "=v"(x)
-                             : "0"(x));
-
-                c_thread_buf(i) = x;
-            }
-        });
-    }
-
-    template <typename CThreadBuff,
-              typename CGlobalBuff,
-              typename CBlockIndex,
-              typename CThreadIndex,
-              typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>
-    __device__ static void
-    WriteOut(const CThreadBuff& c_thread_buf,
-             CGlobalBuff& c_global_buf,
-             const CBlockIndex& c_block_idx,
-             const CThreadIndex& c_thread_idx,
-             const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2& c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc)
-    {
-        const index_t k_block_work_id  = __builtin_amdgcn_readfirstlane(c_block_idx[I0]);
-        const index_t n_block_work_id  = __builtin_amdgcn_readfirstlane(c_block_idx[I1]);
-        const index_t ho_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I2]);
-        const index_t wo_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I3]);
-
-        const auto k_thread_id  = c_thread_idx[I0];
-        const auto ho_thread_id = c_thread_idx[I2];
-        const auto wo_thread_id = c_thread_idx[I3];
-
-        // hack to control index calculation when iterating over c_k_n_h0_h1_h2_w0_w1_w2_global
-        // tensor
-        constexpr auto c_k_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks = CGlobalStepHacks{};
-
-        constexpr auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_thread_copy_desc =
-            make_naive_tensor_descriptor_packed(make_tuple(I1,
-                                                           Number<KPerThread>{},
-                                                           I1,
-                                                           I1,
-                                                           I1,
-                                                           Number<HoPerThread>{},
-                                                           I1,
-                                                           I1,
-                                                           Number<WoPerThread>{}));
-
-        const index_t k_thread_data_on_global = k_thread_id * KPerThread;
-
-        ThreadwiseTensorSliceTransfer_v1r3<
-            FloatAcc,
-            FloatC,
-            decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_thread_copy_desc),
-            decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc),
-            Sequence<I1, KPerThread, I1, I1, I1, HoPerThread, I1, I1, WoPerThread>,
-            CThreadTransferSrcDstAccessOrder,
-            CThreadTransferSrcDstVectorDim,
-            CThreadTransferDstScalarPerVector,
-            CGlobalMemoryDataOperation,
-            1,
-            true>(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
-                  make_multi_index(k_block_work_id,
-                                   k_thread_data_on_global,
-                                   n_block_work_id,
-                                   ho_block_work_id,
-                                   ho_thread_id,
-                                   0,
-                                   wo_block_work_id,
-                                   wo_thread_id,
-                                   0))
-            .Run(c_k0_k1_n_h0_h1_h2_w0_w1_w2_thread_copy_desc,
-                 make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0),
-                 c_thread_buf,
-                 c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
-                 c_global_buf,
-                 c_k_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks);
-    }
-
-    template <typename CThreadBuff,
-              typename DGlobalBuff,
-              typename CBlockIndex,
-              typename CThreadIndex,
-              typename CThreadDesc_K1_N_H2_W2,
-              typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx>
-    __device__ static void
-    MaxPool(const CThreadBuff& c_thread_buf,
-            DGlobalBuff& d_global_buf,
-            const CBlockIndex& c_block_idx,
-            const CThreadIndex& c_thread_idx,
-            const CThreadDesc_K1_N_H2_W2&,
-            const DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx& d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc)
-    {
-
-        const index_t k_block_work_id  = __builtin_amdgcn_readfirstlane(c_block_idx[I0]);
-        const index_t n_block_work_id  = __builtin_amdgcn_readfirstlane(c_block_idx[I1]);
-        const index_t ho_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I2]);
-        const index_t wo_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I3]);
-
-        const auto k_thread_id  = c_thread_idx[I0];
-        const auto ho_thread_id = c_thread_idx[I2];
-        const auto wo_thread_id = c_thread_idx[I3];
-
-        constexpr auto c_k1_n_h2_w2_thread_gemm_desc = CThreadDesc_K1_N_H2_W2{};
-
-        static_assert(HoPerThread % 2 == 0 && WoPerThread % 2 == 0, "");
-
-        constexpr auto HoPerThread_2 = HoPerThread / 2;
-        constexpr auto WoPerThread_2 = WoPerThread / 2;
-
-        constexpr auto d_k0_k1_n_h0_h1_hx_w0_w1_wx_thread_desc =
-            make_naive_tensor_descriptor_packed(make_tuple(I1,
-                                                           Number<KPerThread>{},
-                                                           I1,
-                                                           I1,
-                                                           I1,
-                                                           Number<HoPerThread_2>{},
-                                                           I1,
-                                                           I1,
-                                                           Number<WoPerThread_2>{}));
-
-        StaticBuffer<AddressSpaceEnum::Vgpr,
-                     FloatC,
-                     d_k0_k1_n_h0_h1_hx_w0_w1_wx_thread_desc.GetElementSpaceSize(),
-                     true>
-            d_thread_buf;
-
-        static_for<0, KPerThread, 1>{}([&](auto ki) {
-            static_for<0, HoPerThread_2, 1>{}([&](auto hi) {
-                static_for<0, WoPerThread_2, 1>{}([&](auto wi) {
-                    constexpr index_t d_offset =
-                        d_k0_k1_n_h0_h1_hx_w0_w1_wx_thread_desc.CalculateOffset(
-                            make_tuple(0, ki, 0, 0, 0, hi, 0, 0, wi));
-
-                    constexpr index_t c_offset_0 = c_k1_n_h2_w2_thread_gemm_desc.CalculateOffset(
-                        make_tuple(ki, 0, hi * 2, wi * 2));
-                    constexpr index_t c_offset_1 = c_k1_n_h2_w2_thread_gemm_desc.CalculateOffset(
-                        make_tuple(ki, 0, hi * 2, wi * 2 + 1));
-                    constexpr index_t c_offset_2 = c_k1_n_h2_w2_thread_gemm_desc.CalculateOffset(
-                        make_tuple(ki, 0, hi * 2 + 1, wi * 2));
-                    constexpr index_t c_offset_3 = c_k1_n_h2_w2_thread_gemm_desc.CalculateOffset(
-                        make_tuple(ki, 0, hi * 2 + 1, wi * 2 + 1));
-
-                    d_thread_buf(Number<d_offset>{}) = c_thread_buf[Number<c_offset_0>{}];
-                    d_thread_buf(Number<d_offset>{}) =
-                        fmaxf(c_thread_buf[Number<c_offset_1>{}], d_thread_buf(Number<d_offset>{}));
-                    d_thread_buf(Number<d_offset>{}) =
-                        fmaxf(c_thread_buf[Number<c_offset_2>{}], d_thread_buf(Number<d_offset>{}));
-                    d_thread_buf(Number<d_offset>{}) =
-                        fmax(c_thread_buf[Number<c_offset_3>{}], d_thread_buf(Number<d_offset>{}));
-                });
-            });
-        });
-
-        const index_t k_thread_data_on_global = k_thread_id * KPerThread;
-
-        constexpr auto d_k_n_h0_h1_hx_w0_w1_wx_global_tensor_step_hacks = DGlobalStepHacks{};
-
-        ThreadwiseTensorSliceTransfer_v1r3<
-            FloatC,
-            FloatC,
-            decltype(d_k0_k1_n_h0_h1_hx_w0_w1_wx_thread_desc),
-            decltype(d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc),
-            Sequence<I1, KPerThread, I1, I1, I1, HoPerThread_2, I1, I1, WoPerThread_2>,
-            CThreadTransferSrcDstAccessOrder,
-            CThreadTransferSrcDstVectorDim,
-            CThreadTransferDstScalarPerVector,
-            InMemoryDataOperationEnum::Set,
-            1,
-            true>(d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
-                  make_multi_index(k_block_work_id,
-                                   k_thread_data_on_global,
-                                   n_block_work_id,
-                                   ho_block_work_id,
-                                   ho_thread_id,
-                                   0,
-                                   wo_block_work_id,
-                                   wo_thread_id,
-                                   0))
-            .Run(d_k0_k1_n_h0_h1_hx_w0_w1_wx_thread_desc,
-                 make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0),
-                 d_thread_buf,
-                 d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
-                 d_global_buf,
-                 d_k_n_h0_h1_hx_w0_w1_wx_global_tensor_step_hacks);
-    }
-
-    template <typename CThreadBuff,
-              typename DGlobalBuff,
-              typename CBlockIndex,
-              typename CThreadIndex,
-              typename CThreadDesc_K1_N_H2_W2,
-              typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx>
-    __device__ static void
-    ResizeAdd(const CThreadBuff& c_thread_buf,
-              DGlobalBuff& d_global_buf,
-              const CBlockIndex& c_block_idx,
-              const CThreadIndex& c_thread_idx,
-              const CThreadDesc_K1_N_H2_W2&,
-              const DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx& d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc)
-    {
-
-        const index_t k_block_work_id  = __builtin_amdgcn_readfirstlane(c_block_idx[I0]);
-        const index_t n_block_work_id  = __builtin_amdgcn_readfirstlane(c_block_idx[I1]);
-        const index_t ho_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I2]);
-        const index_t wo_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I3]);
-
-        const auto k_thread_id  = c_thread_idx[I0];
-        const auto ho_thread_id = c_thread_idx[I2];
-        const auto wo_thread_id = c_thread_idx[I3];
-
-        constexpr auto c_k1_n_h2_w2_thread_gemm_desc = CThreadDesc_K1_N_H2_W2{};
-
-        constexpr auto HoPerThreadx2 = HoPerThread * 2;
-        constexpr auto WoPerThreadx2 = WoPerThread * 2;
-
-        constexpr auto d_k0_k1_n_h0_h1_hx_w0_w1_wx_thread_desc =
-            make_naive_tensor_descriptor_packed(make_tuple(I1,
-                                                           Number<KPerThread>{},
-                                                           I1,
-                                                           I1,
-                                                           I1,
-                                                           Number<HoPerThreadx2>{},
-                                                           I1,
-                                                           I1,
-                                                           Number<WoPerThreadx2>{}));
-
-        StaticBuffer<AddressSpaceEnum::Vgpr,
-                     FloatC,
-                     d_k0_k1_n_h0_h1_hx_w0_w1_wx_thread_desc.GetElementSpaceSize(),
-                     true>
-            d_thread_buf;
-
-        static_for<0, KPerThread, 1>{}([&](auto k_i) {
-            static_for<0, HoPerThreadx2, 1>{}([&](auto h_i) {
-                static_for<0, WoPerThreadx2, 1>{}([&](auto w_i) {
-                    d_thread_buf(Number<d_k0_k1_n_h0_h1_hx_w0_w1_wx_thread_desc.CalculateOffset(
-                                     make_tuple(0, k_i, 0, 0, 0, h_i, 0, 0, w_i))>{}) =
-                        c_thread_buf[Number<c_k1_n_h2_w2_thread_gemm_desc.CalculateOffset(
-                            make_tuple(k_i, 0, h_i / 2, w_i / 2))>{}];
-                });
-            });
-        });
-
-        // hack to control index calculation when iterating over d_k_n_ho_wo_global tensor
-        constexpr auto d_k_n_h0_h1_hx_w0_w1_wx_global_tensor_step_hacks = DGlobalStepHacks{};
-
-        const index_t k_thread_data_on_global = k_thread_id * KPerThread;
-
-        ThreadwiseTensorSliceTransfer_v1r3<
-            FloatC,
-            FloatC,
-            decltype(d_k0_k1_n_h0_h1_hx_w0_w1_wx_thread_desc),
-            decltype(d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc),
-            Sequence<I1, KPerThread, I1, I1, I1, HoPerThreadx2, I1, I1, WoPerThreadx2>,
-            CThreadTransferSrcDstAccessOrder,
-            CThreadTransferSrcDstVectorDim,
-            CThreadTransferDstScalarPerVector,
-            InMemoryDataOperationEnum::Add,
-            1,
-            true>(d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
-                  make_multi_index(k_block_work_id,
-                                   k_thread_data_on_global,
-                                   n_block_work_id,
-                                   ho_block_work_id,
-                                   ho_thread_id,
-                                   0,
-                                   wo_block_work_id,
-                                   wo_thread_id,
-                                   0))
-            .Run(d_k0_k1_n_h0_h1_hx_w0_w1_wx_thread_desc,
-                 make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0),
-                 d_thread_buf,
-                 d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
-                 d_global_buf,
-                 d_k_n_h0_h1_hx_w0_w1_wx_global_tensor_step_hacks);
-    }
-
-    template <typename AGlobalBuff,
-              typename BGlobalBuff,
-              typename CThreadBuff,
-              typename CBlockIndex,
-              typename CThreadIndex,
-              typename AGridDesc_E0_E1_K0_K1_E2,
-              typename BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2,
-              typename CThreadDesc_K1_N_H2_W2,
-              bool HasMainE0BlockLoop>
-    __device__ static void
-    GemmOp(const AGlobalBuff& a_global_buf,
-           const BGlobalBuff& b_global_buf,
-           CThreadBuff& c_thread_buf,
-           FloatAB* __restrict__ p_shared_block,
-           const CBlockIndex& c_block_idx,
-           const CThreadIndex& c_thread_idx,
-           const AGridDesc_E0_E1_K0_K1_E2& a_e0_e1_k0_k1_e2_grid_desc,
-           const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2& b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
-           const CThreadDesc_K1_N_H2_W2&,
-           integral_constant<bool, HasMainE0BlockLoop>)
-    {
-        constexpr auto HasMainE1BlockLoop       = CalculateHasMainE1BlockLoop();
-        constexpr auto HasDoubleTailE1BlockLoop = CalculateHasDoubleTailE1BlockLoop();
-
-        // const auto c_k_n_h_w_block_cluster_idx =
-        // GetCBlockIndex(cblockid_to_k_n_h_w_block_cluster_adaptor);
-        // cblockid_to_k_n_h_w_block_cluster_adaptor.CalculateBottomIndex(
-        // make_multi_index(get_block_1d_id()));
-
-        const index_t k_block_work_id  = __builtin_amdgcn_readfirstlane(c_block_idx[I0]);
-        const index_t n_block_work_id  = __builtin_amdgcn_readfirstlane(c_block_idx[I1]);
-        const index_t ho_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I2]);
-        const index_t wo_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I3]);
-
-        constexpr auto max_lds_align = Number<ABlockTransferDstScalarPerVector_E2>{};
-
-        constexpr auto a_e1_k1_e2_block_gemm_desc = make_naive_tensor_descriptor_aligned(
-            make_tuple(Number<E1PerBlock>{}, Number<KPerBlock>{}, Number<E2>{}), max_lds_align);
-
-        constexpr auto b_e1_n_h_w_e2_block_gemm_desc =
-            make_naive_tensor_descriptor_packed(make_tuple(Number<E1PerBlock>{},
-                                                           I1,
-                                                           Number<HoPerBlock>{},
-                                                           Number<WoPerBlock>{},
-                                                           Number<E2>{}));
-
-        constexpr auto c_k1_n_h2_w2_thread_gemm_desc = CThreadDesc_K1_N_H2_W2{};
-
-        auto blockwise_gemm =
-            BlockwiseGemmDlops_km_kn_m0m1n0n1_v3<BlockSize,
-                                                 FloatAB,
-                                                 FloatAB,
-                                                 FloatAcc,
-                                                 decltype(a_e1_k1_e2_block_gemm_desc),
-                                                 decltype(b_e1_n_h_w_e2_block_gemm_desc),
-                                                 decltype(c_k1_n_h2_w2_thread_gemm_desc),
-                                                 EPerThread,
-                                                 K2>{};
-        // blockwise_gemm.GetBeginOfCThreadDesc_K_N_Ho_Wo(get_thread_local_1d_id());
-
-        const auto ho_thread_id = c_thread_idx[I2];
-        const auto wo_thread_id = c_thread_idx[I3];
-
-        constexpr auto a_e0_e1_k0_k1_e2_block_copy_desc = make_naive_tensor_descriptor_aligned(
-            make_tuple(Number<I1>{}, Number<E1>{}, I1, Number<KPerBlock>{}, Number<E2>{}),
-            max_lds_align);
-
-        // A matrix blockwise copy
-        auto a_blockwise_copy =
-            BlockwiseTensorSliceTransfer_v4<BlockSize,
-                                            InMemoryDataOperationEnum::Set,
-                                            Sequence<I1, E1, I1, KPerBlock, E2>,
-                                            ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
-                                            ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
-                                            ABlockTransferThreadClusterArrangeOrder,
-                                            FloatAB,
-                                            FloatAB,
-                                            decltype(a_e0_e1_k0_k1_e2_grid_desc),
-                                            decltype(a_e0_e1_k0_k1_e2_block_copy_desc),
-                                            ABlockTransferSrcAccessOrder,
-                                            Sequence<0, 1, 2, 3, 4>,
-                                            ABlockTransferSrcVectorDim,
-                                            4,
-                                            ABlockTransferSrcScalarPerVector,
-                                            ABlockTransferDstScalarPerVector_E2,
-                                            1,
-                                            1,
-                                            AThreadTransferSrcResetCoordinateAfterRun,
-                                            false>(a_e0_e1_k0_k1_e2_grid_desc,
-                                                   make_multi_index(0, 0, k_block_work_id, 0, 0),
-                                                   a_e0_e1_k0_k1_e2_block_copy_desc,
-                                                   make_multi_index(0, 0, 0, 0, 0));
-
-        constexpr auto a_block_slice_copy_step = make_multi_index(I1, 0, 0, 0, 0);
-
-        constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc =
-            make_naive_tensor_descriptor_packed(make_tuple(I1,
-                                                           Number<E1PerBlock>{},
-                                                           I1,
-                                                           I1,
-                                                           I1,
-                                                           Number<HoPerThread>{},
-                                                           I1,
-                                                           I1,
-                                                           Number<WoPerThread>{},
-                                                           Number<E2>{}));
-
-        auto b_threadwise_transfer = ThreadwiseTensorSliceTransfer_v2<
-            FloatAB,
-            FloatAB,
-            decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc),
-            decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc),
-            Sequence<I1, E1PerBlock, I1, I1, I1, HoPerThread, I1, I1, WoPerThread, E2>,
-            BBlockTransferSrcAccessOrder,
-            BBlockTransferSrcVectorDim,
-            BBlockTransferSrcScalarPerVector,
-            BThreadTransferSrcResetCoordinateAfterRun,
-            true>(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
-                  make_multi_index(0,
-                                   0,
-                                   n_block_work_id,
-                                   ho_block_work_id,
-                                   ho_thread_id,
-                                   0,
-                                   wo_block_work_id,
-                                   wo_thread_id,
-                                   0,
-                                   0));
-
-        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            p_shared_block, a_e0_e1_k0_k1_e2_block_copy_desc.GetElementSpaceSize());
-
-        //// register allocation for output
-        // StaticBuffer<AddressSpaceEnum::Vgpr,
-        // FloatAcc,
-        // c_k1_n_h2_w2_thread_gemm_desc.GetElementSpaceSize(),
-        // true>
-        // c_thread_buf;
-
-        // initialize output thread tensor
-        ThreadwiseTensorSliceSet_v1<FloatAcc,
-                                    decltype(c_k1_n_h2_w2_thread_gemm_desc),
-                                    Sequence<KPerThread, I1, HoPerThread, WoPerThread>>{}
-            .Run(c_k1_n_h2_w2_thread_gemm_desc,
-                 make_tuple(I0, I0, I0, I0),
-                 c_thread_buf,
-                 FloatAcc{0});
-
-        constexpr auto b_thread_slice_copy_step =
-            make_multi_index(0, E1PerBlock, 0, 0, 0, 0, 0, 0, 0, 0);
-
-        // hack to control index calculation when iterating over A and B matrix for threadwise copy
-        constexpr auto a_e0_e1_k_e2_global_step_hacks                   = AGlobalStepHacks{};
-        constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks = BGlobalStepHacks{};
-
-        // double regsiter buffer for b
-        StaticBuffer<AddressSpaceEnum::Vgpr,
-                     FloatAB,
-                     b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc.GetElementSpaceSize(),
-                     true>
-            b_thread_even_buf, b_thread_odd_buf;
-
-        if constexpr(HasMainE0BlockLoop)
-        {
-            const auto E0 = b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetLength(I0);
-
-            index_t e0_block_data_begin = 0;
-
-            do
-            {
-                // LDS double buffer: preload data
-                {
-                    a_blockwise_copy.RunRead(
-                        a_e0_e1_k0_k1_e2_grid_desc, a_global_buf, a_e0_e1_k_e2_global_step_hacks);
-
-                    b_threadwise_transfer.Run(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
-                                              b_global_buf,
-                                              b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc,
-                                              make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
-                                              b_thread_even_buf,
-                                              b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks);
-
-                    a_blockwise_copy.RunWrite(a_e0_e1_k0_k1_e2_block_copy_desc, a_block_buf);
-                }
-
-                __syncthreads();
-
-                if constexpr(HasMainE1BlockLoop)
-                {
-                    index_t e1_block_data_begin = 0;
-
-                    // LDS double buffer: main body
-                    // use Do-While loop instead of For loop to simplify control flow
-                    do
-                    {
-                        // even iteration
-                        b_threadwise_transfer.MoveSrcSliceWindow(
-                            b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
-                            b_thread_slice_copy_step,
-                            BGlobalMoveSliceWindowStepHacks{});
-
-                        b_threadwise_transfer.Run(
-                            b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
-                            b_global_buf,
-                            b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc,
-                            make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
-                            b_thread_odd_buf,
-                            b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks);
-
-                        // LDS double buffer: GEMM on current data
-                        blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf);
-
-                        blockwise_gemm.MoveABlockSliceWindow(make_tuple(E1PerBlock, 0, 0));
-
-                        b_threadwise_transfer.MoveSrcSliceWindow(
-                            b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
-                            b_thread_slice_copy_step,
-                            BGlobalMoveSliceWindowStepHacks{});
-
-                        b_threadwise_transfer.Run(
-                            b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
-                            b_global_buf,
-                            b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc,
-                            make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
-                            b_thread_even_buf,
-                            b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks);
-
-                        // LDS double buffer: GEMM on current data
-                        blockwise_gemm.Run(a_block_buf, b_thread_odd_buf, c_thread_buf);
-
-                        blockwise_gemm.MoveABlockSliceWindow(make_tuple(E1PerBlock, 0, 0));
-
-                        e1_block_data_begin += 2 * E1PerBlock;
-
-                    } while(e1_block_data_begin < E1 - 2 * E1PerBlock);
-                }
-
-                // LDS double buffer: tail
-                if constexpr(HasDoubleTailE1BlockLoop) // if has 2 iteration left
-                {
-                    b_threadwise_transfer.MoveSrcSliceWindow(
-                        b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
-                        b_thread_slice_copy_step,
-                        BGlobalMoveSliceWindowStepHacks{});
-
-                    b_threadwise_transfer.Run(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
-                                              b_global_buf,
-                                              b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc,
-                                              make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
-                                              b_thread_odd_buf,
-                                              b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks);
-
-                    // LDS double buffer: GEMM on 2nd-last data
-                    blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf);
-
-                    blockwise_gemm.MoveABlockSliceWindow(make_tuple(E1PerBlock, 0, 0));
-
-                    // LDS double buffer: GEMM on last data
-                    blockwise_gemm.Run(a_block_buf, b_thread_odd_buf, c_thread_buf);
-                }
-                else // if has 1 iteration left
-                {
-                    // LDS double buffer: GEMM on last data
-                    blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf);
-                }
-
-                a_blockwise_copy.MoveSrcSliceWindow(a_e0_e1_k0_k1_e2_grid_desc,
-                                                    a_block_slice_copy_step,
-                                                    AGlobalMoveSliceWindowStepHacks{});
-
-                blockwise_gemm.MoveABlockSliceWindow(make_tuple(-(E1 - E1PerBlock), 0, 0));
-
-                b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
-                                                         b_thread_slice_copy_step,
-                                                         BGlobalMoveSliceWindowStepHacks{});
-
-                e0_block_data_begin += 1;
-
-            } while(e0_block_data_begin < E0);
-        }
-        else
-        {
-            // LDS double buffer: preload data
-            {
-                a_blockwise_copy.RunRead(
-                    a_e0_e1_k0_k1_e2_grid_desc, a_global_buf, a_e0_e1_k_e2_global_step_hacks);
-
-                b_threadwise_transfer.Run(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
-                                          b_global_buf,
-                                          b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc,
-                                          make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
-                                          b_thread_even_buf,
-                                          b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks);
-
-                a_blockwise_copy.RunWrite(a_e0_e1_k0_k1_e2_block_copy_desc, a_block_buf);
-            }
-
-            __syncthreads();
-
-            if constexpr(HasMainE1BlockLoop)
-            {
-                index_t e1_block_data_begin = 0;
-
-                // LDS double buffer: main body
-                // use Do-While loop instead of For loop to simplify control flow
-                do
-                {
-                    // even iteration
-                    b_threadwise_transfer.MoveSrcSliceWindow(
-                        b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
-                        b_thread_slice_copy_step,
-                        BGlobalMoveSliceWindowStepHacks{});
-
-                    b_threadwise_transfer.Run(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
-                                              b_global_buf,
-                                              b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc,
-                                              make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
-                                              b_thread_odd_buf,
-                                              b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks);
-
-                    // LDS double buffer: GEMM on current data
-                    blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf);
-
-                    blockwise_gemm.MoveABlockSliceWindow(make_tuple(E1PerBlock, 0, 0));
-
-                    b_threadwise_transfer.MoveSrcSliceWindow(
-                        b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
-                        b_thread_slice_copy_step,
-                        BGlobalMoveSliceWindowStepHacks{});
-
-                    b_threadwise_transfer.Run(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
-                                              b_global_buf,
-                                              b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc,
-                                              make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
-                                              b_thread_even_buf,
-                                              b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks);
-
-                    // LDS double buffer: GEMM on current data
-                    blockwise_gemm.Run(a_block_buf, b_thread_odd_buf, c_thread_buf);
-
-                    blockwise_gemm.MoveABlockSliceWindow(make_tuple(E1PerBlock, 0, 0));
-
-                    e1_block_data_begin += 2 * E1PerBlock;
-
-                } while(e1_block_data_begin < E1 - 2 * E1PerBlock);
-            }
-
-            // LDS double buffer: tail
-            if constexpr(HasDoubleTailE1BlockLoop) // if has 2 iteration left
-            {
-                b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
-                                                         b_thread_slice_copy_step,
-                                                         BGlobalMoveSliceWindowStepHacks{});
-
-                b_threadwise_transfer.Run(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
-                                          b_global_buf,
-                                          b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc,
-                                          make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
-                                          b_thread_odd_buf,
-                                          b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks);
-
-                // LDS double buffer: GEMM on 2nd-last data
-                blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf);
-
-                blockwise_gemm.MoveABlockSliceWindow(make_tuple(E1PerBlock, 0, 0));
-
-                // LDS double buffer: GEMM on last data
-                blockwise_gemm.Run(a_block_buf, b_thread_odd_buf, c_thread_buf);
-            }
-            else // if has 1 iteration left
-            {
-                // LDS double buffer: GEMM on last data
-                blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf);
-            }
-        }
-    }
-
-    template <typename AGridDesc_E0_E1_K0_K1_E2,
-              typename BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2,
-              typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
-              typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx,
-              typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
-              bool HasMainE0BlockLoop>
-    __device__ static void
-    Conv(const FloatAB* __restrict__ p_a_global,
-         const FloatAB* __restrict__ p_b_global,
-         const FloatC* __restrict__ p_bias_global,
-         FloatC* __restrict__ p_c_global,
-         FloatC* __restrict__ p_d_global,
-         FloatAB* __restrict__ p_shared_block,
-         const AGridDesc_E0_E1_K0_K1_E2& a_e0_e1_k0_k1_e2_grid_desc,
-         const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2& b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
-         const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2& c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
-         const DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx& d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
-         const CBlockIdToBlockClusterAdaptor_K_N_H_W& cblockid_to_k_n_h_w_block_cluster_adaptor,
-         integral_constant<bool, HasMainE0BlockLoop>)
-    {
-        const auto bias_k0_k1_grid_desc =
-            MakeBiasK0K1GridDescriptor(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);
-
-        const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_a_global, a_e0_e1_k0_k1_e2_grid_desc.GetElementSpaceSize());
-        const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_b_global, b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetElementSpaceSize());
-        auto c_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_c_global, c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.GetElementSpaceSize());
-        auto d_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_d_global, d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc.GetElementSpaceSize());
-        auto bias_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_bias_global, bias_k0_k1_grid_desc.GetElementSpaceSize());
-
-        constexpr auto c_k1_n_h2_w2_thread_gemm_desc = MakeCK1NH2W2ThreadDescriptor();
-
-        // register allocation for output
-        StaticBuffer<AddressSpaceEnum::Vgpr,
-                     FloatAcc,
-                     c_k1_n_h2_w2_thread_gemm_desc.GetElementSpaceSize(),
-                     true>
-            c_thread_buf;
-
-        const auto c_k_n_h_w_block_cluster_idx =
-            GetCBlockIndex(cblockid_to_k_n_h_w_block_cluster_adaptor);
-
-        const auto c_thread_mtx_index = GetCThreadIndex();
-
-        // GemmOp
-        GemmOp(a_global_buf,
-               b_global_buf,
-               c_thread_buf,
-               p_shared_block,
-               c_k_n_h_w_block_cluster_idx,
-               c_thread_mtx_index,
-               a_e0_e1_k0_k1_e2_grid_desc,
-               b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
-               c_k1_n_h2_w2_thread_gemm_desc,
-               integral_constant<bool, HasMainE0BlockLoop>{});
-
-        // Output
-        WriteOut(c_thread_buf,
-                 c_global_buf,
-                 c_k_n_h_w_block_cluster_idx,
-                 c_thread_mtx_index,
-                 c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);
-    }
-
-    template <typename AGridDesc_E0_E1_K0_K1_E2,
-              typename BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2,
-              typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
-              typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
-              bool HasMainE0BlockLoop,
-              ActivTypeEnum ActivType>
-    __device__ static void ConvBiasActiv(
-        const FloatAB* __restrict__ p_a_global,
-        const FloatAB* __restrict__ p_b_global,
-        const FloatC* __restrict__ p_bias_global,
-        FloatC* __restrict__ p_c_global,
-        FloatAB* __restrict__ p_shared_block,
-        const AGridDesc_E0_E1_K0_K1_E2& a_e0_e1_k0_k1_e2_grid_desc,
-        const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2& b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
-        const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2& c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
-        const CBlockIdToBlockClusterAdaptor_K_N_H_W& cblockid_to_k_n_h_w_block_cluster_adaptor,
-        integral_constant<bool, HasMainE0BlockLoop>,
-        integral_constant<ActivTypeEnum, ActivType>)
-    {
-        static constexpr auto activ_type = integral_constant<ActivTypeEnum, ActivType>{};
-
-        const auto bias_k0_k1_grid_desc =
-            MakeBiasK0K1GridDescriptor(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);
-
-        const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_a_global, a_e0_e1_k0_k1_e2_grid_desc.GetElementSpaceSize());
-        const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_b_global, b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetElementSpaceSize());
-        auto c_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_c_global, c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.GetElementSpaceSize());
-        auto bias_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_bias_global, bias_k0_k1_grid_desc.GetElementSpaceSize());
-
-        constexpr auto c_k1_n_h2_w2_thread_gemm_desc = MakeCK1NH2W2ThreadDescriptor();
-
-        // register allocation for output
-        StaticBuffer<AddressSpaceEnum::Vgpr,
-                     FloatAcc,
-                     c_k1_n_h2_w2_thread_gemm_desc.GetElementSpaceSize(),
-                     true>
-            c_thread_buf;
-
-        const auto c_k_n_h_w_block_cluster_idx =
-            GetCBlockIndex(cblockid_to_k_n_h_w_block_cluster_adaptor);
-
-        const auto c_thread_mtx_index = GetCThreadIndex();
-
-        // GemmOp
-        GemmOp(a_global_buf,
-               b_global_buf,
-               c_thread_buf,
-               p_shared_block,
-               c_k_n_h_w_block_cluster_idx,
-               c_thread_mtx_index,
-               a_e0_e1_k0_k1_e2_grid_desc,
-               b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
-               c_k1_n_h2_w2_thread_gemm_desc,
-               integral_constant<bool, HasMainE0BlockLoop>{});
-
-        // Bias
-        BiasOp(bias_global_buf,
-               c_thread_buf,
-               c_k_n_h_w_block_cluster_idx,
-               c_thread_mtx_index,
-               bias_k0_k1_grid_desc,
-               c_k1_n_h2_w2_thread_gemm_desc);
-
-        // Activ
-        Activation(c_thread_buf, c_k1_n_h2_w2_thread_gemm_desc, activ_type);
-
-        // Output
-        WriteOut(c_thread_buf,
-                 c_global_buf,
-                 c_k_n_h_w_block_cluster_idx,
-                 c_thread_mtx_index,
-                 c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);
-    }
-
-    template <typename AGridDesc_E0_E1_K0_K1_E2,
-              typename BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2,
-              typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
-              typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx,
-              typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
-              bool HasMainE0BlockLoop,
-              ActivTypeEnum ActivType>
-    __device__ static void ConvBiasActivMaxpool(
-        const FloatAB* __restrict__ p_a_global,
-        const FloatAB* __restrict__ p_b_global,
-        const FloatC* __restrict__ p_bias_global,
-        FloatC* __restrict__ p_c_global,
-        FloatC* __restrict__ p_d_global,
-        FloatAB* __restrict__ p_shared_block,
-        const AGridDesc_E0_E1_K0_K1_E2& a_e0_e1_k0_k1_e2_grid_desc,
-        const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2& b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
-        const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2& c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
-        const DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx& d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
-        const CBlockIdToBlockClusterAdaptor_K_N_H_W& cblockid_to_k_n_h_w_block_cluster_adaptor,
-        integral_constant<bool, HasMainE0BlockLoop>,
-        integral_constant<ActivTypeEnum, ActivType>)
-    {
-        static constexpr auto activ_type = integral_constant<ActivTypeEnum, ActivType>{};
-
-        const auto bias_k0_k1_grid_desc =
-            MakeBiasK0K1GridDescriptor(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);
-
-        const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_a_global, a_e0_e1_k0_k1_e2_grid_desc.GetElementSpaceSize());
-        const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_b_global, b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetElementSpaceSize());
-        auto c_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_c_global, c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.GetElementSpaceSize());
-        auto d_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_d_global, d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc.GetElementSpaceSize());
-        auto bias_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_bias_global, bias_k0_k1_grid_desc.GetElementSpaceSize());
-
-        constexpr auto c_k1_n_h2_w2_thread_gemm_desc = MakeCK1NH2W2ThreadDescriptor();
-
-        // register allocation for output
-        StaticBuffer<AddressSpaceEnum::Vgpr,
-                     FloatAcc,
-                     c_k1_n_h2_w2_thread_gemm_desc.GetElementSpaceSize(),
-                     true>
-            c_thread_buf;
-
-        const auto c_k_n_h_w_block_cluster_idx =
-            GetCBlockIndex(cblockid_to_k_n_h_w_block_cluster_adaptor);
-
-        const auto c_thread_mtx_index = GetCThreadIndex();
-
-        // GemmOp
-        GemmOp(a_global_buf,
-               b_global_buf,
-               c_thread_buf,
-               p_shared_block,
-               c_k_n_h_w_block_cluster_idx,
-               c_thread_mtx_index,
-               a_e0_e1_k0_k1_e2_grid_desc,
-               b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
-               c_k1_n_h2_w2_thread_gemm_desc,
-               integral_constant<bool, HasMainE0BlockLoop>{});
-
-        // Bias
-        BiasOp(bias_global_buf,
-               c_thread_buf,
-               c_k_n_h_w_block_cluster_idx,
-               c_thread_mtx_index,
-               bias_k0_k1_grid_desc,
-               c_k1_n_h2_w2_thread_gemm_desc);
-
-        // Activ
-        Activation(c_thread_buf, c_k1_n_h2_w2_thread_gemm_desc, activ_type);
-
-        // Output
-        WriteOut(c_thread_buf,
-                 c_global_buf,
-                 c_k_n_h_w_block_cluster_idx,
-                 c_thread_mtx_index,
-                 c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);
-
-        // MaxPool
-        MaxPool(c_thread_buf,
-                d_global_buf,
-                c_k_n_h_w_block_cluster_idx,
-                c_thread_mtx_index,
-                c_k1_n_h2_w2_thread_gemm_desc,
-                d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc);
-    }
-
-    template <typename AGridDesc_E0_E1_K0_K1_E2,
-              typename BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2,
-              typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
-              typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx,
-              typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
-              bool HasMainE0BlockLoop,
-              ActivTypeEnum ActivType>
-    __device__ static void ConvBiasActivResizeAdd(
-        const FloatAB* __restrict__ p_a_global,
-        const FloatAB* __restrict__ p_b_global,
-        const FloatC* __restrict__ p_bias_global,
-        FloatC* __restrict__ p_d_global,
-        FloatAB* __restrict__ p_shared_block,
-        const AGridDesc_E0_E1_K0_K1_E2& a_e0_e1_k0_k1_e2_grid_desc,
-        const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2& b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
-        const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2& c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
-        const DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx& d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
-        const CBlockIdToBlockClusterAdaptor_K_N_H_W& cblockid_to_k_n_h_w_block_cluster_adaptor,
-        integral_constant<bool, HasMainE0BlockLoop>,
-        integral_constant<ActivTypeEnum, ActivType>)
-    {
-        static constexpr auto activ_type = integral_constant<ActivTypeEnum, ActivType>{};
-
-        const auto bias_k0_k1_grid_desc =
-            MakeBiasK0K1GridDescriptor(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);
-
-        const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_a_global, a_e0_e1_k0_k1_e2_grid_desc.GetElementSpaceSize());
-        const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_b_global, b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetElementSpaceSize());
-        auto d_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_d_global, d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc.GetElementSpaceSize());
-        auto bias_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_bias_global, bias_k0_k1_grid_desc.GetElementSpaceSize());
-
-        constexpr auto c_k1_n_h2_w2_thread_gemm_desc = MakeCK1NH2W2ThreadDescriptor();
-
-        // register allocation for output
-        StaticBuffer<AddressSpaceEnum::Vgpr,
-                     FloatAcc,
-                     c_k1_n_h2_w2_thread_gemm_desc.GetElementSpaceSize(),
-                     true>
-            c_thread_buf;
-
-        const auto c_k_n_h_w_block_cluster_idx =
-            GetCBlockIndex(cblockid_to_k_n_h_w_block_cluster_adaptor);
-
-        const auto c_thread_mtx_index = GetCThreadIndex();
-
-        // GemmOp
-        GemmOp(a_global_buf,
-               b_global_buf,
-               c_thread_buf,
-               p_shared_block,
-               c_k_n_h_w_block_cluster_idx,
-               c_thread_mtx_index,
-               a_e0_e1_k0_k1_e2_grid_desc,
-               b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
-               c_k1_n_h2_w2_thread_gemm_desc,
-               integral_constant<bool, HasMainE0BlockLoop>{});
-
-        // Bias
-        BiasOp(bias_global_buf,
-               c_thread_buf,
-               c_k_n_h_w_block_cluster_idx,
-               c_thread_mtx_index,
-               bias_k0_k1_grid_desc,
-               c_k1_n_h2_w2_thread_gemm_desc);
-
-        // Activ
-        Activation(c_thread_buf, c_k1_n_h2_w2_thread_gemm_desc, activ_type);
-
-        // Resize_Add
-        ResizeAdd(c_thread_buf,
-                  d_global_buf,
-                  c_k_n_h_w_block_cluster_idx,
-                  c_thread_mtx_index,
-                  c_k1_n_h2_w2_thread_gemm_desc,
-                  d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc);
-    }
-};
-
-} // namespace ck
-#endif
diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r3.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r3.hpp
deleted file mode 100644
index 6a73466ef..000000000
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r3.hpp
+++ /dev/null
@@ -1,886 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V3R3_HPP
-#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V3R3_HPP
-
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-#include "static_tensor.hpp"
-
-namespace ck {
-
-namespace detail {
-// TODO: How to fix this? It uses an struct instead of lambda because lambda
-// doesn't have constructor
-template <index_t SrcVectorDim,
-          index_t SrcScalarPerVector,
-          index_t DstVectorDim,
-          index_t DstScalarPerVector>
-struct lambda_scalar_per_access_for_src_and_dst
-{
-    __host__ __device__ constexpr auto operator()(index_t i) const
-    {
-        if(i == SrcVectorDim && i == DstVectorDim)
-        {
-            return math::lcm(SrcScalarPerVector, DstScalarPerVector);
-        }
-        else if(i == SrcVectorDim)
-        {
-            return SrcScalarPerVector;
-        }
-        else if(i == DstVectorDim)
-        {
-            return DstScalarPerVector;
-        }
-        else
-        {
-            return 1;
-        }
-    }
-};
-
-} // namespace detail
-
-// Assume:
-//   1. src_desc and dst_desc are not known at compile-time
-//   2. SrcBuffer and DstBuffer are DynamicBuffer
-//   3. src_slice_origin and dst_slice_origin are not known at compile-time,
-//   4. Use thread buffer
-template <typename SliceLengths,
-          typename SrcElementwiseOperation,
-          typename DstElementwiseOperation,
-          InMemoryDataOperationEnum DstInMemOp,
-          typename SrcData,
-          typename DstData,
-          typename SrcDesc,
-          typename DstDesc,
-          typename Dst0Desc,
-          typename Dst1Desc,
-          typename SrcDimAccessOrder,
-          typename DstDimAccessOrder,
-          index_t SrcVectorDim,
-          index_t DstVectorDim,
-          index_t SrcScalarPerVector,
-          index_t DstScalarPerVector,
-          index_t SrcScalarStrideInVector,
-          index_t DstScalarStrideInVector,
-          bool SrcResetCoordinateAfterRun, // control whether to move back src coordinate after each
-                                           // RunRead(),  will be fused with MoveSrcSliceWindow to
-                                           // save addr computation
-          bool DstResetCoordinateAfterRun> // control whether to move back dst coordinate after each
-                                           // RunWrite(),  will be fused with MoveDstSliceWindow to
-                                           // save addr computation
-struct ThreadwiseTensorSliceTransfer_v3r3
-{
-    static constexpr index_t nDim = SliceLengths::Size();
-    using Index                   = MultiIndex<nDim>;
-
-    using SrcCoord  = decltype(make_tensor_coordinate(SrcDesc{}, Index{}));
-    using DstCoord  = decltype(make_tensor_coordinate(DstDesc{}, Index{}));
-    using Dst0Coord = decltype(make_tensor_coordinate(Dst0Desc{}, Index{}));
-    using Dst1Coord = decltype(make_tensor_coordinate(Dst1Desc{}, Index{}));
-
-    using SrcCoordStep  = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{}));
-    using DstCoordStep  = decltype(make_tensor_coordinate_step(DstDesc{}, Index{}));
-    using Dst0CoordStep = decltype(make_tensor_coordinate_step(Dst0Desc{}, Index{}));
-    using Dst1CoordStep = decltype(make_tensor_coordinate_step(Dst1Desc{}, Index{}));
-
-    __device__ constexpr ThreadwiseTensorSliceTransfer_v3r3(
-        const SrcDesc& src_desc,
-        const Index& src_slice_origin,
-        const SrcElementwiseOperation& src_element_op,
-        const DstDesc& dst_desc,
-        const Dst0Desc& dst0_desc,
-        const Dst1Desc& dst1_desc,
-        const Index& dst_slice_origin,
-        const DstElementwiseOperation& dst_element_op)
-        : src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)),
-          dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin)),
-          dst0_coord_(make_tensor_coordinate(dst0_desc, dst_slice_origin)),
-          dst1_coord_(make_tensor_coordinate(dst1_desc, dst_slice_origin)),
-          src_element_op_(src_element_op),
-          dst_element_op_(dst_element_op)
-    {
-    }
-
-    __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
-    {
-        src_coord_ = make_tensor_coordinate(src_desc, src_slice_origin_idx);
-    }
-
-    __device__ void SetDstSliceOrigin(const DstDesc& dst_desc,
-                                      const Dst0Desc& dst0_desc,
-                                      const Dst1Desc& dst1_desc,
-                                      const Index& dst_slice_origin_idx)
-    {
-        dst_coord_  = make_tensor_coordinate(dst_desc, dst_slice_origin_idx);
-        dst0_coord_ = make_tensor_coordinate(dst0_desc, dst_slice_origin_idx);
-        dst1_coord_ = make_tensor_coordinate(dst1_desc, dst_slice_origin_idx);
-    }
-
-    template <typename SrcBuffer>
-    __device__ void RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf)
-    {
-        static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Global or
-                          SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Lds,
-                      "wrong!");
-
-        static_assert(
-            is_same<remove_cvref_t<typename SrcBuffer::type>, remove_cvref_t<SrcData>>::value,
-            "wrong! SrcBuffer and SrcData data type are inconsistent");
-
-        constexpr auto I0 = Number<0>{};
-        constexpr auto I1 = Number<1>{};
-
-        // scalar per access on each dim
-        // TODO: don't use lambda_scalar_per_access
-        constexpr auto src_scalar_per_access = generate_sequence(
-            detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});
-
-        constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
-
-        constexpr auto src_dim_access_order = SrcDimAccessOrder{};
-
-        constexpr auto ordered_src_access_lengths =
-            container_reorder_given_new2old(src_access_lengths, src_dim_access_order);
-
-        // make forward steps
-        const auto src_forward_steps = generate_tuple(
-            [&](auto i) {
-                Index forward_step_idx;
-
-                static_for<0, nDim, 1>{}([&](auto j) {
-                    forward_step_idx(j) = (i.value == j.value) ? src_scalar_per_access[i] : 0;
-                });
-
-                return make_tensor_coordinate_step(src_desc, forward_step_idx);
-            },
-            Number<nDim>{});
-
-        // make backward steps
-        const auto src_backward_steps = generate_tuple(
-            [&](auto i) {
-                Index backward_step_idx;
-
-                static_for<0, nDim, 1>{}([&](auto j) {
-                    backward_step_idx(j) = (i.value == j.value) ? -src_scalar_per_access[i] : 0;
-                });
-
-                return make_tensor_coordinate_step(src_desc, backward_step_idx);
-            },
-            Number<nDim>{});
-
-        // loop over tensor and copy
-        static_ford<decltype(ordered_src_access_lengths)>{}([&](auto ordered_src_access_idx) {
-            // judge move forward or move backward
-            constexpr auto forward_sweep = [&]() {
-                StaticallyIndexedArray<bool, nDim> forward_sweep_;
-
-                forward_sweep_(I0) = true;
-
-                static_for<1, nDim, 1>{}([&](auto i) {
-                    index_t tmp = ordered_src_access_idx[I0];
-
-                    static_for<1, i, 1>{}([&](auto j) {
-                        tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j];
-                    });
-
-                    forward_sweep_(i) = tmp % 2 == 0;
-                });
-
-                return forward_sweep_;
-            }();
-
-            // calculate src data index
-            constexpr auto src_data_idx = [&]() {
-                Index ordered_idx;
-
-                static_for<0, nDim, 1>{}([&](auto i) {
-                    ordered_idx(i) = forward_sweep[i] ? ordered_src_access_idx[i]
-                                                      : ordered_src_access_lengths[i] - 1 -
-                                                            ordered_src_access_idx[i];
-                });
-
-                return container_reorder_given_old2new(ordered_idx, src_dim_access_order) *
-                       src_scalar_per_access;
-            }();
-
-            constexpr auto src_data_idx_seq = generate_sequence_v2(
-                [&](auto i) { return Number<src_data_idx[i]>{}; }, Number<src_data_idx.Size()>{});
-
-            const bool is_src_valid =
-                coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_);
-
-            using src_vector_type = vector_type_maker_t<SrcData, SrcScalarPerVector>;
-            using src_vector_t    = typename src_vector_type::type;
-
-            // copy data from src_buf into src_vector_container
-            auto src_vector_container = src_vector_type{
-                src_buf.template Get<src_vector_t>(src_coord_.GetOffset(), is_src_valid)};
-
-            // apply SrcElementwiseOperation on src_vector_container
-            static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
-                src_vector_container.template AsType<SrcData>()(i) =
-                    src_element_op_(src_vector_container.template AsType<SrcData>()[i]);
-            });
-
-            // copy data from src_vector_container into src_thread_scratch_
-            src_thread_scratch_.template SetAsType<src_vector_t>(
-                src_data_idx_seq, src_vector_container.template AsType<src_vector_t>()[I0]);
-
-            constexpr auto move_on_dim = [&]() constexpr
-            {
-                StaticallyIndexedArray<bool, nDim> move_on_dim_;
-
-                static_for<0, nDim, 1>{}([&](auto i) {
-                    move_on_dim_(i) = ordered_src_access_idx[i] < ordered_src_access_lengths[i] - 1;
-
-                    static_for<i + 1, nDim, 1>{}([&](auto j) {
-                        move_on_dim_(i) &=
-                            ordered_src_access_idx[j] == ordered_src_access_lengths[j] - 1;
-                    });
-                });
-
-                return move_on_dim_;
-            }
-            ();
-
-            // move src coord
-            static_for<0, nDim, 1>{}([&](auto i) {
-                if constexpr(move_on_dim[i])
-                {
-                    if constexpr(forward_sweep[i])
-                    {
-                        move_tensor_coordinate(
-                            src_desc, src_coord_, src_forward_steps[src_dim_access_order[i]]);
-                    }
-                    else
-                    {
-                        move_tensor_coordinate(
-                            src_desc, src_coord_, src_backward_steps[src_dim_access_order[i]]);
-                    }
-                }
-            });
-        });
-
-        // move src coordinate back to slice origin (or not)
-        if constexpr(SrcResetCoordinateAfterRun)
-        {
-            const auto src_reset_step =
-                make_tensor_coordinate_step(src_desc, GetSrcCoordinateResetStep());
-
-            move_tensor_coordinate(src_desc, src_coord_, src_reset_step);
-        }
-    }
-
-    __device__ void TransferDataFromSrcThreadScratchToDstThreadScratch()
-    {
-#if !CK_EXPERIMENTAL_USE_IN_REGISTER_SUB_DWORD_TRANSPOSE
-        static_ford<SliceLengths>{}([&](auto idx) {
-            // convert from SrcData to DstData here
-            dst_thread_scratch_(idx) = type_convert<DstData>(src_thread_scratch_[idx]);
-        });
-#else
-        // sub-dword transpose between src_thread_scratch_ and dst_thread_scratch_
-        // TODO make this logic more generic for more sub-dword datatype
-        if constexpr(SrcVectorDim != DstVectorDim &&
-                     is_same<half_t, remove_cvref_t<SrcData>>::value &&
-                     is_same<half_t, remove_cvref_t<DstData>>::value &&
-                     SrcScalarPerVector % 2 == 0 && DstScalarPerVector % 2 == 0)
-        {
-            // each transpose does
-            // DstScalarPerVector # of src vectors in src_thread_scratch_
-            // SrcScalarPerVector # of dst vectors in dst_thread_scratch_
-            constexpr index_t num_src_vector = Number<DstScalarPerVector>{};
-            constexpr index_t num_dst_vector = Number<SrcScalarPerVector>{};
-
-            // Assume SrcVectorDim is not the same as DstVectorDim, so we do transpose
-            // TODO: make this logic generic for all scenario
-            static_assert(SrcVectorDim != DstVectorDim, "wrong");
-
-            constexpr auto src_scalar_step_in_vector = generate_sequence(
-                detail::lambda_scalar_step_in_vector<SrcVectorDim>{}, Number<nDim>{});
-
-            constexpr auto dst_scalar_step_in_vector = generate_sequence(
-                detail::lambda_scalar_step_in_vector<DstVectorDim>{}, Number<nDim>{});
-
-            constexpr auto scalar_per_access = generate_sequence(
-                detail::lambda_scalar_per_access_for_src_and_dst<SrcVectorDim,
-                                                                 SrcScalarPerVector,
-                                                                 DstVectorDim,
-                                                                 DstScalarPerVector>{},
-                Number<nDim>{});
-
-            constexpr auto access_lengths = SliceLengths{} / scalar_per_access;
-
-            static_ford<decltype(access_lengths)>{}([&](auto access_idx) {
-                constexpr auto data_idx = access_idx * scalar_per_access;
-
-                constexpr auto data_idx_seq = generate_sequence_v2(
-                    [&](auto i) { return Number<data_idx[i]>{}; }, Number<nDim>{});
-
-                // TODO type_convert is not used yet!!!!!
-                using src_vector_t = vector_type_maker_t<SrcData, SrcScalarPerVector>;
-                using dst_vector_t = vector_type_maker_t<DstData, DstScalarPerVector>;
-
-                // get DstScalarPerVector # of read-only references to src vectors from
-                // src_thread_scratch_
-                const auto src_vector_refs = generate_tie(
-                    [&](auto i) -> const src_vector_t& {
-                        // i increment corresponds to movement in DstVectorDim
-                        return src_thread_scratch_.GetVectorTypeReference(
-                            data_idx_seq + i * dst_scalar_step_in_vector);
-                    },
-                    Number<num_src_vector>{});
-
-                // get SrcScalarPerVector # of references to dst vectors from dst_thread_scratch_
-                auto dst_vector_refs = generate_tie(
-                    [&](auto i) -> dst_vector_t& {
-                        // i increment corresponds to movement in SrcVectorDim
-                        return dst_thread_scratch_.GetVectorTypeReference(
-                            data_idx_seq + i * src_scalar_step_in_vector);
-                    },
-                    Number<num_dst_vector>{});
-
-                // do data transpose
-                // TODO type_convert is not used yet!!!!!
-                transpose_vectors<SrcData, DstScalarPerVector, SrcScalarPerVector>{}(
-                    src_vector_refs, dst_vector_refs);
-            });
-        }
-        else
-        {
-            static_ford<SliceLengths>{}([&](auto idx) {
-                // convert from SrcData to DstData here
-                dst_thread_scratch_(idx) = type_convert<DstData>(src_thread_scratch_[idx]);
-            });
-        }
-#endif
-    }
-
-    template <typename DstBuffer, typename Dst0Buffer, typename Dst1Buffer>
-    __device__ void RunWrite(const DstDesc& dst_desc,
-                             DstBuffer& dst_buf,
-                             const Dst0Desc& dst0_desc,
-                             const Dst0Buffer& dst0_buf,
-                             const Dst1Desc& dst1_desc,
-                             const Dst1Buffer& dst1_buf)
-    {
-        // if there is transpose, it's done here
-        // TODO move this elsewhere
-        TransferDataFromSrcThreadScratchToDstThreadScratch();
-
-        static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum::Global or
-                          DstBuffer::GetAddressSpace() == AddressSpaceEnum::Lds,
-                      "wrong!");
-
-        static_assert(
-            is_same<remove_cvref_t<typename DstBuffer::type>, remove_cvref_t<DstData>>::value,
-            "wrong! SrcBuffer or DstBuffer data type is wrong");
-
-        constexpr auto I0 = Number<0>{};
-        constexpr auto I1 = Number<1>{};
-
-        // src scalar per access on each dim
-        // TODO: don't use this
-        constexpr auto dst_scalar_per_access = generate_sequence(
-            detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});
-
-        constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
-
-        constexpr auto dst_dim_access_order = DstDimAccessOrder{};
-
-        constexpr auto ordered_dst_access_lengths =
-            container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order);
-
-        // make forward steps
-        const auto dst_forward_steps = generate_tuple(
-            [&](auto i) {
-                Index forward_step_idx;
-
-                static_for<0, nDim, 1>{}([&](auto j) {
-                    forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0;
-                });
-
-                return make_tensor_coordinate_step(dst_desc, forward_step_idx);
-            },
-            Number<nDim>{});
-
-        // make forward steps: dst0
-        // WARNING!!!!!!: this logic is only correct if dst/dst0/dst1 can use the same
-        // DstScalarPerVector
-        // TODO: fix this
-        const auto dst0_forward_steps = generate_tuple(
-            [&](auto i) {
-                Index forward_step_idx;
-
-                static_for<0, nDim, 1>{}([&](auto j) {
-                    forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0;
-                });
-
-                return make_tensor_coordinate_step(dst0_desc, forward_step_idx);
-            },
-            Number<nDim>{});
-
-        // make forward steps: dst1
-        // WARNING!!!!!!: this logic is only correct if dst/dst0/dst1 can use the same
-        // DstScalarPerVector
-        // TODO: fix this
-        const auto dst1_forward_steps = generate_tuple(
-            [&](auto i) {
-                Index forward_step_idx;
-
-                static_for<0, nDim, 1>{}([&](auto j) {
-                    forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0;
-                });
-
-                return make_tensor_coordinate_step(dst1_desc, forward_step_idx);
-            },
-            Number<nDim>{});
-
-        // make backward steps
-        const auto dst_backward_steps = generate_tuple(
-            [&](auto i) {
-                Index backward_step_idx;
-
-                static_for<0, nDim, 1>{}([&](auto j) {
-                    backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0;
-                });
-
-                return make_tensor_coordinate_step(dst_desc, backward_step_idx);
-            },
-            Number<nDim>{});
-
-        // make backward steps: dst0
-        // WARNING!!!!!!: this logic is only correct if dst/dst0/dst1 can use the same
-        // DstScalarPerVector
-        // TODO: fix this
-        const auto dst0_backward_steps = generate_tuple(
-            [&](auto i) {
-                Index backward_step_idx;
-
-                static_for<0, nDim, 1>{}([&](auto j) {
-                    backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0;
-                });
-
-                return make_tensor_coordinate_step(dst0_desc, backward_step_idx);
-            },
-            Number<nDim>{});
-
-        // make backward steps: dst1
-        // WARNING!!!!!!: this logic is only correct if dst/dst0/dst1 can use the same
-        // DstScalarPerVector
-        // TODO: fix this
-        const auto dst1_backward_steps = generate_tuple(
-            [&](auto i) {
-                Index backward_step_idx;
-
-                static_for<0, nDim, 1>{}([&](auto j) {
-                    backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0;
-                });
-
-                return make_tensor_coordinate_step(dst1_desc, backward_step_idx);
-            },
-            Number<nDim>{});
-
-        // loop over tensor and copy
-        static_ford<decltype(ordered_dst_access_lengths)>{}([&](auto ordered_dst_access_idx) {
-            // judge move forward or move backward
-            constexpr auto forward_sweep = [&]() {
-                StaticallyIndexedArray<bool, nDim> forward_sweep_;
-
-                forward_sweep_(I0) = true;
-
-                static_for<1, nDim, 1>{}([&](auto i) {
-                    index_t tmp = ordered_dst_access_idx[I0];
-
-                    static_for<1, i, 1>{}([&](auto j) {
-                        tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_idx[j];
-                    });
-
-                    forward_sweep_(i) = tmp % 2 == 0;
-                });
-
-                return forward_sweep_;
-            }();
-
-            // calculate dst data index
-            constexpr auto dst_data_idx = [&]() {
-                Index ordered_idx;
-
-                static_for<0, nDim, 1>{}([&](auto i) {
-                    ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_idx[i]
-                                                      : ordered_dst_access_lengths[i] - 1 -
-                                                            ordered_dst_access_idx[i];
-                });
-
-                return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) *
-                       dst_scalar_per_access;
-            }();
-
-            constexpr auto dst_data_idx_seq = generate_sequence_v2(
-                [&](auto i) { return Number<dst_data_idx[i]>{}; }, Number<dst_data_idx.Size()>{});
-
-            const bool is_dst_valid =
-                coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_);
-
-            using dst_vector_type = vector_type_maker_t<DstData, DstScalarPerVector>;
-            using dst_vector_t    = typename dst_vector_type::type;
-
-            // copy data from dst_thread_scratch_ into dst_vector_container
-            auto dst_vector_container = dst_vector_type{
-                dst_thread_scratch_.template GetAsType<dst_vector_t>(dst_data_idx_seq)};
-
-            // apply DstElementwiseOperation on dst_vector_container
-            static_for<0, DstScalarPerVector, 1>{}([&](auto i) {
-                dst_vector_container.template AsType<DstData>()(i) =
-                    dst_element_op_(dst_vector_container.template AsType<DstData>()[i]);
-            });
-
-            // copy data from dst_vector_container to dst_buf
-            dst_buf.template Set<dst_vector_t>(
-                dst_coord_.GetOffset(),
-                is_dst_valid,
-                dst_vector_container.template AsType<dst_vector_t>()[I0]);
-
-            constexpr auto move_on_dim = [&]() constexpr
-            {
-                StaticallyIndexedArray<bool, nDim> move_on_dim_;
-
-                static_for<0, nDim, 1>{}([&](auto i) {
-                    move_on_dim_(i) = ordered_dst_access_idx[i] < ordered_dst_access_lengths[i] - 1;
-
-                    static_for<i + 1, nDim, 1>{}([&](auto j) {
-                        move_on_dim_(i) &=
-                            ordered_dst_access_idx[j] == ordered_dst_access_lengths[j] - 1;
-                    });
-                });
-
-                return move_on_dim_;
-            }
-            ();
-
-            // move dst coord
-            static_for<0, nDim, 1>{}([&](auto i) {
-                if constexpr(move_on_dim[i])
-                {
-                    if constexpr(forward_sweep[i])
-                    {
-                        move_tensor_coordinate(
-                            dst_desc, dst_coord_, dst_forward_steps[dst_dim_access_order[i]]);
-                    }
-                    else
-                    {
-                        move_tensor_coordinate(
-                            dst_desc, dst_coord_, dst_backward_steps[dst_dim_access_order[i]]);
-                    }
-                }
-            });
-        });
-
-        // move dst coordinate back to slice origin (or not)
-        if constexpr(DstResetCoordinateAfterRun)
-        {
-            const auto dst_reset_step =
-                make_tensor_coordinate_step(dst_desc, GetDstCoordinateResetStep());
-
-            move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step);
-        }
-    }
-
-    __device__ static constexpr auto GetSrcCoordinateResetStep()
-    {
-        constexpr auto I0 = Number<0>{};
-
-        // scalar per access on each dim
-        // TODO: don't use lambda_scalar_per_access
-        constexpr auto src_scalar_per_access = generate_sequence(
-            detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});
-
-        constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
-
-        constexpr auto src_dim_access_order = SrcDimAccessOrder{};
-
-        constexpr auto ordered_src_access_lengths =
-            container_reorder_given_new2old(src_access_lengths, src_dim_access_order);
-
-        // judge move forward or move backward during the last iteration
-        constexpr auto forward_sweep = [&]() {
-            StaticallyIndexedArray<bool, nDim> forward_sweep_;
-
-            forward_sweep_(I0) = true;
-
-            // TODO: BUG: should start at 1
-            static_for<1, nDim, 1>{}([&](auto i) {
-                index_t tmp = ordered_src_access_lengths[I0] - 1;
-
-                static_for<1, i, 1>{}([&](auto j) {
-                    tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1;
-                });
-
-                forward_sweep_(i) = tmp % 2 == 0;
-            });
-
-            return forward_sweep_;
-        }();
-
-        // calculate src data index after last iteration in RunRead(), if it has not being reset by
-        // RunRead()
-        constexpr auto src_data_idx = [&]() {
-            Index ordered_idx;
-
-            static_for<0, nDim, 1>{}([&](auto i) {
-                ordered_idx(i) = forward_sweep[i] ? ordered_src_access_lengths[i] - 1 : 0;
-            });
-
-            return container_reorder_given_old2new(ordered_idx, src_dim_access_order) *
-                   src_scalar_per_access;
-        }();
-
-        //
-        constexpr auto reset_src_data_step = [&]() {
-            Index reset_src_data_step_;
-
-            static_for<0, nDim, 1>{}([&](auto i) { reset_src_data_step_(i) = -src_data_idx[i]; });
-
-            return reset_src_data_step_;
-        }();
-
-        return reset_src_data_step;
-    }
-
-    __device__ static constexpr auto GetDstCoordinateResetStep()
-    {
-        constexpr auto I0 = Number<0>{};
-
-        // scalar per access on each dim
-        // TODO: don't use lambda_scalar_per_access
-        constexpr auto dst_scalar_per_access = generate_sequence(
-            detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});
-
-        constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
-
-        constexpr auto dst_dim_access_order = DstDimAccessOrder{};
-
-        constexpr auto ordered_dst_access_lengths =
-            container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order);
-
-        // judge move forward or move backward during the last iteration
-        constexpr auto forward_sweep = [&]() {
-            StaticallyIndexedArray<bool, nDim> forward_sweep_;
-
-            forward_sweep_(I0) = true;
-
-            static_for<1, nDim, 1>{}([&](auto i) {
-                index_t tmp = ordered_dst_access_lengths[I0] - 1;
-
-                static_for<1, i, 1>{}([&](auto j) {
-                    tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1;
-                });
-
-                forward_sweep_(i) = tmp % 2 == 0;
-            });
-
-            return forward_sweep_;
-        }();
-
-        // calculate dst data index after last iteration in RunWrite(), if it has not being reset by
-        // RunWrite()
-        constexpr auto dst_data_idx = [&]() {
-            Index ordered_idx;
-
-            static_for<0, nDim, 1>{}([&](auto i) {
-                ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_lengths[i] - 1 : 0;
-            });
-
-            return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) *
-                   dst_scalar_per_access;
-        }();
-
-        //
-        constexpr auto reset_dst_data_step = [&]() {
-            Index reset_dst_data_step_;
-
-            static_for<0, nDim, 1>{}([&](auto i) { reset_dst_data_step_(i) = -dst_data_idx[i]; });
-
-            return reset_dst_data_step_;
-        }();
-
-        return reset_dst_data_step;
-    }
-
-    // src_slice_origin_step_idx need to be known at compile-time, for performance reason
-    __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc,
-                                       const Index& src_slice_origin_step_idx)
-    {
-        // if src coord was not reset by RunRead(), then need to adjust the step here
-        const auto adjusted_step_idx =
-            SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
-                                       : src_slice_origin_step_idx + GetSrcCoordinateResetStep();
-
-        // is it OK to construct a new step every time?
-        const auto adjusted_step = make_tensor_coordinate_step(src_desc, adjusted_step_idx);
-
-        move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
-    }
-
-    // src_slice_origin_step_idx need to be known at compile-time, for performance reason
-    __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc,
-                                       const Index& src_slice_origin_step_idx)
-    {
-        // if src coord was not reset by RunRead(), then need to adjust the step here
-        const auto adjusted_step_idx =
-            SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
-                                       : src_slice_origin_step_idx + GetSrcCoordinateResetStep();
-
-        // is it OK to construct a new step every time?
-        const auto adjusted_step = make_tensor_coordinate_step(src_desc, adjusted_step_idx);
-
-        move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
-    }
-
-    // dst_slice_origin_step_idx need to be known at compile-time, for performance reason
-    __device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
-                                       const Dst0Desc dst0_desc,
-                                       const Dst1Desc dst1_desc,
-                                       const Index& dst_slice_origin_step_idx)
-    {
-        // if dst coord was not reset by RunWrite(), then need to adjust the step here
-        const auto adjusted_step_idx =
-            DstResetCoordinateAfterRun ? dst_slice_origin_step_idx
-                                       : dst_slice_origin_step_idx + GetDstCoordinateResetStep();
-
-        // is it OK to construct a new step every time?
-        const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx);
-
-        move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step);
-        move_tensor_coordinate(dst0_desc, dst0_coord_, adjusted_step);
-        move_tensor_coordinate(dst1_desc, dst1_coord_, adjusted_step);
-    }
-
-    __device__ static constexpr auto GetSrcThreadScratchDescriptor()
-    {
-        constexpr auto src_scalar_per_access = generate_sequence(
-            detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});
-
-        constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
-
-        constexpr auto src_access_lengths_and_vector_length = container_push_back(
-            sequence_to_tuple_of_number(src_access_lengths), Number<SrcScalarPerVector>{});
-
-        // 1st stage of transforms
-        constexpr auto desc0 =
-            make_naive_tensor_descriptor_packed(src_access_lengths_and_vector_length);
-
-        // 2nd stage of transforms
-        constexpr auto transforms = generate_tuple(
-            [&](auto i) {
-                if constexpr(i == SrcVectorDim)
-                {
-                    return make_merge_transform_v3_division_mod(
-                        make_tuple(src_access_lengths_and_vector_length[i],
-                                   src_access_lengths_and_vector_length[Number<nDim>{}]));
-                }
-                else
-                {
-                    return make_pass_through_transform(src_access_lengths_and_vector_length[i]);
-                }
-            },
-            Number<nDim>{});
-
-        constexpr auto low_dim_idss = generate_tuple(
-            [&](auto i) {
-                if constexpr(i == SrcVectorDim)
-                {
-                    return Sequence<i.value, nDim>{};
-                }
-                else
-                {
-                    return Sequence<i.value>{};
-                }
-            },
-            Number<nDim>{});
-
-        constexpr auto up_dim_idss =
-            generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<nDim>{});
-
-        return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss);
-    }
-
-    __device__ static constexpr auto GetDstThreadScratchDescriptor()
-    {
-        // 1st stage of transforms
-        constexpr auto dst_scalar_per_access = generate_sequence(
-            detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});
-
-        constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
-
-        constexpr auto dst_access_lengths_and_vector_length = container_push_back(
-            sequence_to_tuple_of_number(dst_access_lengths), Number<DstScalarPerVector>{});
-
-        constexpr auto desc0 =
-            make_naive_tensor_descriptor_packed(dst_access_lengths_and_vector_length);
-
-        // 2nd stage of transforms
-        constexpr auto transforms = generate_tuple(
-            [&](auto i) {
-                if constexpr(i == DstVectorDim)
-                {
-                    return make_merge_transform_v3_division_mod(
-                        make_tuple(dst_access_lengths_and_vector_length[i],
-                                   dst_access_lengths_and_vector_length[Number<nDim>{}]));
-                }
-                else
-                {
-                    return make_pass_through_transform(dst_access_lengths_and_vector_length[i]);
-                }
-            },
-            Number<nDim>{});
-
-        constexpr auto low_dim_idss = generate_tuple(
-            [&](auto i) {
-                if constexpr(i == DstVectorDim)
-                {
-                    return Sequence<i.value, nDim>{};
-                }
-                else
-                {
-                    return Sequence<i.value>{};
-                }
-            },
-            Number<nDim>{});
-
-        constexpr auto up_dim_idss =
-            generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<nDim>{});
-
-        return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss);
-    }
-
-    private:
-    static constexpr auto src_thread_scratch_desc_ = decltype(GetSrcThreadScratchDescriptor()){};
-    static constexpr auto dst_thread_scratch_desc_ = decltype(GetDstThreadScratchDescriptor()){};
-
-    StaticTensorTupleOfVectorBuffer<AddressSpaceEnum::Vgpr,
-                                    SrcData,
-                                    SrcScalarPerVector,
-                                    decltype(src_thread_scratch_desc_),
-                                    true>
-        src_thread_scratch_;
-
-    StaticTensorTupleOfVectorBuffer<AddressSpaceEnum::Vgpr,
-                                    DstData,
-                                    DstScalarPerVector,
-                                    decltype(dst_thread_scratch_desc_),
-                                    true>
-        dst_thread_scratch_;
-
-    SrcCoord src_coord_;
-    DstCoord dst_coord_;
-    const SrcElementwiseOperation src_element_op_;
-    const DstElementwiseOperation dst_element_op_;
-};
-
-} // namespace ck
-#endif
diff --git a/include/ck/utility/amd_llvm_intrinsic.hpp b/include/ck/utility/amd_llvm_intrinsic.hpp
deleted file mode 100644
index 01e77d7be..000000000
--- a/include/ck/utility/amd_llvm_intrinsic.hpp
+++ /dev/null
@@ -1,14 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#ifndef CK_AMD_LLVM_INTRINSIC_HPP
-#define CK_AMD_LLVM_INTRINSIC_HPP
-
-#include "data_type.hpp"
-
-namespace ck {
-
-__device__ int32_t llvm_amdgcn_readfirstlane_i32(int32_t i) __asm("llvm.amdgcn.readfirstlane");
-
-} // namespace ck
-#endif
diff --git a/include/ck/utility/print.hpp b/include/ck/utility/print.hpp
deleted file mode 100644
index eed1ca42c..000000000
--- a/include/ck/utility/print.hpp
+++ /dev/null
@@ -1,25 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#ifndef CK_PRINT_HPP
-#define CK_PRINT_HPP
-
-#include "array.hpp"
-#include "statically_indexed_array.hpp"
-#include "container_helper.hpp"
-#include "sequence.hpp"
-
-namespace ck {
-
-template <typename T>
-__host__ __device__ void print_array(const char* s, T a)
-{
-    constexpr index_t nsize = a.Size();
-
-    printf("%s size %d, {", s, nsize);
-    static_for<0, nsize, 1>{}([&a](auto i) constexpr { printf("%d, ", int32_t{a[i]}); });
-    printf("}\n");
-}
-
-} // namespace ck
-#endif
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_2d.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_2d.hpp
deleted file mode 100644
index c77d22f4c..000000000
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_2d.hpp
+++ /dev/null
@@ -1,136 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <iostream>
-#include <sstream>
-
-#include "ck/tensor_operation/gpu/device/device_base.hpp"
-#include "ck/library/utility/host_tensor.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace host {
-
-template <typename ADataType,
-          typename BDataType,
-          typename C0DataType,
-          typename CDataType,
-          typename AccDataType,
-          typename AElementwiseOperation,
-          typename BElementwiseOperation,
-          typename CElementwiseOperation>
-struct ReferenceGemmBias2D : public device::BaseOperator
-{
-    // Argument
-    struct Argument : public device::BaseArgument
-    {
-        Argument(const Tensor<ADataType>& a_m_k,
-                 const Tensor<BDataType>& b_k_n,
-                 const Tensor<C0DataType>& c0_m_n,
-                 Tensor<CDataType>& c_m_n,
-                 AElementwiseOperation a_element_op,
-                 BElementwiseOperation b_element_op,
-                 CElementwiseOperation c_element_op)
-            : a_m_k_{a_m_k},
-              b_k_n_{b_k_n},
-              c0_m_n_{c0_m_n},
-              c_m_n_{c_m_n},
-              a_element_op_{a_element_op},
-              b_element_op_{b_element_op},
-              c_element_op_{c_element_op}
-        {
-        }
-
-        const Tensor<ADataType>& a_m_k_;
-        const Tensor<BDataType>& b_k_n_;
-        const Tensor<CDataType>& c0_m_n_;
-        Tensor<CDataType>& c_m_n_;
-
-        AElementwiseOperation a_element_op_;
-        BElementwiseOperation b_element_op_;
-        CElementwiseOperation c_element_op_;
-    };
-
-    // Invoker
-    struct Invoker : public device::BaseInvoker
-    {
-        using Argument = ReferenceGemmBias2D::Argument;
-
-        float Run(const Argument& arg)
-        {
-            auto f_mk_kn_mn = [&](auto m, auto n) {
-                const int K = arg.a_m_k_.mDesc.GetLengths()[1];
-
-                AccDataType a   = 0;
-                AccDataType b   = 0;
-                AccDataType acc = 0;
-
-                for(int k = 0; k < K; ++k)
-                {
-                    arg.a_element_op_(a, ck::type_convert<AccDataType>(arg.a_m_k_(m, k)));
-                    arg.b_element_op_(b, ck::type_convert<AccDataType>(arg.b_k_n_(k, n)));
-                    acc += a * b;
-                }
-
-                CDataType cast_acc = static_cast<CDataType>(acc);
-                arg.c_element_op_(arg.c_m_n_(m, n), cast_acc, arg.c0_m_n_(m, n));
-            };
-
-            make_ParallelTensorFunctor(
-                f_mk_kn_mn, arg.c_m_n_.mDesc.GetLengths()[0], arg.c_m_n_.mDesc.GetLengths()[1])(
-                std::thread::hardware_concurrency());
-
-            return 0;
-        }
-
-        float Run(const device::BaseArgument* p_arg,
-                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
-        {
-            return Run(*dynamic_cast<const Argument*>(p_arg));
-        }
-    };
-
-    static constexpr bool IsValidCompilationParameter()
-    {
-        // TODO: properly implement this check
-        return true;
-    }
-
-    bool IsSupportedArgument(const device::BaseArgument*) override { return true; }
-
-    static auto MakeArgument(const Tensor<ADataType>& a_m_k,
-                             const Tensor<BDataType>& b_k_n,
-                             const Tensor<C0DataType>& c0_m_n,
-                             Tensor<CDataType>& c_m_n,
-                             AElementwiseOperation a_element_op,
-                             BElementwiseOperation b_element_op,
-                             CElementwiseOperation c_element_op)
-    {
-        return Argument{a_m_k, b_k_n, c0_m_n, c_m_n, a_element_op, b_element_op, c_element_op};
-    }
-
-    static auto MakeInvoker() { return Invoker{}; }
-
-    virtual std::unique_ptr<device::BaseInvoker> MakeInvokerPointer()
-    {
-        return std::make_unique<Invoker>(Invoker{});
-    }
-
-    std::string GetTypeString() const override
-    {
-        auto str = std::stringstream();
-
-        // clang-format off
-        str << "ReferenceGemmBias2D"
-            << std::endl;
-        // clang-format on
-
-        return str.str();
-    }
-};
-
-} // namespace host
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation.hpp
deleted file mode 100644
index 7dfc3c1ed..000000000
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation.hpp
+++ /dev/null
@@ -1,140 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <iostream>
-#include <sstream>
-
-#include "ck/tensor_operation/gpu/device/device_base.hpp"
-
-#include "ck/library/utility/host_tensor.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace host {
-
-template <typename ADataType,
-          typename BDataType,
-          typename CDataType,
-          typename AElementwiseOperation,
-          typename BElementwiseOperation,
-          typename CElementwiseOperation>
-struct ReferenceGemmBiasActivation : public device::BaseOperator
-{
-    // Argument
-    struct Argument : public device::BaseArgument
-    {
-        Argument(const Tensor<ADataType>& a_m_k,
-                 const Tensor<BDataType>& b_k_n,
-                 Tensor<CDataType>& c_m_n,
-                 const Tensor<CDataType>& c0_n,
-                 AElementwiseOperation a_element_op,
-                 BElementwiseOperation b_element_op,
-                 CElementwiseOperation c_element_op)
-            : a_m_k_{a_m_k},
-              b_k_n_{b_k_n},
-              c_m_n_{c_m_n},
-              c0_n_{c0_n},
-              a_element_op_{a_element_op},
-              b_element_op_{b_element_op},
-              c_element_op_{c_element_op}
-        {
-        }
-
-        const Tensor<ADataType>& a_m_k_;
-        const Tensor<BDataType>& b_k_n_;
-        Tensor<CDataType>& c_m_n_;
-        const Tensor<CDataType>& c0_n_;
-
-        AElementwiseOperation a_element_op_;
-        BElementwiseOperation b_element_op_;
-        CElementwiseOperation c_element_op_;
-    };
-
-    // Invoker
-    struct Invoker : public device::BaseInvoker
-    {
-        using Argument = ReferenceGemmBiasActivation::Argument;
-
-        float Run(const Argument& arg)
-        {
-            auto f_mk_kn_mn = [&](auto m, auto n) {
-                const int K = arg.a_m_k_.mDesc.GetLengths()[1];
-
-                float v_acc = 0;
-
-                for(int k = 0; k < K; ++k)
-                {
-                    float v_a;
-                    float v_b;
-
-                    arg.a_element_op_(v_a, static_cast<const float>(arg.a_m_k_(m, k)));
-                    arg.b_element_op_(v_b, static_cast<const float>(arg.b_k_n_(k, n)));
-
-                    v_acc += v_a * v_b;
-                }
-
-                float v_c;
-
-                arg.c_element_op_(v_c, v_acc, static_cast<float>(arg.c0_n_(n)));
-
-                arg.c_m_n_(m, n) = v_c;
-            };
-
-            make_ParallelTensorFunctor(
-                f_mk_kn_mn, arg.c_m_n_.mDesc.GetLengths()[0], arg.c_m_n_.mDesc.GetLengths()[1])(
-                std::thread::hardware_concurrency());
-
-            return 0;
-        }
-
-        float Run(const device::BaseArgument* p_arg,
-                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
-        {
-            return Run(*dynamic_cast<const Argument*>(p_arg));
-        }
-    };
-
-    static constexpr bool IsValidCompilationParameter()
-    {
-        // TODO: properly implement this check
-        return true;
-    }
-
-    bool IsSupportedArgument(const device::BaseArgument*) override { return true; }
-
-    static auto MakeArgument(const Tensor<ADataType>& a_m_k,
-                             const Tensor<BDataType>& b_k_n,
-                             Tensor<CDataType>& c_m_n,
-                             const Tensor<CDataType>& c0_n,
-                             AElementwiseOperation a_element_op,
-                             BElementwiseOperation b_element_op,
-                             CElementwiseOperation c_element_op)
-    {
-        return Argument{a_m_k, b_k_n, c_m_n, c0_n, a_element_op, b_element_op, c_element_op};
-    }
-
-    static auto MakeInvoker() { return Invoker{}; }
-
-    virtual std::unique_ptr<device::BaseInvoker> MakeInvokerPointer()
-    {
-        return std::make_unique<Invoker>(Invoker{});
-    }
-
-    std::string GetTypeString() const override
-    {
-        auto str = std::stringstream();
-
-        // clang-format off
-        str << "ReferenceGemmBiasActivation"
-            << std::endl;
-        // clang-format on
-
-        return str.str();
-    }
-};
-
-} // namespace host
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation_add.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation_add.hpp
deleted file mode 100644
index 99102a40d..000000000
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation_add.hpp
+++ /dev/null
@@ -1,148 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <iostream>
-#include <sstream>
-
-#include "ck/tensor_operation/gpu/device/device_base.hpp"
-
-#include "ck/library/utility/host_tensor.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace host {
-
-template <typename ADataType,
-          typename BDataType,
-          typename CDataType,
-          typename AElementwiseOperation,
-          typename BElementwiseOperation,
-          typename CElementwiseOperation>
-struct ReferenceGemmBiasActivationAdd : public device::BaseOperator
-{
-    // Argument
-    struct Argument : public device::BaseArgument
-    {
-        Argument(const Tensor<ADataType>& a_m_k,
-                 const Tensor<BDataType>& b_k_n,
-                 Tensor<CDataType>& c_m_n,
-                 const Tensor<CDataType>& c0_n,
-                 const Tensor<CDataType>& c1_m_n,
-                 AElementwiseOperation a_element_op,
-                 BElementwiseOperation b_element_op,
-                 CElementwiseOperation c_element_op)
-            : a_m_k_{a_m_k},
-              b_k_n_{b_k_n},
-              c_m_n_{c_m_n},
-              c0_n_{c0_n},
-              c1_m_n_{c1_m_n},
-              a_element_op_{a_element_op},
-              b_element_op_{b_element_op},
-              c_element_op_{c_element_op}
-        {
-        }
-
-        const Tensor<ADataType>& a_m_k_;
-        const Tensor<BDataType>& b_k_n_;
-        Tensor<CDataType>& c_m_n_;
-        const Tensor<CDataType>& c0_n_;
-        const Tensor<CDataType>& c1_m_n_;
-
-        AElementwiseOperation a_element_op_;
-        BElementwiseOperation b_element_op_;
-        CElementwiseOperation c_element_op_;
-    };
-
-    // Invoker
-    struct Invoker : public device::BaseInvoker
-    {
-        using Argument = ReferenceGemmBiasActivationAdd::Argument;
-
-        float Run(const Argument& arg)
-        {
-            auto f_mk_kn_mn = [&](auto m, auto n) {
-                const int K = arg.a_m_k_.mDesc.GetLengths()[1];
-
-                float v_acc = 0;
-
-                for(int k = 0; k < K; ++k)
-                {
-                    float v_a;
-                    float v_b;
-
-                    arg.a_element_op_(v_a, static_cast<const float>(arg.a_m_k_(m, k)));
-                    arg.b_element_op_(v_b, static_cast<const float>(arg.b_k_n_(k, n)));
-
-                    v_acc += v_a * v_b;
-                }
-
-                float v_c;
-
-                arg.c_element_op_(v_c,
-                                  v_acc,
-                                  static_cast<float>(arg.c0_n_(n)),
-                                  static_cast<float>(arg.c1_m_n_(m, n)));
-
-                arg.c_m_n_(m, n) = v_c;
-            };
-
-            make_ParallelTensorFunctor(
-                f_mk_kn_mn, arg.c_m_n_.mDesc.GetLengths()[0], arg.c_m_n_.mDesc.GetLengths()[1])(
-                std::thread::hardware_concurrency());
-
-            return 0;
-        }
-
-        float Run(const device::BaseArgument* p_arg,
-                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
-        {
-            return Run(*dynamic_cast<const Argument*>(p_arg));
-        }
-    };
-
-    static constexpr bool IsValidCompilationParameter()
-    {
-        // TODO: properly implement this check
-        return true;
-    }
-
-    bool IsSupportedArgument(const device::BaseArgument*) override { return true; }
-
-    static auto MakeArgument(const Tensor<ADataType>& a_m_k,
-                             const Tensor<BDataType>& b_k_n,
-                             Tensor<CDataType>& c_m_n,
-                             const Tensor<CDataType>& c0_n,
-                             const Tensor<CDataType>& c1_m_n,
-                             AElementwiseOperation a_element_op,
-                             BElementwiseOperation b_element_op,
-                             CElementwiseOperation c_element_op)
-    {
-        return Argument{
-            a_m_k, b_k_n, c_m_n, c0_n, c1_m_n, a_element_op, b_element_op, c_element_op};
-    }
-
-    static auto MakeInvoker() { return Invoker{}; }
-
-    virtual std::unique_ptr<device::BaseInvoker> MakeInvokerPointer()
-    {
-        return std::make_unique<Invoker>(Invoker{});
-    }
-
-    std::string GetTypeString() const override
-    {
-        auto str = std::stringstream();
-
-        // clang-format off
-        str << "ReferenceGemmBiasActivationAdd"
-            << std::endl;
-        // clang-format on
-
-        return str.str();
-    }
-};
-
-} // namespace host
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm.hpp
index 0655fd92e..bb5f971c7 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm.hpp
@@ -3,8 +3,8 @@
 
 #pragma once
 
-#include <cstdlib>
-
+#include <vector>
+#include <memory>
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_batched_gemm.hpp"
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add.hpp
index 495c5f884..0b025b33c 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add.hpp
@@ -3,8 +3,8 @@
 
 #pragma once
 
-#include <cstdlib>
-
+#include <vector>
+#include <memory>
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_batched_gemm_multiple_d_gemm_multiple_d.hpp"
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_bias_softmax_gemm_permute.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_bias_softmax_gemm_permute.hpp
index 0aa7a5aa3..593ef7cb9 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_bias_softmax_gemm_permute.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_bias_softmax_gemm_permute.hpp
@@ -3,8 +3,8 @@
 
 #pragma once
 
-#include <cstdlib>
-
+#include <vector>
+#include <memory>
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp"
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_gemm.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_gemm.hpp
index a6dcfa30d..e1a4391c4 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_gemm.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_gemm.hpp
@@ -3,8 +3,8 @@
 
 #pragma once
 
-#include <cstdlib>
-
+#include <vector>
+#include <memory>
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_batched_gemm_gemm.hpp"
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute.hpp
index 89df1a7a0..34c86dd44 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute.hpp
@@ -3,8 +3,8 @@
 
 #pragma once
 
-#include <cstdlib>
-
+#include <vector>
+#include <memory>
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp"
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp b/library/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp
index c116d999d..6a551c726 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp
@@ -3,10 +3,8 @@
 
 #pragma once
 
-#include <cstdlib>
 #include <vector>
 #include <memory>
-
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp b/library/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp
index e3f07606c..fc9ec8d61 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp
@@ -3,10 +3,8 @@
 
 #pragma once
 
-#include <cstdlib>
 #include <vector>
 #include <memory>
-
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/convolution_backward_data.hpp b/library/include/ck/library/tensor_operation_instance/gpu/convolution_backward_data.hpp
index ec5d18fc2..07d552476 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/convolution_backward_data.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/convolution_backward_data.hpp
@@ -3,8 +3,8 @@
 
 #pragma once
 
-#include <cstdlib>
-
+#include <vector>
+#include <memory>
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp"
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/convolution_forward.hpp b/library/include/ck/library/tensor_operation_instance/gpu/convolution_forward.hpp
index 62f28c9b1..2c529e06f 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/convolution_forward.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/convolution_forward.hpp
@@ -3,8 +3,8 @@
 
 #pragma once
 
-#include <cstdlib>
-
+#include <vector>
+#include <memory>
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp
index 381a015eb..8af400cb7 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp
@@ -3,8 +3,7 @@
 
 #pragma once
 
-#include <cstdlib>
-
+#include <vector>
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/device_gemm_mean_squaremean_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/device_gemm_mean_squaremean_instance.hpp
index 682f54675..3d0c34062 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/device_gemm_mean_squaremean_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/device_gemm_mean_squaremean_instance.hpp
@@ -4,7 +4,7 @@
 #pragma once
 
 #include <cstdlib>
-
+#include <vector>
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_gemm_reduce.hpp"
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp
index e230507e7..0a8f2215b 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp
@@ -3,10 +3,8 @@
 
 #pragma once
 
-#include <cstdlib>
 #include <memory>
 #include <vector>
-
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_gemm.hpp"
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp
index 90b6e11b9..2d578cca4 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp
@@ -3,10 +3,8 @@
 
 #pragma once
 
-#include <cstdlib>
 #include <vector>
 #include <memory>
-
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_bilinear.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_bilinear.hpp
index ef70504f2..8f4cd4d96 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_bilinear.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_bilinear.hpp
@@ -3,10 +3,8 @@
 
 #pragma once
 
-#include <cstdlib>
 #include <vector>
 #include <memory>
-
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp
index 8986a7934..e88844694 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp
@@ -3,8 +3,8 @@
 
 #pragma once
 
-#include <cstdlib>
-
+#include <vector>
+#include <memory>
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_gemm_splitk.hpp"
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
index 175932e63..fc4beb0ae 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
@@ -4,7 +4,7 @@
 #pragma once
 
 #include <vector>
-
+#include <memory>
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp
index e38dad165..e97484a5a 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp
@@ -3,8 +3,8 @@
 
 #pragma once
 
-#include <cstdlib>
-
+#include <vector>
+#include <memory>
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_grouped_gemm.hpp"
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/normalization.hpp b/library/include/ck/library/tensor_operation_instance/gpu/normalization.hpp
index 55c67b762..199ed73b4 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/normalization.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/normalization.hpp
@@ -3,8 +3,8 @@
 
 #pragma once
 
-#include <cstdlib>
-
+#include <vector>
+#include <memory>
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_normalization.hpp"
diff --git a/library/include/ck/library/utility/host_conv.hpp b/library/include/ck/library/utility/host_conv.hpp
deleted file mode 100644
index 8348a3089..000000000
--- a/library/include/ck/library/utility/host_conv.hpp
+++ /dev/null
@@ -1,152 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-#include "host_tensor.hpp"
-#include "conv_common.hpp"
-
-template <typename TIn,
-          typename TWei,
-          typename TOut,
-          typename ConvStrides,
-          typename ConvDilations,
-          typename InLeftPads,
-          typename InRightPads>
-void host_conv_nchw_kcyx_nkhw(const Tensor<TIn>& in,
-                              const Tensor<TWei>& wei,
-                              Tensor<TOut>& out,
-                              const ConvStrides& conv_strides,
-                              const ConvDilations& conv_dilations,
-                              const InLeftPads& in_left_pads,
-                              const InRightPads&)
-{
-    constexpr auto I0 = ck::Number<0>{};
-    constexpr auto I1 = ck::Number<1>{};
-
-    auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
-        float v = 0;
-        for(int c = 0; c < wei.mDesc.GetLengths()[1]; ++c)
-        {
-            for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
-            {
-                int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
-                for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
-                {
-                    int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
-                    if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
-                       wi < in.mDesc.GetLengths()[3])
-                    {
-                        v += ck::type_convert<float>(in(n, c, hi, wi)) *
-                             ck::type_convert<float>(wei(k, c, y, x));
-                    }
-                }
-            }
-        }
-        out(n, k, ho, wo) = ck::type_convert<TOut>(v);
-    };
-
-    make_ParallelTensorFunctor(f_nchw,
-                               out.mDesc.GetLengths()[0],
-                               out.mDesc.GetLengths()[1],
-                               out.mDesc.GetLengths()[2],
-                               out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
-}
-
-template <typename TIn,
-          typename TWei,
-          typename TOut,
-          typename ConvStrides,
-          typename ConvDilations,
-          typename InLeftPads,
-          typename InRightPads>
-void host_conv3d_ndhwc_kzyxc_ndhwk(const Tensor<TIn>& in,
-                                   const Tensor<TWei>& wei,
-                                   Tensor<TOut>& out,
-                                   const ConvStrides& conv_strides,
-                                   const ConvDilations& conv_dilations,
-                                   const InLeftPads& in_left_pads,
-                                   const InRightPads&)
-{
-    using namespace ck;
-
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    const auto Di     = in.mDesc.GetLengths()[1];
-    const auto Hi     = in.mDesc.GetLengths()[2];
-    const auto Wi     = in.mDesc.GetLengths()[3];
-    const auto Z      = wei.mDesc.GetLengths()[1];
-    const auto Y      = wei.mDesc.GetLengths()[2];
-    const auto X      = wei.mDesc.GetLengths()[3];
-    const auto C      = wei.mDesc.GetLengths()[4];
-
-    auto f_ndhwc = [&](auto n, auto do_tmp, auto ho_tmp, auto wo_tmp, auto k) {
-        // do__ must be converted to signed integer, otherwise zmin might be wrong in cases
-        // negative values.
-        const int do_ = static_cast<int>(do_tmp);
-        const int ho  = static_cast<int>(ho_tmp);
-        const int wo  = static_cast<int>(wo_tmp);
-        const int zmin =
-            std::max(0,
-                     (in_left_pads[I0] - do_ * conv_strides[I0] + conv_dilations[I0] - 1) /
-                         conv_dilations[I0]);
-        const int ymin =
-            std::max(0,
-                     (in_left_pads[I1] - ho * conv_strides[I1] + conv_dilations[I1] - 1) /
-                         conv_dilations[I1]);
-        const int xmin =
-            std::max(0,
-                     (in_left_pads[I2] - wo * conv_strides[I2] + conv_dilations[I2] - 1) /
-                         conv_dilations[I2]);
-        const int zmax =
-            std::min(Z, (in_left_pads[I0] - do_ * conv_strides[I0] + Di) / conv_dilations[I0]);
-        const int ymax =
-            std::min(Y, (in_left_pads[I1] - ho * conv_strides[I1] + Hi) / conv_dilations[I1]);
-        const int xmax =
-            std::min(X, (in_left_pads[I2] - wo * conv_strides[I2] + Wi) / conv_dilations[I2]);
-        const int di_min = do_ * conv_strides[I0] + zmin * conv_dilations[I0] - in_left_pads[I0];
-        const int hi_min = ho * conv_strides[I1] + ymin * conv_dilations[I1] - in_left_pads[I1];
-        const int wi_min = wo * conv_strides[I2] + xmin * conv_dilations[I2] - in_left_pads[I2];
-
-        double v = 0;
-
-        const TIn* in_n   = in.mData.data() + n * Di * Hi * Wi * C;
-        const TWei* wei_k = wei.mData.data() + k * Z * Y * X * C;
-
-        int di = di_min;
-        for(int z = zmin; z < zmax; ++z, di += conv_dilations[I0])
-        {
-            const TIn* in_n_di  = in_n + di * Hi * Wi * C;
-            const TWei* wei_k_z = wei_k + z * Y * X * C;
-            int hi              = hi_min;
-
-            for(int y = ymin; y < ymax; ++y, hi += conv_dilations[I1])
-            {
-                const TIn* in_n_di_hi = in_n_di + hi * Wi * C;
-                const TWei* wei_k_z_y = wei_k_z + y * X * C;
-                int wi                = wi_min;
-
-                for(int x = xmin; x < xmax; ++x, wi += conv_dilations[I2])
-                {
-                    const TIn* in_n_di_hi_wi = in_n_di_hi + wi * C;
-                    const TWei* wei_k_z_y_x  = wei_k_z_y + x * C;
-
-                    for(int c = 0; c < C; ++c)
-                    {
-                        v += static_cast<const double>(in_n_di_hi_wi[c]) *
-                             static_cast<const double>(wei_k_z_y_x[c]);
-                    }
-                }
-            }
-        }
-
-        out(n, do_, ho, wo, k) = v;
-    };
-
-    make_ParallelTensorFunctor(f_ndhwc,
-                               out.mDesc.GetLengths()[0],
-                               out.mDesc.GetLengths()[1],
-                               out.mDesc.GetLengths()[2],
-                               out.mDesc.GetLengths()[3],
-                               out.mDesc.GetLengths()[4])(std::thread::hardware_concurrency() - 4);
-}
diff --git a/library/include/ck/library/utility/op_instance_engine.hpp b/library/include/ck/library/utility/op_instance_engine.hpp
deleted file mode 100644
index 78812e8c8..000000000
--- a/library/include/ck/library/utility/op_instance_engine.hpp
+++ /dev/null
@@ -1,249 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <memory>
-#include <stdexcept>
-#include <tuple>
-#include <utility>
-#include <vector>
-
-#include "ck/utility/functional2.hpp"
-#include "ck/tensor_operation/gpu/device/device_base.hpp"
-
-#include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/device_memory.hpp"
-#include "ck/library/utility/host_tensor.hpp"
-
-namespace ck {
-namespace utils {
-
-struct ProfileBestConfig
-{
-    std::string best_op_name;
-    float best_avg_time   = std::numeric_limits<float>::max();
-    float best_tflops     = std::numeric_limits<float>::max();
-    float best_gb_per_sec = std::numeric_limits<float>::max();
-};
-
-/**
- * @brief      This class describes an operation instance(s).
- *
- *             Op instance defines a particular specializations of operator
- *             template. Thanks to this specific input/output data types, data
- *             layouts and modifying elementwise operations it is able to create
- *             it's input/output tensors, provide pointers to instances which
- *             can execute it and all operation specific parameters.
- */
-template <typename OutDataType, typename... InArgTypes>
-class OpInstance
-{
-    public:
-    template <typename T>
-    using TensorPtr      = std::unique_ptr<Tensor<T>>;
-    using InTensorsTuple = std::tuple<TensorPtr<InArgTypes>...>;
-    using DeviceMemPtr   = std::unique_ptr<DeviceMem>;
-    using DeviceBuffers  = std::vector<DeviceMemPtr>;
-
-    OpInstance()                  = default;
-    OpInstance(const OpInstance&) = default;
-    OpInstance& operator=(const OpInstance&) = default;
-    virtual ~OpInstance(){};
-
-    virtual InTensorsTuple GetInputTensors() const         = 0;
-    virtual TensorPtr<OutDataType> GetOutputTensor() const = 0;
-    virtual std::unique_ptr<tensor_operation::device::BaseInvoker>
-    MakeInvokerPointer(tensor_operation::device::BaseOperator*) const = 0;
-    virtual std::unique_ptr<tensor_operation::device::BaseArgument>
-    MakeArgumentPointer(tensor_operation::device::BaseOperator*,
-                        const DeviceBuffers&,
-                        const DeviceMemPtr&) const = 0;
-    virtual std::size_t GetFlops() const           = 0;
-    virtual std::size_t GetBtype() const           = 0;
-};
-
-/**
- * @brief      A generic operation instance run engine.
- */
-template <typename OutDataType, typename... InArgTypes>
-class OpInstanceRunEngine
-{
-    public:
-    using OpInstanceT = OpInstance<InArgTypes..., OutDataType>;
-    template <typename T>
-    using TensorPtr        = std::unique_ptr<Tensor<T>>;
-    using DeviceMemPtr     = std::unique_ptr<DeviceMem>;
-    using InTensorsTuple   = std::tuple<TensorPtr<InArgTypes>...>;
-    using DeviceBuffers    = std::vector<DeviceMemPtr>;
-    using InArgsTypesTuple = std::tuple<InArgTypes...>;
-
-    OpInstanceRunEngine() = delete;
-
-    template <typename ReferenceOp = std::function<void()>>
-    OpInstanceRunEngine(const OpInstanceT& op_instance,
-                        const ReferenceOp& reference_op = ReferenceOp{},
-                        bool do_verification            = true)
-        : op_instance_{op_instance}
-    {
-        in_tensors_ = op_instance_.GetInputTensors();
-        out_tensor_ = op_instance_.GetOutputTensor();
-
-        if constexpr(std::is_invocable_v<ReferenceOp,
-                                         const Tensor<InArgTypes>&...,
-                                         Tensor<OutDataType>&>)
-        {
-            if(do_verification)
-            {
-                ref_output_ = op_instance_.GetOutputTensor();
-                CallRefOpUnpackArgs(reference_op, std::make_index_sequence<kNInArgs_>{});
-            }
-        }
-        AllocateDeviceInputTensors(std::make_index_sequence<kNInArgs_>{});
-        out_device_buffer_ = std::make_unique<DeviceMem>(sizeof(OutDataType) *
-                                                         out_tensor_->mDesc.GetElementSpaceSize());
-        out_device_buffer_->SetZero();
-    }
-
-    virtual ~OpInstanceRunEngine(){};
-
-    template <typename OpInstancePtr>
-    bool Test(const std::vector<OpInstancePtr>& op_ptrs)
-    {
-        bool res{true};
-        for(auto& op_ptr : op_ptrs)
-        {
-            auto invoker  = op_instance_.MakeInvokerPointer(op_ptr.get());
-            auto argument = op_instance_.MakeArgumentPointer(
-                op_ptr.get(), in_device_buffers_, out_device_buffer_);
-            if(op_ptr->IsSupportedArgument(argument.get()))
-            {
-                std::cout << "Testing instance: " << op_ptr->GetTypeString() << std::endl;
-                invoker->Run(argument.get());
-                out_device_buffer_->FromDevice(out_tensor_->mData.data());
-                if(!ref_output_)
-                {
-                    throw std::runtime_error(
-                        "OpInstanceRunEngine::Test: Reference value not availabe."
-                        " You have to provide reference function.");
-                }
-                // TODO: enable flexible use of custom check_error functions
-                bool inst_res = CheckErr(out_tensor_->mData, ref_output_->mData);
-                std::cout << (inst_res ? "SUCCESS" : "FAILURE") << std::endl;
-                res = res && inst_res;
-                out_device_buffer_->SetZero();
-            }
-            else
-            {
-                std::cout << "Given conv problem is not supported by instance: \n\t>>>>"
-                          << op_ptr->GetTypeString() << std::endl;
-            }
-        }
-        return res;
-    }
-
-    template <typename OpInstancePtr>
-    ProfileBestConfig Profile(const std::vector<OpInstancePtr>& op_ptrs,
-                              bool time_kernel     = false,
-                              bool do_verification = false,
-                              bool do_log          = false)
-    {
-        ProfileBestConfig best_config;
-
-        for(auto& op_ptr : op_ptrs)
-        {
-            auto invoker  = op_instance_.MakeInvokerPointer(op_ptr.get());
-            auto argument = op_instance_.MakeArgumentPointer(
-                op_ptr.get(), in_device_buffers_, out_device_buffer_);
-            if(op_ptr->IsSupportedArgument(argument.get()))
-            {
-                std::string op_name = op_ptr->GetTypeString();
-                float avg_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
-
-                std::size_t flops     = op_instance_.GetFlops();
-                std::size_t num_btype = op_instance_.GetBtype();
-                float tflops          = static_cast<float>(flops) / 1.E9 / avg_time;
-                float gb_per_sec      = num_btype / 1.E6 / avg_time;
-
-                std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec
-                          << " GB/s, " << op_name << std::endl;
-
-                if(avg_time < best_config.best_avg_time)
-                {
-                    best_config.best_op_name    = op_name;
-                    best_config.best_tflops     = tflops;
-                    best_config.best_gb_per_sec = gb_per_sec;
-                    best_config.best_avg_time   = avg_time;
-                }
-
-                if(do_verification)
-                {
-                    out_device_buffer_->FromDevice(out_tensor_->mData.data());
-                    if(!ref_output_)
-                    {
-                        throw std::runtime_error(
-                            "OpInstanceRunEngine::Profile: Reference value not availabe."
-                            " You have to provide reference function.");
-                    }
-                    // TODO: enable flexible use of custom check_error functions
-                    CheckErr(out_tensor_->mData, ref_output_->mData);
-
-                    if(do_log) {}
-                }
-                out_device_buffer_->SetZero();
-            }
-        }
-        return best_config;
-    }
-
-    void SetAtol(double a) { atol_ = a; }
-    void SetRtol(double r) { rtol_ = r; }
-
-    private:
-    template <typename F, std::size_t... Is>
-    void CallRefOpUnpackArgs(const F& f, std::index_sequence<Is...>) const
-    {
-        f(*std::get<Is>(in_tensors_)..., *ref_output_);
-    }
-
-    template <std::size_t... Is>
-    void AllocateDeviceInputTensors(std::index_sequence<Is...>)
-    {
-        (AllocateDeviceInputTensorsImpl<Is>(), ...);
-    }
-
-    template <std::size_t Index>
-    void AllocateDeviceInputTensorsImpl()
-    {
-        const auto& ts = std::get<Index>(in_tensors_);
-        in_device_buffers_
-            .emplace_back(
-                std::make_unique<DeviceMem>(sizeof(std::tuple_element_t<Index, InArgsTypesTuple>) *
-                                            ts->mDesc.GetElementSpaceSize()))
-            ->ToDevice(ts->mData.data());
-    }
-
-    static constexpr std::size_t kNInArgs_ = std::tuple_size_v<InTensorsTuple>;
-    const OpInstanceT& op_instance_;
-    double rtol_{1e-5};
-    double atol_{1e-8};
-
-    InTensorsTuple in_tensors_;
-    TensorPtr<OutDataType> out_tensor_;
-    TensorPtr<OutDataType> ref_output_;
-
-    DeviceBuffers in_device_buffers_;
-    DeviceMemPtr out_device_buffer_;
-
-    template <typename T>
-    bool CheckErr(const std::vector<T>& dev_out, const std::vector<T>& ref_out) const
-    {
-        return ck::utils::check_err(dev_out, ref_out, "Error: incorrect results!", rtol_, atol_);
-    }
-};
-
-} // namespace utils
-} // namespace ck
diff --git a/profiler/include/profiler/data_type_enum_helper.hpp b/profiler/include/profiler/data_type_enum_helper.hpp
deleted file mode 100644
index d9bd5e1a4..000000000
--- a/profiler/include/profiler/data_type_enum_helper.hpp
+++ /dev/null
@@ -1,77 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma
-
-#include "ck/utility/data_type.hpp"
-#include "profiler/data_type_enum.hpp"
-
-namespace ck {
-
-template <DataTypeEnum DataTypeEnum>
-struct get_datatype_from_enum;
-
-template <>
-struct get_datatype_from_enum<DataTypeEnum::Int8>
-{
-    using type = int8_t;
-};
-
-template <>
-struct get_datatype_from_enum<DataTypeEnum::Int32>
-{
-    using type = int32_t;
-};
-
-template <>
-struct get_datatype_from_enum<DataTypeEnum::Half>
-{
-    using type = half_t;
-};
-
-template <>
-struct get_datatype_from_enum<DataTypeEnum::Float>
-{
-    using type = float;
-};
-
-template <>
-struct get_datatype_from_enum<DataTypeEnum::Double>
-{
-    using type = double;
-};
-
-template <typename T>
-struct get_datatype_enum_from_type;
-
-template <>
-struct get_datatype_enum_from_type<int8_t>
-{
-    static constexpr DataTypeEnum value = DataTypeEnum::Int8;
-};
-
-template <>
-struct get_datatype_enum_from_type<int32_t>
-{
-    static constexpr DataTypeEnum value = DataTypeEnum::Int32;
-};
-
-template <>
-struct get_datatype_enum_from_type<half_t>
-{
-    static constexpr DataTypeEnum value = DataTypeEnum::Half;
-};
-
-template <>
-struct get_datatype_enum_from_type<float>
-{
-    static constexpr DataTypeEnum value = DataTypeEnum::Float;
-};
-
-template <>
-struct get_datatype_enum_from_type<double>
-{
-    static constexpr DataTypeEnum value = DataTypeEnum::Double;
-};
-
-} // namespace ck
diff --git a/profiler/include/profiler/profile_convnd_bwd_data_impl.hpp b/profiler/include/profiler/profile_convnd_bwd_data_impl.hpp
deleted file mode 100644
index 1e69ebc8b..000000000
--- a/profiler/include/profiler/profile_convnd_bwd_data_impl.hpp
+++ /dev/null
@@ -1,486 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-
-#include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/conv_util.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp"
-
-using F16  = ck::half_t;
-using F32  = float;
-using BF16 = ck::bhalf_t;
-using INT8 = int8_t;
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-using DeviceConvBwdDataNoOpPtr =
-    DeviceConvBwdDataPtr<ck::tensor_operation::element_wise::PassThrough,
-                         ck::tensor_operation::element_wise::PassThrough,
-                         ck::tensor_operation::element_wise::PassThrough>;
-void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instances(
-    std::vector<DeviceConvBwdDataNoOpPtr>&);
-void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instances(
-    std::vector<DeviceConvBwdDataNoOpPtr>&);
-void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instances(
-    std::vector<DeviceConvBwdDataNoOpPtr>&);
-void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instances(
-    std::vector<DeviceConvBwdDataNoOpPtr>&);
-
-void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances(
-    std::vector<DeviceConvBwdDataNoOpPtr>&);
-void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances(
-    std::vector<DeviceConvBwdDataNoOpPtr>&);
-void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances(
-    std::vector<DeviceConvBwdDataNoOpPtr>&);
-void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances(
-    std::vector<DeviceConvBwdDataNoOpPtr>&);
-
-void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instances(
-    std::vector<DeviceConvBwdDataNoOpPtr>&);
-void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instances(
-    std::vector<DeviceConvBwdDataNoOpPtr>&);
-void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(
-    std::vector<DeviceConvBwdDataNoOpPtr>&);
-void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instances(
-    std::vector<DeviceConvBwdDataNoOpPtr>&);
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
-
-namespace ck {
-namespace profiler {
-using DeviceConvBwdDataNoOpPtr = ck::tensor_operation::device::instance::DeviceConvBwdDataNoOpPtr;
-
-template <typename InLayout>
-HostTensorDescriptor get_input_host_tensor_descriptor(const std::vector<std::size_t>& dims,
-                                                      int num_dim_spatial = 2)
-{
-    namespace tl = ck::tensor_layout::convolution;
-
-    switch(num_dim_spatial)
-    {
-    case 3: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, InLayout{});
-    }
-    case 2: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, InLayout{});
-    }
-    case 1: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, InLayout{});
-    }
-    default: {
-        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
-    }
-    }
-}
-template <typename WeiLayout>
-HostTensorDescriptor get_filters_host_tensor_descriptor(const std::vector<std::size_t>& dims,
-                                                        int num_dim_spatial = 2)
-{
-    namespace tl = ck::tensor_layout::convolution;
-
-    switch(num_dim_spatial)
-    {
-    case 3: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, WeiLayout{});
-    }
-    case 2: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, WeiLayout{});
-    }
-    case 1: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, WeiLayout{});
-    }
-    default: {
-        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
-    }
-    }
-}
-template <typename OutLayout>
-HostTensorDescriptor get_output_host_ensor_descriptor(const std::vector<std::size_t>& dims,
-                                                      int num_dim_spatial = 2)
-{
-    namespace tl = ck::tensor_layout::convolution;
-
-    switch(num_dim_spatial)
-    {
-    case 3: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, OutLayout{});
-    }
-    case 2: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, OutLayout{});
-    }
-    case 1: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, OutLayout{});
-    }
-    default: {
-        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
-    }
-    }
-}
-template <typename InDataType, typename WeiDataType, typename OutDataType>
-void get_device_conv_bwd_data_op_ptr(
-    InDataType, WeiDataType, OutDataType, std::vector<DeviceConvBwdDataNoOpPtr>&, int)
-{
-    std::cout << "can not find device conv bwd data" << std::endl;
-    exit(1);
-}
-template <>
-void get_device_conv_bwd_data_op_ptr(
-    F32, F32, F32, std::vector<DeviceConvBwdDataNoOpPtr>& conv_ptrs, int num_dim_spatial)
-{
-    switch(num_dim_spatial)
-    {
-    case 1:
-        ck::tensor_operation::device::instance::
-            add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instances(conv_ptrs);
-        break;
-    case 2:
-        ck::tensor_operation::device::instance::
-            add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances(conv_ptrs);
-        break;
-    case 3:
-        ck::tensor_operation::device::instance::
-            add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instances(conv_ptrs);
-        break;
-    default: break;
-    }
-}
-template <>
-void get_device_conv_bwd_data_op_ptr(
-    F16, F16, F16, std::vector<DeviceConvBwdDataNoOpPtr>& conv_ptrs, int num_dim_spatial)
-{
-    switch(num_dim_spatial)
-    {
-    case 1:
-        ck::tensor_operation::device::instance::
-            add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instances(conv_ptrs);
-        break;
-    case 2:
-        ck::tensor_operation::device::instance::
-            add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
-        break;
-    case 3:
-        ck::tensor_operation::device::instance::
-            add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instances(conv_ptrs);
-        break;
-    default: break;
-    }
-}
-template <>
-void get_device_conv_bwd_data_op_ptr(
-    BF16, BF16, BF16, std::vector<DeviceConvBwdDataNoOpPtr>& conv_ptrs, int num_dim_spatial)
-{
-    switch(num_dim_spatial)
-    {
-    case 1:
-        ck::tensor_operation::device::instance::
-            add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instances(conv_ptrs);
-        break;
-    case 2:
-        ck::tensor_operation::device::instance::
-            add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances(conv_ptrs);
-        break;
-    case 3:
-        ck::tensor_operation::device::instance::
-            add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(conv_ptrs);
-        break;
-    default: break;
-    }
-}
-template <>
-void get_device_conv_bwd_data_op_ptr(
-    INT8, INT8, INT8, std::vector<DeviceConvBwdDataNoOpPtr>& conv_ptrs, int num_dim_spatial)
-{
-    switch(num_dim_spatial)
-    {
-    case 1:
-        ck::tensor_operation::device::instance::
-            add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instances(conv_ptrs);
-        break;
-    case 2:
-        ck::tensor_operation::device::instance::
-            add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances(conv_ptrs);
-        break;
-    case 3:
-        ck::tensor_operation::device::instance::
-            add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instances(conv_ptrs);
-        break;
-    default: break;
-    }
-}
-
-template <typename T>
-static bool check_out(const Tensor<T>& ref, const Tensor<T>& result)
-{
-    float max_diff = 1e-6;
-
-    for(std::size_t i = 0; i < ref.mData.size(); ++i)
-    {
-        float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
-        if(max_diff < diff)
-        {
-            return false;
-        }
-    }
-    return true;
-}
-template <typename DataType>
-void show_data_nhwc_layout(Tensor<DataType>& nhwc)
-{
-    std::cout << "[";
-    for(int n = 0; n < ck::type_convert<int>(nhwc.mDesc.GetLengths()[0]); n++)
-    {
-        std::cout << "[";
-        for(int hi = 0; hi < ck::type_convert<int>(nhwc.mDesc.GetLengths()[2]); hi++)
-        {
-            std::cout << "[";
-            for(int wi = 0; wi < ck::type_convert<int>(nhwc.mDesc.GetLengths()[3]); wi++)
-            {
-                std::cout << "[";
-                for(int c = 0; c < ck::type_convert<int>(nhwc.mDesc.GetLengths()[1]); c++)
-                {
-                    std::cout << static_cast<float>(nhwc(n, c, hi, wi)) << "  ";
-                }
-                std::cout << "]";
-            }
-            std::cout << "]";
-        }
-        std::cout << "]";
-    }
-    std::cout << "]";
-}
-
-template <int NDimSpatial,
-          typename InDataType,
-          typename WeiDataType,
-          typename OutDataType,
-          typename AccDataType,
-          typename InLayout,
-          typename WeiLayout,
-          typename OutLayout>
-bool profile_convnd_bwd_data_impl(int do_verification,
-                                  int init_method,
-                                  bool do_log,
-                                  bool time_kernel,
-                                  ck::index_t N,
-                                  ck::index_t K,
-                                  ck::index_t C,
-                                  const std::vector<ck::index_t>& input_spatial_lengths,
-                                  const std::vector<ck::index_t>& filter_spatial_lengths,
-                                  const std::vector<ck::index_t>& output_spatial_lengths,
-                                  const std::vector<ck::index_t>& conv_filter_strides,
-                                  const std::vector<ck::index_t>& conv_filter_dilations,
-                                  const std::vector<ck::index_t>& input_left_pads,
-                                  const std::vector<ck::index_t>& input_right_pads)
-{
-    using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
-    using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
-    using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
-
-    const auto in_element_op  = InElementOp{};
-    const auto wei_element_op = WeiElementOp{};
-    const auto out_element_op = OutElementOp{};
-
-    std::vector<std::size_t> input_dims{static_cast<std::size_t>(N), static_cast<std::size_t>(C)};
-    input_dims.insert(
-        std::end(input_dims), std::begin(input_spatial_lengths), std::end(input_spatial_lengths));
-
-    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(K), static_cast<std::size_t>(C)};
-    filter_dims.insert(std::end(filter_dims),
-                       std::begin(filter_spatial_lengths),
-                       std::end(filter_spatial_lengths));
-
-    std::vector<std::size_t> output_dims{static_cast<std::size_t>(N), static_cast<std::size_t>(K)};
-    output_dims.insert(std::end(output_dims),
-                       std::begin(output_spatial_lengths),
-                       std::end(output_spatial_lengths));
-
-    Tensor<InDataType> input_host_result(
-        get_input_host_tensor_descriptor<InLayout>(input_dims, NDimSpatial));
-    Tensor<InDataType> input_device_result(
-        get_input_host_tensor_descriptor<InLayout>(input_dims, NDimSpatial));
-    Tensor<WeiDataType> weights(
-        get_filters_host_tensor_descriptor<WeiLayout>(filter_dims, NDimSpatial));
-    Tensor<OutDataType> output(
-        get_output_host_ensor_descriptor<OutLayout>(output_dims, NDimSpatial));
-
-    std::cout << "input: " << input_host_result.mDesc << std::endl;
-    std::cout << "weights: " << weights.mDesc << std::endl;
-    std::cout << "output: " << output.mDesc << std::endl;
-
-    switch(init_method)
-    {
-    case 0: break;
-    case 1:
-        output.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
-        weights.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
-        break;
-    default:
-        output.GenerateTensorValue(GeneratorTensor_1<OutDataType>{1});
-        weights.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{1});
-    }
-
-    DeviceMem in_device_buf(sizeof(InDataType) * input_device_result.mDesc.GetElementSpace());
-    DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace());
-    DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpace());
-
-    out_device_buf.ToDevice(output.mData.data());
-    wei_device_buf.ToDevice(weights.mData.data());
-
-    // reset input to zero
-    in_device_buf.SetZero();
-
-    if(do_verification)
-    {
-        auto RunReference = [&](auto& ref_conv) {
-            auto ref_invoker = ref_conv.MakeInvoker();
-
-            auto ref_argument = ref_conv.MakeArgument(input_host_result,
-                                                      weights,
-                                                      output,
-                                                      conv_filter_strides,
-                                                      conv_filter_dilations,
-                                                      input_left_pads,
-                                                      input_right_pads,
-                                                      InElementOp{},
-                                                      WeiElementOp{},
-                                                      OutElementOp{});
-            ref_invoker.Run(ref_argument);
-        };
-
-        auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdData<InDataType,
-                                                                         WeiDataType,
-                                                                         OutDataType,
-                                                                         AccDataType,
-                                                                         InElementOp,
-                                                                         WeiElementOp,
-                                                                         OutElementOp,
-                                                                         NDimSpatial>();
-        RunReference(ref_conv);
-    }
-
-    // add device Conv instances
-    std::vector<DeviceConvBwdDataNoOpPtr> conv_ptrs;
-    get_device_conv_bwd_data_op_ptr(
-        InDataType{}, WeiDataType{}, OutDataType{}, conv_ptrs, NDimSpatial);
-
-    if(conv_ptrs.size() <= 0)
-    {
-        throw std::runtime_error("wrong! no device Conv instance found");
-    }
-
-    std::string best_conv_name;
-    float best_ave_time   = 0;
-    float best_tflops     = 0;
-    float best_gb_per_sec = 0;
-
-    // profile device Conv instances
-    bool success = true;
-    for(auto& conv_ptr : conv_ptrs)
-    {
-        auto argument_ptr = conv_ptr->MakeArgumentPointer(
-            static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
-            static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
-            static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-            N,
-            K,
-            C,
-            input_spatial_lengths,
-            filter_spatial_lengths,
-            output_spatial_lengths,
-            conv_filter_strides,
-            conv_filter_dilations,
-            input_left_pads,
-            input_right_pads,
-            in_element_op,
-            wei_element_op,
-            out_element_op);
-
-        auto invoker_ptr = conv_ptr->MakeInvokerPointer();
-
-        if(conv_ptr->IsSupportedArgument(argument_ptr.get()))
-        {
-            std::string conv_name = conv_ptr->GetTypeString();
-
-            float ave_time =
-                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
-
-            std::size_t flop =
-                ck::utils::conv::get_flops(N, C, K, filter_spatial_lengths, output_spatial_lengths);
-            std::size_t num_btype =
-                ck::utils::conv::get_btype<InDataType, WeiDataType, OutDataType>(
-                    N, C, K, input_spatial_lengths, filter_spatial_lengths, output_spatial_lengths);
-
-            float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
-            float gb_per_sec = num_btype / 1.E6 / ave_time;
-
-            std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
-                      << " GB/s" << std::endl;
-
-            if(tflops > best_tflops)
-            {
-                best_conv_name  = conv_name;
-                best_tflops     = tflops;
-                best_ave_time   = ave_time;
-                best_gb_per_sec = gb_per_sec;
-            }
-
-            if(do_verification)
-            {
-                in_device_buf.FromDevice(input_device_result.mData.data());
-
-                if(!check_out(input_host_result, input_device_result))
-                {
-                    std::cout << "Fail Info: " << conv_ptr->GetTypeString() << std::endl;
-
-                    success = false;
-                }
-                else
-                {
-                    std::cout << "Pass Info: " << conv_ptr->GetTypeString() << std::endl;
-                }
-
-                success = ck::utils::check_err(input_host_result, input_device_result);
-
-                if(do_log)
-                {
-                    std::cout << "in : ";
-                    show_data_nhwc_layout(output);
-                    std::cout << std::endl;
-
-                    std::cout << "wei: ";
-                    show_data_nhwc_layout(weights);
-                    std::cout << std::endl;
-
-                    std::cout << "out_host  : ";
-                    show_data_nhwc_layout(input_host_result);
-                    std::cout << std::endl;
-
-                    std::cout << "out_device: ";
-                    show_data_nhwc_layout(input_device_result);
-                    std::cout << std::endl;
-                }
-            }
-        }
-    }
-
-    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
-              << best_gb_per_sec << " GB/s, " << best_conv_name << std::endl;
-    return success;
-}
-
-} // namespace profiler
-} // namespace ck
diff --git a/profiler/include/profiler/profile_convnd_bwd_weight_impl.hpp b/profiler/include/profiler/profile_convnd_bwd_weight_impl.hpp
deleted file mode 100644
index e37c887a9..000000000
--- a/profiler/include/profiler/profile_convnd_bwd_weight_impl.hpp
+++ /dev/null
@@ -1,474 +0,0 @@
-#pragma once
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/device_conv_backward_weight.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-
-#include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/conv_util.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp"
-
-using F16  = ck::half_t;
-using F32  = float;
-using BF16 = ck::bhalf_t;
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-using DeviceConvndBwdWeightNoOpPtr =
-    DeviceConvBwdWeightPtr<ck::tensor_operation::element_wise::PassThrough,
-                           ck::tensor_operation::element_wise::PassThrough,
-                           ck::tensor_operation::element_wise::PassThrough>;
-
-void add_device_conv1d_bwd_weight_xdl_nwc_kxc_nwk_f32_instances(
-    std::vector<DeviceConvndBwdWeightNoOpPtr>&);
-void add_device_convnd_bwd_weight_xdl_nhwc_kyxc_nhwk_f32_instances(
-    std::vector<DeviceConvndBwdWeightNoOpPtr>&);
-void add_device_conv3d_bwd_weight_xdl_ndhwc_kzyxc_ndhwk_f32_instances(
-    std::vector<DeviceConvndBwdWeightNoOpPtr>&);
-
-void add_device_conv1d_bwd_weight_xdl_nwc_kxc_nwk_f16_instances(
-    std::vector<DeviceConvndBwdWeightNoOpPtr>&);
-void add_device_convnd_bwd_weight_xdl_nhwc_kyxc_nhwk_f16_instances(
-    std::vector<DeviceConvndBwdWeightNoOpPtr>&);
-void add_device_conv3d_bwd_weight_xdl_ndhwc_kzyxc_ndhwk_f16_instances(
-    std::vector<DeviceConvndBwdWeightNoOpPtr>&);
-
-void add_device_conv1d_bwd_weight_xdl_nwc_kxc_nwk_bf16_instances(
-    std::vector<DeviceConvndBwdWeightNoOpPtr>&);
-void add_device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_bf16_instances(
-    std::vector<DeviceConvndBwdWeightNoOpPtr>&);
-void add_device_conv3d_bwd_weight_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(
-    std::vector<DeviceConvndBwdWeightNoOpPtr>&);
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
-
-namespace ck {
-namespace profiler {
-
-using DeviceConvndBwdWeightNoOpPtr =
-    ck::tensor_operation::device::instance::DeviceConvndBwdWeightNoOpPtr;
-
-template <typename InLayout>
-HostTensorDescriptor get_input_host_tensor_descriptor(const std::vector<std::size_t>& dims,
-                                                      int num_dim_spatial = 2)
-{
-    namespace tl = ck::tensor_layout::convolution;
-
-    switch(num_dim_spatial)
-    {
-    case 3: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, InLayout{});
-    }
-    case 2: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, InLayout{});
-    }
-    case 1: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, InLayout{});
-    }
-    default: {
-        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
-    }
-    }
-}
-
-template <typename WeiLayout>
-HostTensorDescriptor get_filters_host_tensor_descriptor(const std::vector<std::size_t>& dims,
-                                                        int num_dim_spatial = 2)
-{
-    namespace tl = ck::tensor_layout::convolution;
-
-    switch(num_dim_spatial)
-    {
-    case 3: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, WeiLayout{});
-    }
-    case 2: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, WeiLayout{});
-    }
-    case 1: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, WeiLayout{});
-    }
-    default: {
-        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
-    }
-    }
-}
-
-template <typename OutLayout>
-HostTensorDescriptor get_output_host_ensor_descriptor(const std::vector<std::size_t>& dims,
-                                                      int num_dim_spatial = 2)
-{
-    namespace tl = ck::tensor_layout::convolution;
-
-    switch(num_dim_spatial)
-    {
-    case 3: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, OutLayout{});
-    }
-    case 2: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, OutLayout{});
-    }
-    case 1: {
-        return ck::utils::conv::get_host_tensor_descriptor(dims, OutLayout{});
-    }
-    default: {
-        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
-    }
-    }
-}
-
-template <typename InDataType, typename WeiDataType, typename OutDataType>
-void get_device_conv_bwd_weight_op_ptr(
-    InDataType, WeiDataType, OutDataType, std::vector<DeviceConvndBwdWeightNoOpPtr>&, int)
-{
-    std::cout << "can not find device conv bwd weight" << std::endl;
-    exit(1);
-}
-
-template <>
-void get_device_conv_bwd_weight_op_ptr(
-    F32, F32, F32, std::vector<DeviceConvndBwdWeightNoOpPtr>& conv_ptrs, int num_dim_spatial)
-{
-    switch(num_dim_spatial)
-    {
-    case 1:
-        ck::tensor_operation::device::instance::
-            add_device_conv1d_bwd_weight_xdl_nwc_kxc_nwk_f32_instances(conv_ptrs);
-        break;
-    case 2:
-        ck::tensor_operation::device::instance::
-            add_device_convnd_bwd_weight_xdl_nhwc_kyxc_nhwk_f32_instances(conv_ptrs);
-        break;
-    case 3:
-        ck::tensor_operation::device::instance::
-            add_device_conv3d_bwd_weight_xdl_ndhwc_kzyxc_ndhwk_f32_instances(conv_ptrs);
-        break;
-    default: break;
-    }
-}
-
-template <>
-void get_device_conv_bwd_weight_op_ptr(
-    F16, F16, F16, std::vector<DeviceConvndBwdWeightNoOpPtr>& conv_ptrs, int num_dim_spatial)
-{
-    switch(num_dim_spatial)
-    {
-    case 1:
-        ck::tensor_operation::device::instance::
-            add_device_conv1d_bwd_weight_xdl_nwc_kxc_nwk_f16_instances(conv_ptrs);
-        break;
-    case 2:
-        ck::tensor_operation::device::instance::
-            add_device_convnd_bwd_weight_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
-        break;
-    case 3:
-        ck::tensor_operation::device::instance::
-            add_device_conv3d_bwd_weight_xdl_ndhwc_kzyxc_ndhwk_f16_instances(conv_ptrs);
-        break;
-    default: break;
-    }
-}
-
-template <>
-void get_device_conv_bwd_weight_op_ptr(
-    BF16, BF16, BF16, std::vector<DeviceConvndBwdWeightNoOpPtr>& conv_ptrs, int num_dim_spatial)
-{
-    switch(num_dim_spatial)
-    {
-    case 1:
-        ck::tensor_operation::device::instance::
-            add_device_conv1d_bwd_weight_xdl_nwc_kxc_nwk_bf16_instances(conv_ptrs);
-        break;
-    case 2:
-        ck::tensor_operation::device::instance::
-            add_device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_bf16_instances(conv_ptrs);
-        break;
-    case 3:
-        ck::tensor_operation::device::instance::
-            add_device_conv3d_bwd_weight_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(conv_ptrs);
-        break;
-    default: break;
-    }
-}
-
-template <typename DataType>
-void show_data_nhwc_layout(Tensor<DataType>& nhwc)
-{
-    std::cout << "[";
-    for(int n = 0; n < ck::type_convert<int>(nhwc.mDesc.GetLengths()[0]); n++)
-    {
-        std::cout << "[";
-        for(int hi = 0; hi < ck::type_convert<int>(nhwc.mDesc.GetLengths()[2]); hi++)
-        {
-            std::cout << "[";
-            for(int wi = 0; wi < ck::type_convert<int>(nhwc.mDesc.GetLengths()[3]); wi++)
-            {
-                std::cout << "[";
-                for(int c = 0; c < ck::type_convert<int>(nhwc.mDesc.GetLengths()[1]); c++)
-                {
-                    std::cout << static_cast<float>(nhwc(n, c, hi, wi)) << "  ";
-                }
-                std::cout << "]";
-            }
-            std::cout << "]";
-        }
-        std::cout << "]";
-    }
-    std::cout << "]";
-}
-
-template <int NDimSpatial,
-          typename InDataType,
-          typename WeiDataType,
-          typename OutDataType,
-          typename InLayout,
-          typename WeiLayout,
-          typename OutLayout>
-bool profile_convnd_bwd_weight_impl(int do_verification,
-                                    int init_method,
-                                    bool do_log,
-                                    bool time_kernel,
-                                    ck::index_t N,
-                                    ck::index_t K,
-                                    ck::index_t C,
-                                    std::vector<ck::index_t> input_spatial_lengths,
-                                    std::vector<ck::index_t> filter_spatial_lengths,
-                                    std::vector<ck::index_t> output_spatial_lengths,
-                                    std::vector<ck::index_t> conv_filter_strides,
-                                    std::vector<ck::index_t> conv_filter_dilations,
-                                    std::vector<ck::index_t> input_left_pads,
-                                    std::vector<ck::index_t> input_right_pads,
-                                    ck::index_t split_k)
-{
-    using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
-    using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
-    using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
-
-    const auto in_element_op  = InElementOp{};
-    const auto wei_element_op = WeiElementOp{};
-    const auto out_element_op = OutElementOp{};
-
-    std::vector<std::size_t> input_dims{static_cast<std::size_t>(N), static_cast<std::size_t>(C)};
-    input_dims.insert(
-        std::end(input_dims), std::begin(input_spatial_lengths), std::end(input_spatial_lengths));
-
-    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(K), static_cast<std::size_t>(C)};
-    filter_dims.insert(std::end(filter_dims),
-                       std::begin(filter_spatial_lengths),
-                       std::end(filter_spatial_lengths));
-
-    std::vector<std::size_t> output_dims{static_cast<std::size_t>(N), static_cast<std::size_t>(K)};
-    output_dims.insert(std::end(output_dims),
-                       std::begin(output_spatial_lengths),
-                       std::end(output_spatial_lengths));
-
-    Tensor<InDataType> input(get_input_host_tensor_descriptor<InLayout>(input_dims, NDimSpatial));
-    Tensor<WeiDataType> weights_host_result(
-        get_filters_host_tensor_descriptor<WeiLayout>(filter_dims, NDimSpatial));
-    Tensor<WeiDataType> weights_device_result(
-        get_filters_host_tensor_descriptor<WeiLayout>(filter_dims, NDimSpatial));
-    Tensor<OutDataType> output(
-        get_output_host_ensor_descriptor<OutLayout>(output_dims, NDimSpatial));
-
-    std::cout << "input: " << input.mDesc << std::endl;
-    std::cout << "weights: " << weights_host_result.mDesc << std::endl;
-    std::cout << "output: " << output.mDesc << std::endl;
-
-    switch(init_method)
-    {
-    case 0: break;
-    case 1:
-        input.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-2, 2});
-        output.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-2, 2});
-        break;
-    default:
-        input.GenerateTensorValue(GeneratorTensor_1<OutDataType>{1});
-        output.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{1});
-    }
-
-    DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace());
-    DeviceMem wei_device_buf(sizeof(WeiDataType) * weights_device_result.mDesc.GetElementSpace());
-    DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpace());
-
-    in_device_buf.ToDevice(input.mData.data());
-    out_device_buf.ToDevice(output.mData.data());
-
-    // reset input to zero
-    wei_device_buf.SetZero();
-
-    if(do_verification)
-    {
-        auto RunReference = [&](auto& ref_conv) {
-            auto ref_invoker = ref_conv.MakeInvoker();
-
-            auto ref_argument = ref_conv.MakeArgument(input,
-                                                      weights_host_result,
-                                                      output,
-                                                      conv_filter_strides,
-                                                      conv_filter_dilations,
-                                                      input_left_pads,
-                                                      input_right_pads,
-                                                      InElementOp{},
-                                                      WeiElementOp{},
-                                                      OutElementOp{});
-            ref_invoker.Run(ref_argument);
-        };
-
-        auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdWeight<InDataType,
-                                                                           WeiDataType,
-                                                                           OutDataType,
-                                                                           InElementOp,
-                                                                           WeiElementOp,
-                                                                           OutElementOp,
-                                                                           NDimSpatial>();
-        RunReference(ref_conv);
-    }
-
-    // add device Conv instances
-    std::vector<DeviceConvndBwdWeightNoOpPtr> conv_ptrs;
-    get_device_conv_bwd_weight_op_ptr(
-        InDataType{}, WeiDataType{}, OutDataType{}, conv_ptrs, NDimSpatial);
-
-    if(conv_ptrs.size() <= 0)
-    {
-        throw std::runtime_error("wrong! no device Conv instance found");
-    }
-
-    std::string best_conv_name;
-    float best_ave_time   = 0;
-    float best_tflops     = 0;
-    float best_gb_per_sec = 0;
-
-    // profile device Conv instances
-    bool success = true;
-    for(auto& conv_ptr : conv_ptrs)
-    {
-        // using atomic, so need to reset input, setzero is done in invoker
-        // if(split_k > 1)
-        //{
-        //    wei_device_buf.SetZero();
-        //}
-
-        auto argument_ptr = conv_ptr->MakeArgumentPointer(
-            static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
-            static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
-            static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-            N,
-            K,
-            C,
-            input_spatial_lengths,
-            filter_spatial_lengths,
-            output_spatial_lengths,
-            conv_filter_strides,
-            conv_filter_dilations,
-            input_left_pads,
-            input_right_pads,
-            in_element_op,
-            wei_element_op,
-            out_element_op,
-            split_k);
-
-        if(!conv_ptr->IsSupportedArgument(argument_ptr.get()))
-        {
-            std::cout << "wrong! device_conv with the specified compilation parameters does "
-                         "not support this Conv problem"
-                      << std::endl;
-            continue;
-        }
-
-        auto invoker_ptr      = conv_ptr->MakeInvokerPointer();
-        std::string conv_name = conv_ptr->GetTypeString();
-        float ave_time        = 0;
-
-        if(std::is_same<InDataType, ck::bhalf_t>::value && split_k > 1)
-        {
-            // alloc work space
-            size_t bwd_weight_workspace_size = conv_ptr->GetWorkSpaceSize(argument_ptr.get());
-            if(bwd_weight_workspace_size <= 0)
-            {
-                printf("wrong work space size\n");
-                exit(1);
-            }
-            DeviceMem wei_work_space_device_buf(bwd_weight_workspace_size);
-            wei_work_space_device_buf.SetZero();
-            conv_ptr->SetWorkSpacePointer(argument_ptr.get(),
-                                          wei_work_space_device_buf.GetDeviceBuffer());
-            ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
-        }
-        else
-        {
-            ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
-        }
-
-        std::size_t flop =
-            ck::utils::conv::get_flops(N, C, K, filter_spatial_lengths, output_spatial_lengths);
-        std::size_t num_btype = ck::utils::conv::get_btype<InDataType, WeiDataType, OutDataType>(
-            N, C, K, input_spatial_lengths, filter_spatial_lengths, output_spatial_lengths);
-
-        float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
-        float gb_per_sec = num_btype / 1.E6 / ave_time;
-
-        std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
-                  << " GB/s" << std::endl;
-
-        if(tflops > best_tflops)
-        {
-            best_conv_name  = conv_name;
-            best_tflops     = tflops;
-            best_ave_time   = ave_time;
-            best_gb_per_sec = gb_per_sec;
-        }
-
-        if(do_verification)
-        {
-            wei_device_buf.FromDevice(weights_device_result.mData.data());
-
-            success = ck::utils::check_err(weights_host_result, weights_device_result);
-
-            if(success == false)
-            {
-                std::cout << "Fail Info: " << conv_ptr->GetTypeString() << std::endl;
-            }
-            else
-            {
-                std::cout << "Pass Info: " << conv_ptr->GetTypeString() << std::endl;
-            }
-
-            if(do_log)
-            {
-                std::cout << "in : ";
-                show_data_nhwc_layout(output);
-                std::cout << std::endl;
-
-                std::cout << "wei: ";
-                show_data_nhwc_layout(weights_host_result);
-                std::cout << std::endl;
-
-                std::cout << "out  : ";
-                show_data_nhwc_layout(input);
-                std::cout << std::endl;
-
-                std::cout << "wei_device: ";
-                show_data_nhwc_layout(weights_device_result);
-                std::cout << std::endl;
-            }
-        }
-    }
-
-    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
-              << best_gb_per_sec << " GB/s, " << best_conv_name << std::endl;
-    return success;
-}
-
-} // namespace profiler
-} // namespace ck
-- 
GitLab


From c2d7a29dec6a9a6f05195fba561d8bcb4b305c3d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= <barkocot@amd.com>
Date: Tue, 30 May 2023 14:07:17 +0200
Subject: [PATCH 43/71] Add instances for fp16/int8 Gemm kernels (Navi21)
 (#717)

* Add instances for fp16/int8 Gemm kernels (Navi21)

* Extend instances with smaller tiles

* Fix SrcVectorTensor for km_kn_mn int8
---
 .../tensor_operation_instance/gpu/gemm.hpp    | 48 +++++++++++
 .../gpu/gemm/CMakeLists.txt                   |  8 ++
 ..._gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp | 34 +++++++-
 ...16_f16_f16_km_kn_mn_irregular_instance.cpp | 73 +++++++++++++++++
 ..._gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp | 34 +++++++-
 ...16_f16_f16_km_nk_mn_irregular_instance.cpp | 73 +++++++++++++++++
 ..._gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp | 34 +++++++-
 ...16_f16_f16_mk_kn_mn_irregular_instance.cpp | 73 +++++++++++++++++
 ..._gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp | 34 +++++++-
 ...16_f16_f16_mk_nk_mn_irregular_instance.cpp | 74 +++++++++++++++++
 ...ice_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp | 35 +++++++-
 ...l_i8_i8_i8_km_kn_mn_irregular_instance.cpp | 80 +++++++++++++++++++
 ...ice_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp | 35 +++++++-
 ...l_i8_i8_i8_km_nk_mn_irregular_instance.cpp | 80 +++++++++++++++++++
 ...ice_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp | 35 +++++++-
 ...l_i8_i8_i8_mk_kn_mn_irregular_instance.cpp | 80 +++++++++++++++++++
 ...ice_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp | 35 +++++++-
 ...l_i8_i8_i8_mk_nk_mn_irregular_instance.cpp | 80 +++++++++++++++++++
 18 files changed, 937 insertions(+), 8 deletions(-)
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_kn_mn_irregular_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_nk_mn_irregular_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_kn_mn_irregular_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_nk_mn_irregular_instance.cpp

diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp
index 0a8f2215b..732d98069 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp
@@ -22,21 +22,41 @@ void add_device_gemm_dl_f16_f16_f16_km_kn_mn_instances(
         DeviceGemm<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
         instances);
 
+void add_device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
 void add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances(
     std::vector<std::unique_ptr<
         DeviceGemm<Col, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
         instances);
 
+void add_device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Col, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
 void add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances(
     std::vector<std::unique_ptr<
         DeviceGemm<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
         instances);
 
+void add_device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
 void add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances(
     std::vector<std::unique_ptr<
         DeviceGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
         instances);
 
+void add_device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
 void add_device_gemm_dl_f32_f32_f32_km_kn_mn_instances(
     std::vector<std::unique_ptr<
         DeviceGemm<Col, Row, Row, F32, F32, F32, PassThrough, PassThrough, PassThrough>>>&
@@ -63,21 +83,41 @@ void add_device_gemm_dl_i8_i8_i8_km_kn_mn_instances(
         DeviceGemm<Col, Row, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>>>&
         instances);
 
+void add_device_gemm_dl_i8_i8_i8_km_kn_mn_irregular_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Col, Row, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
 void add_device_gemm_dl_i8_i8_i8_km_nk_mn_instances(
     std::vector<std::unique_ptr<
         DeviceGemm<Col, Col, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>>>&
         instances);
 
+void add_device_gemm_dl_i8_i8_i8_km_nk_mn_irregular_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Col, Col, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
 void add_device_gemm_dl_i8_i8_i8_mk_kn_mn_instances(
     std::vector<std::unique_ptr<
         DeviceGemm<Row, Row, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>>>&
         instances);
 
+void add_device_gemm_dl_i8_i8_i8_mk_kn_mn_irregular_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Row, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
 void add_device_gemm_dl_i8_i8_i8_mk_nk_mn_instances(
     std::vector<std::unique_ptr<
         DeviceGemm<Row, Col, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>>>&
         instances);
 
+void add_device_gemm_dl_i8_i8_i8_mk_nk_mn_irregular_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Col, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
 void add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances(
     std::vector<std::unique_ptr<
         DeviceGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
@@ -295,6 +335,7 @@ struct DeviceOperationInstanceFactory<
             {
                 add_device_gemm_xdl_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
                 add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
+                add_device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instances(op_ptrs);
                 add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
             }
             else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
@@ -302,6 +343,7 @@ struct DeviceOperationInstanceFactory<
             {
                 add_device_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
                 add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
+                add_device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instances(op_ptrs);
                 add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
                 add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
             }
@@ -310,6 +352,7 @@ struct DeviceOperationInstanceFactory<
             {
                 add_device_gemm_xdl_f16_f16_f16_km_kn_mn_instances(op_ptrs);
                 add_device_gemm_dl_f16_f16_f16_km_kn_mn_instances(op_ptrs);
+                add_device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instances(op_ptrs);
                 add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances(op_ptrs);
             }
             else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
@@ -317,6 +360,7 @@ struct DeviceOperationInstanceFactory<
             {
                 add_device_gemm_xdl_f16_f16_f16_km_nk_mn_instances(op_ptrs);
                 add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances(op_ptrs);
+                add_device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instances(op_ptrs);
                 add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances(op_ptrs);
             }
         }
@@ -352,24 +396,28 @@ struct DeviceOperationInstanceFactory<
             {
                 add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances(op_ptrs);
                 add_device_gemm_dl_i8_i8_i8_mk_kn_mn_instances(op_ptrs);
+                add_device_gemm_dl_i8_i8_i8_mk_kn_mn_irregular_instances(op_ptrs);
             }
             else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
                               is_same_v<CLayout, Row>)
             {
                 add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances(op_ptrs);
                 add_device_gemm_dl_i8_i8_i8_mk_nk_mn_instances(op_ptrs);
+                add_device_gemm_dl_i8_i8_i8_mk_nk_mn_irregular_instances(op_ptrs);
             }
             else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
                               is_same_v<CLayout, Row>)
             {
                 add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances(op_ptrs);
                 add_device_gemm_dl_i8_i8_i8_km_kn_mn_instances(op_ptrs);
+                add_device_gemm_dl_i8_i8_i8_km_kn_mn_irregular_instances(op_ptrs);
             }
             else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
                               is_same_v<CLayout, Row>)
             {
                 add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances(op_ptrs);
                 add_device_gemm_dl_i8_i8_i8_km_nk_mn_instances(op_ptrs);
+                add_device_gemm_dl_i8_i8_i8_km_nk_mn_irregular_instances(op_ptrs);
             }
         }
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
index e20d592c8..d66010af7 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
@@ -33,11 +33,19 @@ add_instance_library(device_gemm_instance
    device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp
    device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp
    device_gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp
+   device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instance.cpp
    device_gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp
+   device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instance.cpp
    device_gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp
+   device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instance.cpp
    device_gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp
+   device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instance.cpp
    device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp
+   device_gemm_dl_i8_i8_i8_mk_kn_mn_irregular_instance.cpp
    device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp
+   device_gemm_dl_i8_i8_i8_mk_nk_mn_irregular_instance.cpp
    device_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp
+   device_gemm_dl_i8_i8_i8_km_kn_mn_irregular_instance.cpp
    device_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp
+   device_gemm_dl_i8_i8_i8_km_nk_mn_irregular_instance.cpp
 )
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp
index 5d2f18e14..35df85b7a 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp
@@ -34,7 +34,39 @@ using device_gemm_dl_f16_f16_f16_km_kn_mn_instances = std::tuple<
         // #########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
         // #########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|               Order|                 |                   |
         // #########|      |      |      |        |        |        |        |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
-        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<2, 1, 4, 2>,      S<8, 1,  32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,      S<8, 1,  32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>
+        // MPerBlock=128, NPerBlock=128
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<4, 4>,       S<4, 4>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=128, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,    64,    16,  2,          4,          4,      1,       S<8, 2>,       S<4, 2>,      S<2, 1, 8, 2>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 8, 2>,        S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,    64,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 4>,      S<2, 1, 8, 2>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 8, 2>,        S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=128
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    64,   128,    16,  2,          4,          4,      1,       S<4, 2>,       S<8, 2>,      S<2, 1, 8, 2>,        S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 8, 2>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    64,   128,    16,  2,          4,          4,      1,       S<2, 4>,       S<2, 8>,      S<2, 1, 8, 2>,        S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 8, 2>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=16, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  2,          1,          4,      1,       S<4, 2>,       S<4, 2>,      S<1, 1, 4, 2>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<4, 1, 4, 2>,       S<4, 1, 16, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=16
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  2,          4,          1,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,       S<16, 1, 4, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<2, 2>,       S<2, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<4, 1, 4, 2>,        S<4, 1, 4, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<1, 4>,       S<1, 4>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<4, 1, 4, 2>,        S<4, 1, 4, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<4, 1>,       S<8, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<8, 1, 4, 2>,       S<4, 1, 16, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<8, 1, 4, 2>,       S<4, 1, 16, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<4, 1>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,       S<32, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,       S<32, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
     // clang-format on
     >;
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instance.cpp
new file mode 100644
index 000000000..d444e29aa
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instance.cpp
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+
+// Compilation parameters for a[k, m] * b[k, n] = c[m, n]
+using device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instances = std::tuple<
+    // clang-format off
+        // #########| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|       BBlockTransfer|     CThreadTransfer|  CThreadTransfer|    CThreadTransfer|
+        // #########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
+        // #########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder|  Lengths_K0_N0_N1_K1|               Order|                 |                   |
+        // #########|      |      |      |        |        |        |        |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
+        // MPerBlock=128, NPerBlock=128
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,   128,    16,  2,          4,          4,      1,       S<4, 4>,       S<4, 4>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,   128,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  2,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    16,    64,    16,  2,          1,          4,      1,       S<2, 4>,       S<2, 4>,      S<1, 1, 4, 2>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<4, 1, 4, 2>,       S<4, 1, 16, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=16
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    16,    16,  2,          4,          1,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,       S<16, 1, 4, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,     8,    64,    32,  2,          1,          2,      1,       S<4, 1>,       S<8, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<8, 1, 4, 2>,       S<4, 1, 16, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<8, 1, 4, 2>,       S<4, 1, 16, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<4, 1>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,       S<32, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,       S<32, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
+    // clang-format on
+    >;
+// Passes the MNPadding DL configurations above to add_device_operation_instances.
+void add_device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(instances,
+                                   device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp
index 01e3b3793..7d0863c95 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp
@@ -34,7 +34,39 @@ using device_gemm_dl_f16_f16_f16_km_nk_mn_instances = std::tuple<
         // #########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
         // #########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder|  Lengths_K0_N0_N1_K1|               Order|                 |                   |
         // #########|      |      |      |        |        |        |        |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
-        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<2, 1, 4, 2>,      S<8, 1,  32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>
+        // MPerBlock=128, NPerBlock=128
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<4, 4>,       S<4, 4>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=128, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,    64,    16,  2,          4,          4,      1,       S<8, 2>,       S<4, 2>,      S<2, 1, 8, 2>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,    64,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 4>,      S<2, 1, 8, 2>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=128
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    64,   128,    16,  2,          4,          4,      1,       S<4, 2>,       S<8, 2>,      S<2, 1, 8, 2>,        S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 2, 2>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    64,   128,    16,  2,          4,          4,      1,       S<2, 4>,       S<2, 8>,      S<2, 1, 8, 2>,        S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 2, 2>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=16, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  2,          1,          4,      1,       S<4, 2>,       S<4, 2>,      S<1, 1, 4, 2>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=16
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  2,          4,          1,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<2, 2>,       S<2, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<1, 4>,       S<1, 4>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<4, 1>,       S<8, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<4, 1>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
     // clang-format on
     >;
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instance.cpp
new file mode 100644
index 000000000..f8f8a0bd3
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instance.cpp
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = ck::half_t; // fills the AData/BData/CData type columns of the table below
+using F32 = float;      // fills the AccData type column of the table below
+
+using Row = ck::tensor_layout::gemm::RowMajor;    // CLayout in the table below
+using Col = ck::tensor_layout::gemm::ColumnMajor; // ALayout and BLayout in the table below
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>; // short spelling for the integer-sequence columns
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough; // A/B/C elementwise op
+// GEMM specialization named by every instance row in this file.
+static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+
+// Compilation parameters for a[k, m] * b[n, k] = c[m, n]; every instance uses GemmMNPadding.
+using device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instances = std::tuple<
+    // clang-format off
+        // #########| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|       BBlockTransfer|     CThreadTransfer|  CThreadTransfer|    CThreadTransfer|
+        // #########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
+        // #########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder|  Lengths_K0_N0_N1_K1|               Order|                 |                   |
+        // #########|      |      |      |        |        |        |        |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
+        // MPerBlock=128, NPerBlock=128
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,   128,    16,  2,          4,          4,      1,       S<4, 4>,       S<4, 4>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,   128,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  2,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    16,    64,    16,  2,          1,          4,      1,       S<2, 4>,       S<2, 4>,      S<1, 1, 4, 2>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=16
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    16,    16,  2,          4,          1,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,     8,    64,    32,  2,          1,          2,      1,       S<4, 1>,       S<8, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<4, 1>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
+    // clang-format on
+    >;
+// Passes `instances` and a value-initialized object of the tuple type above to the add helper.
+void add_device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Col, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances) // NOTE(review): presumably gains one entry per tuple element -- confirm
+{
+    add_device_operation_instances(instances,
+                                   device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp
index 804e86a06..c1b11b19d 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp
@@ -34,7 +34,39 @@ using device_gemm_dl_f16_f16_f16_mk_kn_mn_instances = std::tuple<
         // #########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
         // #########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder|  Lengths_K0_N0_N1_K1|               Order|                 |                   |
         // #########|      |      |      |        |        |        |        |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
-        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,      S<8, 1,  32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>
+        // MPerBlock=128, NPerBlock=128
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<4, 4>,       S<4, 4>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=128, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,    64,    16,  2,          4,          4,      1,       S<8, 2>,       S<4, 2>,      S<8, 1, 2, 2>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 8, 2>,        S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,    64,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 4>,      S<8, 1, 2, 2>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 8, 2>,        S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=128
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    64,   128,    16,  2,          4,          4,      1,       S<4, 2>,       S<8, 2>,      S<8, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 8, 2>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    64,   128,    16,  2,          4,          4,      1,       S<2, 4>,       S<2, 8>,      S<8, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 8, 2>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=16, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  2,          1,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 1, 2>,      S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,        S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=16
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  2,          4,          1,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<2, 2>,       S<2, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<1, 4>,       S<1, 4>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<4, 1>,       S<8, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,        S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,        S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<4, 1>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
     // clang-format on
     >;
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instance.cpp
new file mode 100644
index 000000000..840a4fabe
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instance.cpp
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+
+// Compilation parameters for a[m, k] * b[k, n] = c[m, n]
+using device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instances = std::tuple<
+    // clang-format off
+        // #########| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|       BBlockTransfer|     CThreadTransfer|  CThreadTransfer|    CThreadTransfer|
+        // #########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
+        // #########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder|  Lengths_K0_N0_N1_K1|               Order|                 |                   |
+        // #########|      |      |      |        |        |        |        |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
+        // MPerBlock=128, NPerBlock=128
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,   128,    16,  2,          4,          4,      1,       S<4, 4>,       S<4, 4>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,   128,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  2,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    16,    64,    16,  2,          1,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 1, 2>,      S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,       S<4, 1, 4, 2>,       S<4, 1, 16, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=16
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    16,    16,  2,          4,          1,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,       S<16, 1, 4, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,     8,    64,    32,  2,          1,          2,      1,       S<4, 1>,       S<8, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<4, 1>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
+    // clang-format on
+    >;
+
+void add_device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(instances,
+                                   device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp
index 159fa90f7..9a889e8d5 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp
@@ -35,7 +35,39 @@ using device_gemm_dl_f16_f16_f16_mk_nk_mn_instances =
         //  ########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
         //  ########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder|  Lengths_K0_N0_N1_K1|               Order|                 |                   |
         //  ########|      |      |      |        |        |        |        |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
-        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>
+        // MPerBlock=128, NPerBlock=128
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<4, 4>,       S<4, 4>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=128, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,    64,    16,  2,          4,          4,      1,       S<8, 2>,       S<4, 2>,      S<8, 1, 2, 2>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,    64,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 4>,      S<8, 1, 2, 2>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=128
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    64,   128,    16,  2,          4,          4,      1,       S<4, 2>,       S<8, 2>,      S<8, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 2, 2>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    64,   128,    16,  2,          4,          4,      1,       S<2, 4>,       S<2, 8>,      S<8, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 2, 2>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=16, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  2,          1,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 1, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=16
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  2,          4,          1,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<2, 2>,       S<2, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<1, 4>,       S<1, 4>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<4, 1>,       S<8, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<4, 1>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
         // clang-format on
         >;
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instance.cpp
new file mode 100644
index 000000000..82b1b5dc2
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instance.cpp
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = ck::half_t; // A/B/C data type of every instance listed in this file
+using F32 = float; // accumulator (AccData) type of every instance listed in this file
+
+using Row = ck::tensor_layout::gemm::RowMajor; // layout tag used for A and C below
+using Col = ck::tensor_layout::gemm::ColumnMajor; // layout tag used for B below
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>; // shorthand for the integer sequences in the instance table
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough; // A/B/C elementwise op
+
+static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+
+// Compilation parameters for a[m, k] * b[n, k] = c[m, n] (GemmMNPadding, DstScalarPerVector 1)
+using device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instances =
+    std::tuple<
+        // clang-format off
+        //  ########| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|       BBlockTransfer|     CThreadTransfer|  CThreadTransfer|    CThreadTransfer|
+        //  ########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
+        //  ########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder|  Lengths_K0_N0_N1_K1|               Order|                 |                   |
+        //  ########|      |      |      |        |        |        |        |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
+        // MPerBlock=128, NPerBlock=128
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,   128,    16,  2,          4,          4,      1,       S<4, 4>,       S<4, 4>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,   128,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  2,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    16,    64,    16,  2,          1,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 1, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=16
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    16,    16,  2,          4,          1,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,     8,    64,    32,  2,          1,          2,      1,       S<4, 1>,       S<8, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<4, 1>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
+        // clang-format on
+        >;
+
+void add_device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances) // non-const reference, forwarded unchanged to the helper called below
+{ // Passes `instances` and a value-initialized instance tuple to add_device_operation_instances.
+    add_device_operation_instances(instances,
+                                   device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instances{});
+} // NOTE(review): helper presumably appends one object per tuple element -- confirm in its header
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp
index a4208245e..9f5cebcab 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp
@@ -31,7 +31,40 @@ using device_gemm_dl_i8_i8_i8_km_kn_mn_instances = std::tuple<
         // #########|   Type|    Type|    Type|       Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
         // #########|       |        |        |           |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder|  Lengths_K0_N0_N1_K1|               Order|                 |                   |
         // #########|       |        |        |           |        |        |        |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
-        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  4,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<2, 1, 4, 4>,      S<8, 1,  32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 4, 4>,      S<8, 1,  32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>
+        // MPerBlock=128, NPerBlock=128
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  4,          4,          4,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,   128,    16,  4,          4,          8,      1,       S<8, 2>,       S<4, 2>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,   128,    16,  4,          4,          8,      1,       S<2, 8>,       S<2, 4>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=128, NPerBlock=64
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,    64,    16,  4,          4,          2,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<2, 1, 4, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=64, NPerBlock=128
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,    64,   128,    16,  4,          2,          4,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=32, NPerBlock=32
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<2, 4>,      S<2, 1, 4, 4>,        S<4, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<2, 1, 4, 4>,        S<4, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=16, NPerBlock=64
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  4,          1,          4,      1,       S<4, 2>,       S<4, 2>,      S<1, 1, 4, 4>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<4, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  4,          1,          4,      1,       S<2, 4>,       S<2, 4>,      S<1, 1, 4, 4>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<4, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=16
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  4,          4,          1,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<1, 1, 4, 4>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  4,          4,          1,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<1, 1, 4, 4>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  4,          2,          2,      1,       S<4, 1>,       S<4, 1>,      S<4, 1, 4, 4>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<4, 1, 4, 4>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  4,          1,          2,      1,       S<4, 1>,       S<8, 2>,      S<1, 1, 4, 4>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<8, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  4,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<1, 1, 4, 4>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<8, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  4,          2,          1,      1,       S<8, 2>,       S<4, 1>,      S<8, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<1, 1, 4, 4>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  4,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<1, 1, 4, 4>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  4,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<1, 1, 4, 4>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<1, 1, 4, 4>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  4,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<1, 1, 4, 4>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<1, 1, 4, 4>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  4,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<1, 1, 4, 4>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<1, 1, 4, 4>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  4,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<1, 1, 4, 4>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<1, 1, 4, 4>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
     // clang-format on
     >;
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_kn_mn_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_kn_mn_irregular_instance.cpp
new file mode 100644
index 000000000..043920151
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_kn_mn_irregular_instance.cpp
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using Row = ck::tensor_layout::gemm::RowMajor;    // row-major GEMM layout tag
+using Col = ck::tensor_layout::gemm::ColumnMajor; // column-major GEMM layout tag
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>; // terse spelling for the integer sequences in the table below
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough; // A/B/C elementwise op
+
+static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+
+// DeviceGemmDl int8 configs (int32 acc, GemmMNPadding) for a[k, m] * b[k, n] = c[m, n]
+using device_gemm_dl_i8_i8_i8_km_kn_mn_irregular_instances = std::tuple<
+    // clang-format off
+        // #########|  AData|   BData|   CData|    AccData| ALayout| BLayout| CLayout|           A|           B|           C|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|       BBlockTransfer|     CThreadTransfer|  CThreadTransfer|    CThreadTransfer|
+        // #########|   Type|    Type|    Type|       Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
+        // #########|       |        |        |           |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder|  Lengths_K0_N0_N1_K1|               Order|                 |                   |
+        // #########|       |        |        |           |        |        |        |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
+        // MPerBlock=128, NPerBlock=128
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<8, 2>,       S<4, 2>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<4, 4>,       S<4, 2>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<2, 8>,       S<2, 4>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=128, NPerBlock=64
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,   128,    64,    16,  4,          4,          2,      1,       S<4, 4>,       S<4, 4>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 4, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,   128,    64,    16,  4,          4,          2,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 4, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=128
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,    64,   128,    16,  4,          2,          4,      1,       S<4, 4>,       S<4, 4>,      S<2, 1, 4, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,    64,   128,    16,  4,          2,          4,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=32, NPerBlock=32
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,    32,    32,    32,     8,  4,          2,          4,      1,       S<4, 2>,       S<2, 2>,      S<2, 1, 4, 4>,        S<4, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 4, 4>,       S<4, 1, 8, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<4, 2>,      S<2, 1, 4, 4>,        S<4, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 4, 4>,       S<4, 1, 8, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<2, 4>,      S<2, 1, 4, 4>,        S<4, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 4, 4>,       S<4, 1, 8, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,    16,    16,    16,    16,  4,          2,          2,      1,       S<2, 2>,       S<2, 2>,      S<4, 1, 4, 4>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 4, 4>,       S<4, 1, 4, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,    16,    16,    16,    16,  4,          2,          2,      1,       S<4, 1>,       S<4, 1>,      S<4, 1, 4, 4>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 4, 4>,       S<4, 1, 4, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,    64,     8,    64,    32,  4,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<1, 1, 4, 4>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 4, 4>,      S<4, 1, 16, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,    64,    64,     8,    32,  4,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<1, 1, 4, 4>,      S<32, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,     8,     8,     8,     4,  4,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<1, 1, 4, 4>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<1, 1, 4, 4>,       S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,     8,     8,     8,     4,  4,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<1, 1, 4, 4>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<1, 1, 4, 4>,       S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,     8,     8,     8,     4,  4,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<1, 1, 4, 4>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<1, 1, 4, 4>,       S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,     8,     8,     8,     4,  4,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<1, 1, 4, 4>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<1, 1, 4, 4>,       S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
+    // clang-format on
+    >;
+
+void add_device_gemm_dl_i8_i8_i8_km_kn_mn_irregular_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Col, Row, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    using KernelList = device_gemm_dl_i8_i8_i8_km_kn_mn_irregular_instances;
+    add_device_operation_instances(instances, KernelList{}); // default-constructed kernel tuple
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp
index 06fab7f68..41afb519f 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp
@@ -31,7 +31,40 @@ using device_gemm_dl_i8_i8_i8_km_nk_mn_instances = std::tuple<
         // #########|   Type|    Type|    Type|       Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
         // #########|       |        |        |           |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
         // #########|       |        |        |           |        |        |        |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                    |                    |                |                   |
-        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  4,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<2, 1, 4, 4>,      S<8, 1,  32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,       S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,               5,                  4>
+        // MPerBlock=128, NPerBlock=128       
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  4,          4,          4,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,   128,    16,  4,          4,          8,      1,       S<8, 2>,       S<4, 2>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,   128,    16,  4,          4,          8,      1,       S<2, 8>,       S<2, 4>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=128, NPerBlock=64
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,    64,    16,  4,          4,          2,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=64, NPerBlock=128
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,    64,   128,    16,  4,          2,          4,      1,       S<4, 4>,       S<4, 4>,      S<2, 1, 4, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=32, NPerBlock=32
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<2, 4>,      S<2, 1, 4, 4>,        S<4, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,      S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=16, NPerBlock=64
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  2,          1,          4,      1,       S<4, 2>,       S<4, 2>,      S<1, 1, 4, 2>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  2,          1,          4,      1,       S<2, 4>,       S<2, 4>,      S<1, 1, 4, 2>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=16
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  2,          4,          1,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  2,          4,          1,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<4, 1>,       S<4, 1>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<4, 1>,       S<8, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<4, 1>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
     // clang-format on
     >;
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_nk_mn_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_nk_mn_irregular_instance.cpp
new file mode 100644
index 000000000..350834f7e
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_nk_mn_irregular_instance.cpp
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+// S<...> abbreviates ck::Sequence<...> to keep the instance table rows compact.
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+// Element-wise operation type supplied for the A, B and C slots of every instance below.
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+// GEMM specialization selected by every instance in this file.
+static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+
+// Compilation parameters for a[k, m] * b[n, k] = c[m, n]; every instance uses GemmMNPadding.
+using device_gemm_dl_i8_i8_i8_km_nk_mn_irregular_instances = std::tuple<
+    // clang-format off
+        // #########|  AData|   BData|   CData|    AccData| ALayout| BLayout| CLayout|           A|           B|           C|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|      BBlockTransfer|     CThreadTransfer| CThreadTransfer|    CThreadTransfer|
+        // #########|   Type|    Type|    Type|       Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
+        // #########|       |        |        |           |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
+        // #########|       |        |        |           |        |        |        |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                    |                    |                |                   |
+        // MPerBlock=128, NPerBlock=128
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<8, 2>,       S<4, 2>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<4, 4>,       S<4, 2>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<2, 8>,       S<2, 4>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=128, NPerBlock=64
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,    64,    16,  4,          4,          2,      1,       S<4, 4>,       S<4, 4>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,    64,    16,  4,          4,          2,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=128
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,    64,   128,    16,  4,          2,          4,      1,       S<4, 4>,       S<4, 4>,      S<2, 1, 4, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,    64,   128,    16,  4,          2,          4,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=32, NPerBlock=32
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    32,    32,    32,     8,  4,          2,          4,      1,       S<4, 2>,       S<2, 2>,      S<2, 1, 4, 4>,        S<4, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,      S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<4, 2>,      S<2, 1, 4, 4>,        S<4, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,      S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<2, 4>,      S<2, 1, 4, 4>,        S<4, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,      S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    16,    16,    16,    16,  2,          2,          2,      1,       S<2, 2>,       S<2, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    16,    16,    16,    16,  2,          2,          2,      1,       S<4, 1>,       S<4, 1>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
+    // clang-format on
+    >;
+
+void add_device_gemm_dl_i8_i8_i8_km_nk_mn_irregular_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Col, Col, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    using irregular_instance_tuple = device_gemm_dl_i8_i8_i8_km_nk_mn_irregular_instances;
+    add_device_operation_instances(instances, irregular_instance_tuple{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp
index b6d72fa22..d1173095f 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp
@@ -31,7 +31,40 @@ using device_gemm_dl_i8_i8_i8_mk_kn_mn_instances = std::tuple<
         // #########|   Type|    Type|    Type|       Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
         // #########|       |        |        |           |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder|  Lengths_K0_N0_N1_K1|               Order|                |                   |
         // #########|       |        |        |           |        |        |        |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                |                   |
-        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  4,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,      S<8, 1,  32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,               5,                  4>
+        // MPerBlock=128, NPerBlock=128
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  4,          4,          4,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,   128,    16,  4,          4,          8,      1,       S<8, 2>,       S<4, 2>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,   128,    16,  4,          4,          8,      1,       S<2, 8>,       S<2, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=128, NPerBlock=64
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,    64,    16,  4,          4,          2,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=64, NPerBlock=128
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,    64,   128,    16,  4,          2,          4,      1,       S<4, 4>,       S<4, 4>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=32, NPerBlock=32
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<2, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 8, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=16, NPerBlock=64
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  2,          1,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 1, 2>,      S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,        S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  2,          1,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 1, 2>,      S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,        S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=16
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  2,          4,          1,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  2,          4,          1,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<4, 1>,       S<4, 1>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<4, 1>,       S<8, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,        S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,        S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<4, 1>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
     // clang-format on
     >;
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_kn_mn_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_kn_mn_irregular_instance.cpp
new file mode 100644
index 000000000..27397527b
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_kn_mn_irregular_instance.cpp
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using Row = ck::tensor_layout::gemm::RowMajor;    // layout tag used for A, B and C in this file
+using Col = ck::tensor_layout::gemm::ColumnMajor; // not referenced elsewhere in this file
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>; // shorthand that keeps the instance table rows compact
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough; // A/B/C elementwise op below
+
+static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+
+// MNPadding DL GEMM configs for a[m, k] * b[k, n] = c[m, n] (all row-major, int8 I/O, int32 acc)
+using device_gemm_dl_i8_i8_i8_mk_kn_mn_irregular_instances = std::tuple<
+    // clang-format off
+        // #########|  AData|   BData|   CData|    AccData| ALayout| BLayout| CLayout|           A|           B|           C|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|       BBlockTransfer|     CThreadTransfer| CThreadTransfer|    CThreadTransfer|
+        // #########|   Type|    Type|    Type|       Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
+        // #########|       |        |        |           |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder|  Lengths_K0_N0_N1_K1|               Order|                |                   |
+        // #########|       |        |        |           |        |        |        |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                |                   |
+        // MPerBlock=128, NPerBlock=128
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<8, 2>,       S<4, 2>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<4, 4>,       S<4, 2>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<2, 8>,       S<2, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=128, NPerBlock=64
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,    64,    16,  4,          4,          2,      1,       S<4, 4>,       S<4, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,    64,    16,  4,          4,          2,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=128
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,    64,   128,    16,  4,          2,          4,      1,       S<4, 4>,       S<4, 4>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,    64,   128,    16,  4,          2,          4,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=32, NPerBlock=32
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    32,    32,    32,     8,  4,          2,          4,      1,       S<4, 2>,       S<2, 2>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 8, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<4, 2>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 8, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<2, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 8, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    16,    16,    16,    16,  2,          2,          2,      1,       S<2, 2>,       S<2, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    16,    16,    16,    16,  2,          2,          2,      1,       S<4, 1>,       S<4, 1>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
+    // clang-format on
+    >;
+
+void add_device_gemm_dl_i8_i8_i8_mk_kn_mn_irregular_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Row, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{ // Hands `instances` and a value-initialized tuple object to add_device_operation_instances.
+    add_device_operation_instances(instances,
+                                   device_gemm_dl_i8_i8_i8_mk_kn_mn_irregular_instances{});
+} // NOTE(review): helper presumably appends one op per tuple element type; confirm.
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp
index 67d2e3ce4..efda345a8 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp
@@ -31,7 +31,40 @@ using device_gemm_dl_i8_i8_i8_mk_nk_mn_instances = std::tuple<
         // #########|   Type|    Type|    Type|       Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
         // #########|       |        |        |           |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
         // #########|       |        |        |           |        |        |        |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                    |                    |                |                   |
-        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  4,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,       S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,               5,                  4>
+        // MPerBlock=128, NPerBlock=128
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  4,          4,          4,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,   128,    16,  4,          4,          8,      1,       S<8, 2>,       S<4, 2>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,   128,    16,  4,          4,          8,      1,       S<2, 8>,       S<2, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=128, NPerBlock=64
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,    64,    16,  4,          4,          2,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=64, NPerBlock=128
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,    64,   128,    16,  4,          2,          4,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=32, NPerBlock=32
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<2, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=16, NPerBlock=64
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  2,          1,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 1, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  2,          1,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 1, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=16
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  2,          4,          1,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  2,          4,          1,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<4, 1>,       S<4, 1>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<4, 1>,       S<8, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<4, 1>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
     // clang-format on
     >;
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_nk_mn_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_nk_mn_irregular_instance.cpp
new file mode 100644
index 000000000..b99f3f2b6
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_nk_mn_irregular_instance.cpp
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+
+// Compilation parameters for a[m, k] * b[n, k] = c[m, n]
+using device_gemm_dl_i8_i8_i8_mk_nk_mn_irregular_instances = std::tuple<
+    // clang-format off
+        // #########|  AData|   BData|   CData|    AccData| ALayout| BLayout| CLayout|           A|           B|           C|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|      BBlockTransfer|     CThreadTransfer| CThreadTransfer|    CThreadTransfer|
+        // #########|   Type|    Type|    Type|       Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
+        // #########|       |        |        |           |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
+        // #########|       |        |        |           |        |        |        |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                    |                    |                |                   |
+        // MPerBlock=128, NPerBlock=128
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<8, 2>,       S<4, 2>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<4, 4>,       S<4, 2>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<2, 8>,       S<2, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=128, NPerBlock=64
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,    64,    16,  4,          4,          2,      1,       S<4, 4>,       S<4, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,    64,    16,  4,          4,          2,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=128
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,    64,   128,    16,  4,          2,          4,      1,       S<4, 4>,       S<4, 4>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,    64,   128,    16,  4,          2,          4,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=32, NPerBlock=32
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    32,    32,    32,     8,  4,          2,          4,      1,       S<4, 2>,       S<2, 2>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<4, 2>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<2, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    16,    16,    16,    16,  2,          2,          2,      1,       S<2, 2>,       S<2, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    16,    16,    16,    16,  2,          2,          2,      1,       S<4, 1>,       S<4, 1>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
+    // clang-format on
+    >;
+
+void add_device_gemm_dl_i8_i8_i8_mk_nk_mn_irregular_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Col, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(instances,
+                                   device_gemm_dl_i8_i8_i8_mk_nk_mn_irregular_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
-- 
GitLab


From 70e4eb567fcad81c57598ab9ee6f81b4136ecca5 Mon Sep 17 00:00:00 2001
From: Adam Osewski <19374865+aosewski@users.noreply.github.com>
Date: Tue, 30 May 2023 14:09:06 +0200
Subject: [PATCH 44/71] Multiple fixes to GroupedGemm+SplitK (#707)

* Add license header.

* Reduce the amount of logged output. Add constant initialization.

* Add functional tests for grouped_gemm with different kbatch value.

* Add debug log information + remove unused code.

* Don't pass kbatch to CalculateKPadded.

* Turn on logging in grouped gemm and gemm splitk profiler

* Debug: limit number of test cases to run.

* Log more information and initialize with constant value.

* Turn on DEBUG_LOG

* Add more debug log information.

* Limit the number of instances to compile.

* Use GridwiseGemmPipeline

* Use KBatch to calculate K0

* Multiple DebugLog messages.

* Unit tests for multiple KBatch values.

* Refactoring

* Disable logging
* Extract the KBatch update out of the if statement.

* Uncomment instances.

* Disable DebugLog.

* Use KBatch when calculating KPadded.

* Fix CGridDesc padding.

* Use available helper functions.

* Uncomment code commented out for debugging.

* Remove unnecessary debug log messages.

* Uncomment previously commented code for debug purposes.

* Add KBatch info to profiler output summary log.

* Add gtests for gemm splitk using ckProfiler API.

* Add more test-cases for different data layout.

* Add more test cases for gemm splitk

* Remove old test.

* Unit tests for MKNK ggemm interface.

* Fix and add more unit-tests.

* Constexpr everything!

* Increase error threshold for fp16 and splitk.

Since we're using fp16 atomic add for splitk there's a
known precision loss.

---------

Co-authored-by: Adam Osewski <aosewski@amd.com>
Co-authored-by: zjing14 <zhangjing14@gmail.com>
---
 .../gpu/device/device_grouped_gemm.hpp        |   4 +
 .../impl/device_gemm_xdl_splitk_c_shuffle.hpp |  14 +-
 ...evice_grouped_gemm_xdl_splitk_cshuffle.hpp |  28 +-
 .../gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp  | 180 +++++++-----
 ...gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp |   1 +
 ...16_f16_f16_mk_kn_mn_irregular_instance.cpp |   6 +-
 ...16_f16_f16_mk_nk_mn_irregular_instance.cpp |   4 +-
 .../profiler/profile_gemm_splitk_impl.hpp     |   6 +-
 .../profiler/profile_grouped_gemm_impl.hpp    | 118 +++++---
 test/gemm_split_k/CMakeLists.txt              |   5 +-
 test/gemm_split_k/gemm_split_k.cpp            | 261 ------------------
 test/gemm_split_k/test_gemm_splitk.cpp        |  66 +++++
 .../test_gemm_splitk_ut_cases.inc             | 217 +++++++++++++++
 test/gemm_split_k/test_gemm_splitk_util.hpp   |  78 ++++++
 test/grouped_gemm/CMakeLists.txt              |  10 +-
 test/grouped_gemm/grouped_gemm_fp16.cpp       |  69 -----
 .../test_grouped_gemm_interface.cpp           | 202 ++++++++++++++
 .../grouped_gemm/test_grouped_gemm_splitk.cpp |  34 +++
 .../test_grouped_gemm_ut_cases.inc            | 180 ++++++++++++
 test/grouped_gemm/test_grouped_gemm_util.hpp  | 249 +++++++++++++++++
 20 files changed, 1262 insertions(+), 470 deletions(-)
 delete mode 100644 test/gemm_split_k/gemm_split_k.cpp
 create mode 100644 test/gemm_split_k/test_gemm_splitk.cpp
 create mode 100644 test/gemm_split_k/test_gemm_splitk_ut_cases.inc
 create mode 100644 test/gemm_split_k/test_gemm_splitk_util.hpp
 delete mode 100644 test/grouped_gemm/grouped_gemm_fp16.cpp
 create mode 100644 test/grouped_gemm/test_grouped_gemm_interface.cpp
 create mode 100644 test/grouped_gemm/test_grouped_gemm_splitk.cpp
 create mode 100644 test/grouped_gemm/test_grouped_gemm_ut_cases.inc
 create mode 100644 test/grouped_gemm/test_grouped_gemm_util.hpp

diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp
index 4b1106c12..1e0340553 100644
--- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp
@@ -1,4 +1,8 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
 #pragma once
+
 #include <iostream>
 #include <vector>
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp
index 776f96e8e..89bfc180a 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp
@@ -73,6 +73,11 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
     static constexpr auto I2 = Number<2>{};
     static constexpr auto I3 = Number<3>{};
 
+    // TODO: these should be exposed as template parameters.
+    static constexpr index_t NumGemmKPrefetchStage = 1;
+    static constexpr LoopScheduler LoopSched       = make_default_loop_scheduler();
+    static constexpr PipelineVersion PipelineVer   = PipelineVersion::v2;
+
     using GridwiseGemm = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2<
         BlockSize,
         ADataType, // TODO: distinguish A/B datatype
@@ -85,6 +90,7 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
         BElementwiseOperation,
         CElementwiseOperation,
         GemmSpec,
+        NumGemmKPrefetchStage,
         MPerBlock,
         NPerBlock,
         K0PerBlock,
@@ -112,7 +118,9 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
         CShuffleMRepeatPerShuffle,
         CShuffleNRepeatPerShuffle,
         CBlockTransferScalarPerVector_NWaveNPerXDL,
-        CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock>;
+        CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+        LoopSched,
+        PipelineVer>;
 
     using Argument              = typename GridwiseGemm::Argument;
     using DefaultBlock2CTileMap = typename GridwiseGemm::DefaultBlock2CTileMap;
@@ -257,7 +265,7 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                         StrideC,
                         GridwiseGemm::CalculateMPadded(M),
                         GridwiseGemm::CalculateNPadded(N),
-                        GridwiseGemm::CalculateKPadded(K),
+                        GridwiseGemm::CalculateKPadded(K, KBatch),
                         GridwiseGemm::CalculateK0(K, KBatch),
                         KBatch};
     }
@@ -290,7 +298,7 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                                           StrideC,
                                           GridwiseGemm::CalculateMPadded(M),
                                           GridwiseGemm::CalculateNPadded(N),
-                                          GridwiseGemm::CalculateKPadded(K),
+                                          GridwiseGemm::CalculateKPadded(K, KBatch),
                                           GridwiseGemm::CalculateK0(K, KBatch),
                                           KBatch);
     }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
index 467a8429a..54ad9eb06 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
@@ -85,7 +85,7 @@ template <typename ALayout,
           typename BElementwiseOperation,
           typename CDEElementwiseOperation,
           GemmSpecialization GemmSpec,
-          ck::index_t NumPrefetch,
+          ck::index_t NumGemmKPrefetchStage,
           ck::index_t BlockSize,
           ck::index_t MPerBlock,
           ck::index_t NPerBlock,
@@ -152,6 +152,7 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
         BElementwiseOperation,
         CDEElementwiseOperation,
         GemmSpec,
+        NumGemmKPrefetchStage,
         MPerBlock,
         NPerBlock,
         K0PerBlock,
@@ -179,7 +180,9 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
         CShuffleMXdlPerWavePerShuffle,
         CShuffleNXdlPerWavePerShuffle,
         CDEBlockTransferScalarPerVector_NPerBlock,
-        CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock>;
+        CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+        LoopSched,
+        PipelineVersion::v2>;
 
     using CGridDesc_M_N = typename GridwiseGemm::CGridDesc_M_N;
     using Block2ETileMapKSplit =
@@ -265,8 +268,7 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
                 const index_t k_padded = GridwiseGemm::CalculateKPadded(K, K_BATCH);
                 const index_t k0       = GridwiseGemm::CalculateK0(K, K_BATCH);
 
-                const auto c_grid_desc_m_n =
-                    GridwiseGemm::MakeCGridDescriptor_M_N(M, N, m_padded, n_padded, stride_c);
+                const auto c_grid_desc_m_n = GridwiseGemm::MakeCGridDescriptor_M_N(M, N, stride_c);
 
                 const auto local_b2c_tile_map =
                     Block2ETileMapKSplit{c_grid_desc_m_n, B2E_M01, K_BATCH};
@@ -319,8 +321,8 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
                 const index_t k_padded = GridwiseGemm::CalculateKPadded(karg.K, K_BATCH);
                 const index_t k0       = GridwiseGemm::CalculateK0(karg.K, K_BATCH);
 
-                const auto c_grid_desc_m_n = GridwiseGemm::MakeCGridDescriptor_M_N(
-                    karg.M, karg.N, karg.MPadded, karg.NPadded, karg.StrideC);
+                const auto c_grid_desc_m_n =
+                    GridwiseGemm::MakeCGridDescriptor_M_N(karg.M, karg.N, karg.StrideC);
 
                 const auto local_b2c_tile_map =
                     Block2ETileMapKSplit{c_grid_desc_m_n, B2E_M01, K_BATCH};
@@ -501,6 +503,11 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
         if((ck::type_convert<ck::index_t>(arg.gemm_kernel_args_.size()) +
             arg.skipped_group_count_) != arg.group_count_)
         {
+#if DEBUG_LOG
+            std::cout << "The group count is not equal to sum of skipped groups "
+                         "and kernel args size!"
+                      << std::endl;
+#endif // DEBUG_LOG
             return false;
         }
 
@@ -509,14 +516,15 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
         {
             const auto& a        = arg.gemm_kernel_args_[i].karg_;
             bool group_arg_valid = GridwiseGemm::CheckValidity(a);
-#if DEBUG_LOG
             if(not group_arg_valid)
             {
-                std::cout << "[" << __func__ << "] group id: " << i << " is not supported!\n";
+#if DEBUG_LOG
+                std::cout << "[" << __func__ << "] group id: " << i
+                          << " has invalid GridwiseGemm settings!" << std::endl;
                 a.Print();
-            }
 #endif // DEBUG_LOG
-            supported &= group_arg_valid;
+            }
+            supported = supported && group_arg_valid;
         }
         return supported;
     }
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
index b393c4897..d56d1986e 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
@@ -8,14 +8,14 @@
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp"
 #include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp"
 #include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp"
 #include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp"
 #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-
-#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp"
+#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
 
 namespace ck {
 
@@ -55,6 +55,7 @@ template <index_t BlockSize,
           typename BElementwiseOperation,
           typename CElementwiseOperation,
           tensor_operation::device::GemmSpecialization GemmSpec,
+          index_t NumGemmKPrefetchStage,
           index_t MPerBlock,
           index_t NPerBlock,
           index_t K0PerBlock,
@@ -82,7 +83,9 @@ template <index_t BlockSize,
           index_t CShuffleMRepeatPerShuffle,
           index_t CShuffleNRepeatPerShuffle,
           index_t CBlockTransferScalarPerVector_NWaveNPerXDL,
-          typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock>
+          typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          LoopScheduler LoopSched     = make_default_loop_scheduler(),
+          PipelineVersion PipelineVer = PipelineVersion::v1>
 struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
 {
     static constexpr auto I0 = Number<0>{};
@@ -99,8 +102,15 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
     static constexpr auto M01 = 1;
     static constexpr auto N01 = 1;
 
+    static constexpr auto gemm_padder =
+        tensor_operation::device::GemmPadder<GemmSpec, index_t, index_t, index_t>{
+            MPerBlock, NPerBlock, K1* K0PerBlock};
+
     using ThisThreadBlock = ThisThreadBlock<BlockSize>;
 
+    using GridwiseGemmPipe = remove_cvref_t<decltype(
+        GridwiseGemmPipeline_Selector<PipelineVer, NumGemmKPrefetchStage, LoopSched>())>;
+
     struct Argument : public ck::tensor_operation::device::BaseArgument
     {
         const FloatAB* p_a_grid;
@@ -176,12 +186,12 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
     // prefer this to be called on host
     __host__ __device__ static auto CalculateMPadded(index_t M)
     {
-        return (M + MPerBlock - 1) / MPerBlock * MPerBlock;
+        return math::integer_least_multiple(M, MPerBlock);
     }
 
     __host__ __device__ static auto CalculateNPadded(index_t N)
     {
-        return (N + NPerBlock - 1) / NPerBlock * NPerBlock;
+        return math::integer_least_multiple(N, NPerBlock);
     }
 
     __host__ __device__ static auto CalculateK0(index_t K, index_t K_Batch = 1)
@@ -295,8 +305,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
         }
     }
 
-    __host__ __device__ static auto
-    MakeCGridDescriptor_M_N(index_t M, index_t N, index_t MPad, index_t NPad, index_t StrideC)
+    __host__ __device__ static auto MakeCGridDescriptor_M_N(index_t M, index_t N, index_t StrideC)
     {
         const auto c_grid_desc_m_n = [&]() {
             if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
@@ -309,22 +318,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
             }
         }();
 
-        if constexpr(GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding)
-        {
-            return transform_tensor_descriptor(c_grid_desc_m_n,
-                                               make_tuple(make_right_pad_transform(M, MPad - M),
-                                                          make_right_pad_transform(N, NPad - N)),
-                                               make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                               make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else
-        {
-            return transform_tensor_descriptor(
-                c_grid_desc_m_n,
-                make_tuple(make_pass_through_transform(M), make_pass_through_transform(N)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
+        return gemm_padder.PadCDescriptor_M_N(c_grid_desc_m_n);
     }
 
     __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
@@ -383,7 +377,15 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
                        GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding))
         {
             if(!(karg.M % MPerBlock == 0))
+            {
+#if DEBUG_LOG
+                std::cout << "Arg M value is not a multiple of MPerBlock! M: " << karg.M << " "
+                          << __FILE__ << ":" << __LINE__ << ", in function: " << __func__
+                          << std::endl;
+
+#endif // DEBUG_LOG
                 return false;
+            }
         }
         if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::NPadding ||
                        GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
@@ -391,40 +393,116 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
                        GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding))
         {
             if(!(karg.N % NPerBlock == 0))
+            {
+#if DEBUG_LOG
+                std::cout << "Arg N value is not a multiple of NPerBlock! N: " << karg.N << " "
+                          << __FILE__ << ":" << __LINE__ << ", in function: " << __func__
+                          << std::endl;
+
+#endif // DEBUG_LOG
                 return false;
+            }
         }
 
         if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
         {
             if(karg.K % ABlockTransferSrcScalarPerVector != 0)
+            {
+#if DEBUG_LOG
+                std::cout << "Arg K (" << karg.K
+                          << ") value is not a multiple of ABlockTransferSrcScalarPerVector ("
+                          << ABlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
+                          << __LINE__ << ", in function: " << __func__ << std::endl;
+
+#endif // DEBUG_LOG
                 return false;
+            }
         }
         else
         {
             if(karg.M % ABlockTransferSrcScalarPerVector != 0)
+            {
+#if DEBUG_LOG
+                std::cout << "Arg M (" << karg.M
+                          << ") value is not a multiple of ABlockTransferSrcScalarPerVector ("
+                          << ABlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
+                          << __LINE__ << ", in function: " << __func__ << std::endl;
+
+#endif // DEBUG_LOG
                 return false;
+            }
         }
 
         if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
         {
             if(karg.N % BBlockTransferSrcScalarPerVector != 0)
+            {
+#if DEBUG_LOG
+                std::cout << "Arg N (" << karg.N
+                          << ") value is not a multiple of BBlockTransferSrcScalarPerVector ("
+                          << BBlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
+                          << __LINE__ << ", in function: " << __func__ << std::endl;
+
+#endif // DEBUG_LOG
                 return false;
+            }
         }
         else
         {
             if(karg.K % BBlockTransferSrcScalarPerVector != 0)
+            {
+#if DEBUG_LOG
+                std::cout << "Arg K (" << karg.K
+                          << ") value is not a multiple of BBlockTransferSrcScalarPerVector ("
+                          << BBlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
+                          << __LINE__ << ", in function: " << __func__ << std::endl;
+
+#endif // DEBUG_LOG
                 return false;
+            }
         }
 
         if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
         {
             if(karg.N % CBlockTransferScalarPerVector_NWaveNPerXDL != 0)
+            {
+#if DEBUG_LOG
+                std::cout
+                    << "Arg N (" << karg.N
+                    << ") value is not a multiple of CBlockTransferScalarPerVector_NWaveNPerXDL ("
+                    << CBlockTransferScalarPerVector_NWaveNPerXDL << " )! " << __FILE__ << ":"
+                    << __LINE__ << ", in function: " << __func__ << std::endl;
+
+#endif // DEBUG_LOG
                 return false;
+            }
         }
         else
         {
             if(karg.M % CBlockTransferScalarPerVector_NWaveNPerXDL != 0)
+            {
+#if DEBUG_LOG
+                std::cout
+                    << "Arg M (" << karg.M
+                    << ") value is not a multiple of CBlockTransferScalarPerVector_NWaveNPerXDL ("
+                    << CBlockTransferScalarPerVector_NWaveNPerXDL << " )! " << __FILE__ << ":"
+                    << __LINE__ << ", in function: " << __func__ << std::endl;
+
+#endif // DEBUG_LOG
                 return false;
+            }
+        }
+
+        const auto num_k_loop = karg.K0 / K0PerBlock;
+        if(!GridwiseGemmPipe::IsSupported(num_k_loop))
+        {
+#if DEBUG_LOG
+            std::cout << "The number of k loops (" << num_k_loop
+                      << ") value is not supported by GridwiseGemm Pipeline."
+                      << " K0: " << karg.K0 << ", K0PerBlock: " << K0PerBlock << " " << __FILE__
+                      << ":" << __LINE__ << ", in function: " << __func__ << std::endl;
+#endif // DEBUG_LOG
+            return false;
         }
 
         return true;
@@ -439,9 +517,8 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
 
     __host__ __device__ static constexpr bool CalculateHasMainK0BlockLoop(index_t K0)
     {
-        const bool has_main_k0_block_loop = K0 > K0PerBlock;
-
-        return has_main_k0_block_loop;
+        const index_t num_loop = K0 / K0PerBlock;
+        return GridwiseGemmPipe::CalculateHasMainLoop(num_loop);
     }
 
     template <typename CGridDesc>
@@ -490,7 +567,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
         return BlockToCTileMap_3DGrid_KSplit<MPerBlock, NPerBlock>();
     }
 
-    using CGridDesc_M_N         = remove_cvref_t<decltype(MakeCGridDescriptor_M_N(1, 1, 1, 1, 1))>;
+    using CGridDesc_M_N         = remove_cvref_t<decltype(MakeCGridDescriptor_M_N(1, 1, 1))>;
     using DefaultBlock2CTileMap = remove_cvref_t<decltype(MakeDefaultBlock2CTileMap())>;
 
     template <bool HasMainKBlockLoop,
@@ -507,8 +584,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
             karg.M, karg.MPadded, karg.K, karg.StrideA, karg.k_batch, karg.K0, karg.KPadded);
         const auto b_b_k0_n_k1_grid_desc = MakeBGridDescriptor_KBatch_K0_N_K1(
             karg.K, karg.NPadded, karg.N, karg.StrideB, karg.k_batch, karg.K0, karg.KPadded);
-        const auto c_grid_desc_m_n =
-            MakeCGridDescriptor_M_N(karg.M, karg.N, karg.MPadded, karg.NPadded, karg.StrideC);
+        const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N(karg.M, karg.N, karg.StrideC);
 
         const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
             MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(c_grid_desc_m_n);
@@ -680,20 +756,8 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
         //     c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
         //       register
         // sanity check
-#if 1
-        auto blockwise_gemm =
-            BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
-                                                                FloatAB,
-                                                                FloatAcc,
-                                                                decltype(a_k0_m_k1_block_desc),
-                                                                decltype(b_k0_n_k1_block_desc),
-                                                                MPerXDL,
-                                                                NPerXDL,
-                                                                MRepeat,
-                                                                NRepeat,
-                                                                K1>{};
-#else
-        auto blockwise_gemm = BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<
+
+        auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<
             BlockSize,
             FloatAB,
             FloatAcc,
@@ -703,9 +767,8 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
             NPerXDL,
             MRepeat,
             NRepeat,
-            K1>{};
-
-#endif
+            K1,
+            LoopSched>();
 
         auto c_thread_buf = blockwise_gemm.GetCThreadBuffer();
 
@@ -761,7 +824,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
                 b_blockwise_copy.RunWrite(b_b_k0_n_k1_block_desc, b_block_buf);
 
                 k0_block_data_begin += K0PerBlock;
-            } while(k0_block_data_begin < (K0 - K0PerBlock));
+            } while(k0_block_data_begin < (karg.K0 - K0PerBlock));
         }
 
         // tail
@@ -772,13 +835,12 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
         }
 #else
         // gridwise GEMM pipeline
-        const auto gridwise_gemm_pipeline =
-            GridwiseGemmPipeline_Selector<PipelineVersion::v2, 1, LoopScheduler::Default>();
-
         const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(
             (a_b_k0_m_k1_grid_desc.GetLength(I1) * a_b_k0_m_k1_grid_desc.GetLength(I3)) /
             (K0PerBlock * K1));
 
+        const auto gridwise_gemm_pipeline = GridwiseGemmPipe{};
+
         gridwise_gemm_pipeline.template Run<HasMainKBlockLoop>(a_b_k0_m_k1_grid_desc,
                                                                a_b_k0_m_k1_block_desc,
                                                                a_blockwise_copy,
@@ -993,24 +1055,6 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
         }
     }
 
-    template <typename Layout>
-    struct LStr
-    {
-        static std::string Get() { return ""; }
-    };
-
-    template <>
-    struct LStr<ck::tensor_layout::gemm::RowMajor>
-    {
-        static std::string Get() { return "R"; }
-    };
-
-    template <>
-    struct LStr<ck::tensor_layout::gemm::ColumnMajor>
-    {
-        static std::string Get() { return "C"; }
-    };
-
     static std::string GetTypeString()
     {
         auto str = std::stringstream();
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp
index a93cb7fc8..5f5d6c9b5 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp
@@ -64,6 +64,7 @@ using device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_irregular_tile_instances = st
         //###################| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|        Type|  Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
         //###################|       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
         //###################|       |       |            |       |      |      |        |         |            |      |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceGroupedGemm_Xdl<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
         DeviceGroupedGemm_Xdl<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
         DeviceGroupedGemm_Xdl<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   2,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,              8>,
         DeviceGroupedGemm_Xdl<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp
index 0385b0fc0..a3d73440e 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp
@@ -44,14 +44,14 @@ using device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_tile_instanc
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   192,    64,    32,   8,   8,   32,   32,    3,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
-//      DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,    64,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 48, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,    64,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 48, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
-//      DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8>,
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8>,
-//      DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 24, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 24, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8>,
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   192,    32,    32,   8,   8,   32,   32,    3,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,    64,    32,   8,   8,   32,   32,    1,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    64,    32,    32,   8,   8,   32,   32,    1,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp
index 5933ff61e..dddfa2aa4 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp
@@ -37,7 +37,7 @@ using device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_tile_instanc
         //################################|       |       |            |       |      |      |        |         |            |      |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   192,    64,    32,   8,   8,   32,   32,    3,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
-//      DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,    64,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 48, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              4>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,    64,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 48, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              4>,
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
@@ -45,7 +45,7 @@ using device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_tile_instanc
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,    
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8>,
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   192,    32,    32,   8,   8,   32,   32,    3,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,
-//      DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8>,
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,   256,    32,   8,   8,   32,   32,    1,    4,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8>,
diff --git a/profiler/include/profiler/profile_gemm_splitk_impl.hpp b/profiler/include/profiler/profile_gemm_splitk_impl.hpp
index 4cc62509d..ab1bce258 100644
--- a/profiler/include/profiler/profile_gemm_splitk_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_splitk_impl.hpp
@@ -246,9 +246,9 @@ bool profile_gemm_splitk_impl(int do_verification,
     }
 
     std::cout << " M = " << M << " N = " << N << " K = " << K << " StrideA = " << StrideA
-              << " StrideB = " << StrideB << " StrideC = " << StrideC << " : " << best_ave_time
-              << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec << " GB/s, "
-              << best_op_name << std::endl;
+              << " StrideB = " << StrideB << " StrideC = " << StrideC << " KBatch = " << KBatch
+              << " : " << best_ave_time << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec
+              << " GB/s, " << best_op_name << std::endl;
 
     return pass;
 }
diff --git a/profiler/include/profiler/profile_grouped_gemm_impl.hpp b/profiler/include/profiler/profile_grouped_gemm_impl.hpp
index 23dca244d..9abb5e7a5 100644
--- a/profiler/include/profiler/profile_grouped_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_gemm_impl.hpp
@@ -19,6 +19,7 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"
+#include "ck/library/utility/fill.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 
 namespace ck {
@@ -43,7 +44,6 @@ bool profile_grouped_gemm_impl(int do_verification,
                                const std::vector<int>& StrideCs,
                                int kbatch = 1)
 {
-
     bool pass = true;
 
     auto f_host_tensor_descriptor =
@@ -81,11 +81,11 @@ bool profile_grouped_gemm_impl(int do_verification,
 
         c_m_n_device_results.push_back(
             Tensor<CDataType>(f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{})));
-
+#if DEBUG_LOG
         std::cout << "group: " << i << " a_m_k[" << i << "]:" << a_m_k[i].mDesc << ", b_k_n[" << i
                   << "]:" << b_k_n[i].mDesc << ", c_m_n_device_results[" << i
                   << "]:" << c_m_n_device_results[i].mDesc << std::endl;
-
+#endif // DEBUG_LOG
         std::size_t num_thread = 1;
         switch(init_method)
         {
@@ -191,65 +191,71 @@ bool profile_grouped_gemm_impl(int do_verification,
         DeviceMem gemm_desc_workspace(gemm_ptr->GetWorkSpaceSize(argument_ptr.get()));
 
         gemm_ptr->SetWorkSpacePointer(argument_ptr.get(), gemm_desc_workspace.GetDeviceBuffer());
+        std::string gemm_name = gemm_ptr->GetTypeString();
 
-        if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
+        if(kbatch > 1)
         {
-            std::string gemm_name = gemm_ptr->GetTypeString();
-
-            if(kbatch > 1)
+            using DeviceOpSplitK =
+                ck::tensor_operation::device::DeviceGroupedGemmSplitK<ALayout,
+                                                                      BLayout,
+                                                                      ck::Tuple<>,
+                                                                      CLayout,
+                                                                      ADataType,
+                                                                      BDataType,
+                                                                      ck::Tuple<>,
+                                                                      CDataType,
+                                                                      AElementOp,
+                                                                      BElementOp,
+                                                                      CElementOp>;
+
+            if(auto* splitk_op = dynamic_cast<DeviceOpSplitK*>(gemm_ptr.get()))
             {
-                using DeviceOpSplitK =
-                    ck::tensor_operation::device::DeviceGroupedGemmSplitK<ALayout,
-                                                                          BLayout,
-                                                                          ck::Tuple<>,
-                                                                          CLayout,
-                                                                          ADataType,
-                                                                          BDataType,
-                                                                          ck::Tuple<>,
-                                                                          CDataType,
-                                                                          AElementOp,
-                                                                          BElementOp,
-                                                                          CElementOp>;
-
-                if(dynamic_cast<DeviceOpSplitK*>(gemm_ptr.get()) != nullptr)
-                {
-                    dynamic_cast<DeviceOpSplitK*>(gemm_ptr.get())
-                        ->SetKBatchSize(argument_ptr.get(), kbatch);
-                }
+                // Cast once and reuse the pointer; other instance types are left untouched.
+                splitk_op->SetKBatchSize(argument_ptr.get(), kbatch);
             }
+        }
+
+        if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
 
             float ave_time =
                 invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
 
-            std::size_t flop = 0, num_btype = 0;
-            for(std::size_t i = 0; i < gemm_descs.size(); i++)
+            if(time_kernel)
             {
-                flop += std::size_t(2) * Ms[i] * Ns[i] * Ks[i];
+                std::size_t flop = 0, num_btype = 0;
+                for(std::size_t i = 0; i < gemm_descs.size(); i++)
+                {
+                    flop += std::size_t(2) * Ms[i] * Ns[i] * Ks[i];
 
-                num_btype += sizeof(ADataType) * Ms[i] * Ks[i] + sizeof(BDataType) * Ks[i] * Ns[i] +
-                             sizeof(CDataType) * Ms[i] * Ns[i];
-            }
+                    num_btype += sizeof(ADataType) * Ms[i] * Ks[i] +
+                                 sizeof(BDataType) * Ks[i] * Ns[i] +
+                                 sizeof(CDataType) * Ms[i] * Ns[i];
+                }
 
-            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+                float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
 
-            float gb_per_sec = num_btype / 1.E6 / ave_time;
-            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
-                      << gb_per_sec << " GB/s, " << gemm_name << std::endl;
+                float gb_per_sec = num_btype / 1.E6 / ave_time;
+                std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops
+                          << " TFlops, " << gb_per_sec << " GB/s, " << gemm_name << std::endl;
 
-            if(tflops > best_tflops)
-            {
-                best_gemm_name  = gemm_name;
-                best_tflops     = tflops;
-                best_ave_time   = ave_time;
-                best_gb_per_sec = gb_per_sec;
+                if(tflops > best_tflops)
+                {
+                    best_gemm_name  = gemm_name;
+                    best_tflops     = tflops;
+                    best_ave_time   = ave_time;
+                    best_gb_per_sec = gb_per_sec;
+                }
             }
 
             if(do_verification)
             {
+                bool instance_pass = true;
                 for(std::size_t i = 0; i < gemm_descs.size(); i++)
                 {
 
                     c_device_buf[i]->FromDevice(c_m_n_device_results[i].mData.data());
+                    c_device_buf[i]->SetZero();
 
                     Tensor<CDataType> c_m_n_host_result(
                         f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{}));
@@ -274,7 +280,20 @@ bool profile_grouped_gemm_impl(int do_verification,
                                                               c_element_op);
 
                     ref_invoker.Run(ref_argument);
-                    pass = pass && ck::utils::check_err(c_m_n_device_results[i], c_m_n_host_result);
+                    if(std::is_same_v<CDataType, ck::half_t> && kbatch > 1)
+                    {
+                        instance_pass =
+                            instance_pass && ck::utils::check_err(c_m_n_device_results[i],
+                                                                  c_m_n_host_result,
+                                                                  "Error: Incorrect results!",
+                                                                  0.06);
+                    }
+                    else
+                    {
+                        instance_pass =
+                            instance_pass &&
+                            ck::utils::check_err(c_m_n_device_results[i], c_m_n_host_result);
+                    }
 
                     if(do_log)
                     {
@@ -289,16 +308,25 @@ bool profile_grouped_gemm_impl(int do_verification,
                             << std::endl;
                     }
                 }
+
+                std::cout << "Instance: " << gemm_name << " verification "
+                          << (instance_pass ? "SUCCEED" : "FAILED") << std::endl;
+
+                pass = pass && instance_pass;
             }
         }
         else
         {
-            std::cout << "does not support this GEMM problem" << std::endl;
+            std::cout << "Instance: " << gemm_name << ", does not support this GEMM problem"
+                      << std::endl;
         }
     }
 
-    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
-              << best_gb_per_sec << " GB/s, " << best_gemm_name << std::endl;
+    if(time_kernel)
+    {
+        std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
+                  << best_gb_per_sec << " GB/s, " << best_gemm_name << std::endl;
+    }
 
     return pass;
 }
diff --git a/test/gemm_split_k/CMakeLists.txt b/test/gemm_split_k/CMakeLists.txt
index 09bbf7938..2274854f8 100644
--- a/test/gemm_split_k/CMakeLists.txt
+++ b/test/gemm_split_k/CMakeLists.txt
@@ -1,5 +1,4 @@
 if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
-   add_test_executable(test_gemm_split_k gemm_split_k.cpp)
-   target_link_libraries(test_gemm_split_k PRIVATE utility)
-   target_link_libraries(test_gemm_split_k PRIVATE device_gemm_splitk_instance)
+   add_gtest_executable(test_gemm_splitk test_gemm_splitk.cpp)
+   target_link_libraries(test_gemm_splitk PRIVATE utility device_gemm_splitk_instance)
 endif()
diff --git a/test/gemm_split_k/gemm_split_k.cpp b/test/gemm_split_k/gemm_split_k.cpp
deleted file mode 100644
index 1edb5769c..000000000
--- a/test/gemm_split_k/gemm_split_k.cpp
+++ /dev/null
@@ -1,261 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <iostream>
-#include <initializer_list>
-#include <cstdlib>
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-
-#include "ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp"
-
-#include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/device_memory.hpp"
-#include "ck/library/utility/host_tensor.hpp"
-#include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/utility/literals.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
-
-#include "ck/library/utility/host_gemm.hpp"
-
-enum struct GemmMatrixLayout
-{
-    MK_KN_MN, // 0
-    MK_NK_MN, // 1
-    KM_KN_MN, // 2
-    KM_NK_MN, // 3
-};
-
-template <typename T>
-static bool check_out(const Tensor<T>& ref, const Tensor<T>& result)
-{
-    float max_diff = 1e-6;
-
-    for(std::size_t i = 0; i < ref.mData.size(); ++i)
-    {
-        float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
-        if(max_diff < diff)
-        {
-            return false;
-        }
-    }
-
-    return true;
-}
-
-struct gemmArgs
-{
-    GemmMatrixLayout layout;
-    int M;
-    int N;
-    int K;
-    int StrideA;
-    int StrideB;
-    int StrideC;
-    int KBatch;
-};
-
-int test_gemm(const gemmArgs& args)
-{
-    using Row = ck::tensor_layout::gemm::RowMajor;
-    using Col = ck::tensor_layout::gemm::ColumnMajor;
-
-    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-    bool a_row_major, b_row_major, c_row_major;
-
-    switch(args.layout)
-    {
-    case GemmMatrixLayout::MK_KN_MN:
-        a_row_major = true;
-        b_row_major = true;
-        c_row_major = true;
-        break;
-    case GemmMatrixLayout::MK_NK_MN:
-        a_row_major = true;
-        b_row_major = false;
-        c_row_major = true;
-        break;
-    case GemmMatrixLayout::KM_KN_MN:
-        a_row_major = false;
-        b_row_major = true;
-        c_row_major = true;
-        break;
-    case GemmMatrixLayout::KM_NK_MN:
-        a_row_major = false;
-        b_row_major = false;
-        c_row_major = true;
-        break;
-    default: printf("not supported layout"); return 1;
-    }
-
-    auto f_host_tensor_descriptor =
-        [](std::size_t row, std::size_t col, std::size_t stride, bool row_major) {
-            using namespace ck::literals;
-
-            if(row_major)
-            {
-                return HostTensorDescriptor({row, col}, {stride, 1_uz});
-            }
-            else
-            {
-                return HostTensorDescriptor({row, col}, {1_uz, stride});
-            }
-        };
-
-    Tensor<float> a_m_k(f_host_tensor_descriptor(args.M, args.K, args.StrideA, a_row_major));
-    Tensor<float> b_k_n(f_host_tensor_descriptor(args.K, args.N, args.StrideB, b_row_major));
-    Tensor<float> c_m_n_host_result(
-        f_host_tensor_descriptor(args.M, args.N, args.StrideC, c_row_major));
-    Tensor<float> c_m_n_device_result(
-        f_host_tensor_descriptor(args.M, args.N, args.StrideC, c_row_major));
-
-    // init data
-    std::size_t num_thread = 1;
-    a_m_k.GenerateTensorValue(GeneratorTensor_2<float>{-5, 5}, num_thread);
-    b_k_n.GenerateTensorValue(GeneratorTensor_2<float>{-5, 5}, num_thread);
-    // set zero to c_device_buf
-    c_m_n_device_result.GenerateTensorValue(GeneratorTensor_0<float>{}, num_thread);
-
-    host_gemm_mk_kn_mn(a_m_k,
-                       b_k_n,
-                       c_m_n_host_result,
-                       ck::tensor_operation::element_wise::PassThrough{},
-                       ck::tensor_operation::element_wise::PassThrough{},
-                       ck::tensor_operation::element_wise::PassThrough{});
-
-    DeviceMem a_device_buf(sizeof(float) * a_m_k.mDesc.GetElementSpaceSize());
-    DeviceMem b_device_buf(sizeof(float) * b_k_n.mDesc.GetElementSpaceSize());
-    DeviceMem c_device_buf(sizeof(float) * c_m_n_device_result.mDesc.GetElementSpaceSize());
-
-    a_device_buf.ToDevice(a_m_k.mData.data());
-    b_device_buf.ToDevice(b_k_n.mData.data());
-    c_device_buf.ToDevice(c_m_n_device_result.mData.data());
-
-    auto test = [&](auto a_layout, auto b_layout, auto c_layout) {
-        bool success = false;
-
-        using DeviceOp = ck::tensor_operation::device::DeviceGemmSplitK<decltype(a_layout),
-                                                                        decltype(b_layout),
-                                                                        decltype(c_layout),
-                                                                        float,
-                                                                        float,
-                                                                        float,
-                                                                        PassThrough,
-                                                                        PassThrough,
-                                                                        PassThrough>;
-
-        const auto gemm_ptrs =
-            ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
-                DeviceOp>::GetInstances();
-
-        for(auto& gemm_ptr : gemm_ptrs)
-        {
-            auto argument_ptr =
-                gemm_ptr->MakeArgumentPointer(static_cast<float*>(a_device_buf.GetDeviceBuffer()),
-                                              static_cast<float*>(b_device_buf.GetDeviceBuffer()),
-                                              static_cast<float*>(c_device_buf.GetDeviceBuffer()),
-                                              args.M,
-                                              args.N,
-                                              args.K,
-                                              args.StrideA,
-                                              args.StrideB,
-                                              args.StrideC,
-                                              ck::tensor_operation::element_wise::PassThrough{},
-                                              ck::tensor_operation::element_wise::PassThrough{},
-                                              ck::tensor_operation::element_wise::PassThrough{},
-                                              args.KBatch);
-
-            auto invoker_ptr = gemm_ptr->MakeInvokerPointer();
-
-            if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
-            {
-                invoker_ptr->Run(argument_ptr.get());
-
-                c_device_buf.FromDevice(c_m_n_device_result.mData.data());
-
-                if(!check_out(c_m_n_host_result, c_m_n_device_result))
-                {
-                    success = false;
-                    break;
-                }
-                success = true;
-            }
-        }
-
-        return success;
-    };
-
-    bool success = false;
-
-    if(args.layout == GemmMatrixLayout::MK_KN_MN)
-    {
-        success = test(Row{}, Row{}, Row{});
-    }
-    else if(args.layout == GemmMatrixLayout::MK_NK_MN)
-    {
-        success = test(Row{}, Col{}, Row{});
-    }
-    else if(args.layout == GemmMatrixLayout::KM_KN_MN)
-    {
-        success = test(Col{}, Row{}, Row{});
-    }
-    else
-    {
-        success = test(Col{}, Col{}, Row{});
-    }
-
-    auto error_code = 0;
-    if(success)
-    {
-        std::cout << "test split k : Pass" << std::endl;
-    }
-    else
-    {
-        std::cout << "test split k: Fail " << std::endl;
-        error_code = -1; // test needs to report failure
-    }
-    return error_code;
-}
-
-int main(int argc, char* argv[])
-{
-    std::vector<gemmArgs> test_cases;
-    if(argc == 1)
-    {
-        test_cases = {{GemmMatrixLayout::MK_KN_MN, 1024, 1024, 1024, 1024, 1024, 1024, 2},
-                      {GemmMatrixLayout::MK_KN_MN, 1024, 1024, 1024, 1024, 1024, 1024, 8}};
-    }
-    else if(argc == 9)
-    {
-        const auto layout = static_cast<GemmMatrixLayout>(std::stoi(argv[1]));
-
-        const int M = std::stoi(argv[2]);
-        const int N = std::stoi(argv[3]);
-        const int K = std::stoi(argv[4]);
-
-        const int StrideA = std::stoi(argv[5]);
-        const int StrideB = std::stoi(argv[6]);
-        const int StrideC = std::stoi(argv[7]);
-        const int KBatch  = std::stoi(argv[8]);
-        test_cases        = {{layout, M, N, K, StrideA, StrideB, StrideC, KBatch}};
-    }
-    else
-    {
-        printf("arg1: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
-        printf("                     1: A[m, k] * B[n, k] = C[m, n];\n");
-        printf("                     2: A[k, m] * B[k, n] = C[m, n];\n");
-        printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
-        printf("arg2 to 7: M, N, K, StrideA, StrideB, StrideC KBatch\n");
-        return -1;
-    }
-    bool error = false;
-    for(const auto& kinder : test_cases)
-    {
-        error |= test_gemm(kinder);
-    }
-    return error ? 1 : 0;
-}
diff --git a/test/gemm_split_k/test_gemm_splitk.cpp b/test/gemm_split_k/test_gemm_splitk.cpp
new file mode 100644
index 000000000..9eba5bba3
--- /dev/null
+++ b/test/gemm_split_k/test_gemm_splitk.cpp
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <tuple>
+
+#include "gtest/gtest.h"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "test_gemm_splitk_util.hpp"
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+namespace {
+// Concatenates the element types of two std::tuple types into a single std::tuple type.
+template <typename X, typename Y>
+struct tuple_concat; // declared only; just the std::tuple pair specialization is defined
+
+template <typename... Xs, typename... Ys>
+struct tuple_concat<std::tuple<Xs...>, std::tuple<Ys...>>
+{
+    using type = std::tuple<Xs..., Ys...>; // Xs first, then Ys
+};
+
+} // namespace
+
+template <typename Tuple>
+class TestGemmSplitK_MK_KN
+    : public ck::test::TestGemmSplitK<typename tuple_concat<std::tuple<Row, Row>, Tuple>::type>
+{
+};
+
+template <typename Tuple> // Tuple: <ADataType, BDataType, CDataType>; layouts A = Row, B = Col
+class TestGemmSplitK_MK_NK
+    : public ck::test::TestGemmSplitK<typename tuple_concat<std::tuple<Row, Col>, Tuple>::type>
+{
+};
+
+template <typename Tuple> // Tuple: <ADataType, BDataType, CDataType>; layouts A = Col, B = Row
+class TestGemmSplitK_KM_KN
+    : public ck::test::TestGemmSplitK<typename tuple_concat<std::tuple<Col, Row>, Tuple>::type>
+{
+};
+
+template <typename Tuple> // Tuple: <ADataType, BDataType, CDataType>; layouts A = Col, B = Col
+class TestGemmSplitK_KM_NK
+    : public ck::test::TestGemmSplitK<typename tuple_concat<std::tuple<Col, Col>, Tuple>::type>
+{
+};
+
+// clang-format off
+using KernelTypes = ::testing::Types<
+    //         ADataType, BDataType, CDataType
+    std::tuple<      F16,       F16,       F16>,
+    std::tuple<      F32,       F32,       F32>
+    >;
+// clang-format on
+
+TYPED_TEST_SUITE(TestGemmSplitK_MK_KN, KernelTypes); // A = Row, B = Row
+TYPED_TEST_SUITE(TestGemmSplitK_MK_NK, KernelTypes); // A = Row, B = Col
+TYPED_TEST_SUITE(TestGemmSplitK_KM_KN, KernelTypes); // A = Col, B = Row
+TYPED_TEST_SUITE(TestGemmSplitK_KM_NK, KernelTypes); // A = Col, B = Col
+
+#include "test_gemm_splitk_ut_cases.inc"
diff --git a/test/gemm_split_k/test_gemm_splitk_ut_cases.inc b/test/gemm_split_k/test_gemm_splitk_ut_cases.inc
new file mode 100644
index 000000000..54b9c6c9e
--- /dev/null
+++ b/test/gemm_split_k/test_gemm_splitk_ut_cases.inc
@@ -0,0 +1,217 @@
+#pragma once
+
+TYPED_TEST(TestGemmSplitK_MK_KN, SmallM)
+{
+    std::vector<int> Ms{0, 1, 2, 3, 4, 5, 6}; // tiny M, including M == 0
+    constexpr int N = 512;
+    constexpr int K = 320;
+
+    constexpr int StrideA = K; // packed row-major A[m, k]
+    constexpr int StrideB = N; // packed row-major B[k, n]
+    constexpr int StrideC = N; // packed row-major C[m, n]
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmSplitK_MK_NK, SmallM)
+{
+    std::vector<int> Ms{0, 1, 2, 3, 4, 5, 6}; // tiny M, including M == 0
+    constexpr int N = 512;
+    constexpr int K = 320;
+
+    constexpr int StrideA = K; // packed row-major A[m, k]
+    constexpr int StrideB = K; // packed column-major B, i.e. stored as B[n, k]
+    constexpr int StrideC = N; // packed row-major C[m, n]
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmSplitK_KM_KN, SmallM)
+{
+    std::vector<int> Ms{0, 1, 2, 3, 4, 5, 6}; // tiny M, including M == 0
+    constexpr int N = 512;
+    constexpr int K = 320;
+
+    constexpr int StrideB = N; // packed row-major B[k, n]
+    constexpr int StrideC = N; // packed row-major C[m, n]
+
+    for(int M : Ms)
+        this->Run(M, N, K, M, StrideB, StrideC); // StrideA = M (packed A[k, m]); 0 when M == 0
+}
+
+TYPED_TEST(TestGemmSplitK_KM_NK, SmallM)
+{
+    std::vector<int> Ms{0, 1, 2, 3, 4, 5, 6}; // tiny M, including M == 0
+    constexpr int N = 512;
+    constexpr int K = 320;
+
+    constexpr int StrideB = K; // packed column-major B, i.e. stored as B[n, k]
+    constexpr int StrideC = N; // packed row-major C[m, n]
+
+    for(int M : Ms)
+        this->Run(M, N, K, M, StrideB, StrideC); // StrideA = M (packed A[k, m]); 0 when M == 0
+}
+
+TYPED_TEST(TestGemmSplitK_MK_KN, MidLargeM)
+{
+    std::vector<int> Ms{127, 255, 312, 799, 1573}; // mid-to-large M, mostly odd values
+    constexpr int N = 512;
+    constexpr int K = 320;
+
+    constexpr int StrideA = K; // packed row-major A[m, k]
+    constexpr int StrideB = N; // packed row-major B[k, n]
+    constexpr int StrideC = N; // packed row-major C[m, n]
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmSplitK_MK_NK, MidLargeM)
+{
+    std::vector<int> Ms{127, 255, 312, 799, 1573}; // mid-to-large M, mostly odd values
+    constexpr int N = 512;
+    constexpr int K = 320;
+
+    constexpr int StrideA = K; // packed row-major A[m, k]
+    constexpr int StrideB = K; // packed column-major B, i.e. stored as B[n, k]
+    constexpr int StrideC = N; // packed row-major C[m, n]
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmSplitK_KM_KN, MidLargeM)
+{
+    std::vector<int> Ms{127, 255, 312, 799, 1573}; // mid-to-large M, mostly odd values
+    constexpr int N = 512;
+    constexpr int K = 320;
+
+    constexpr int StrideB = N; // packed row-major B[k, n]
+    constexpr int StrideC = N; // packed row-major C[m, n]
+
+    for(int M : Ms)
+        this->Run(M, N, K, M, StrideB, StrideC); // StrideA = M for packed A[k, m]
+}
+
+TYPED_TEST(TestGemmSplitK_KM_NK, MidLargeM)
+{
+    std::vector<int> Ms{127, 255, 312, 799, 1573}; // mid-to-large M, mostly odd values
+    constexpr int N = 512;
+    constexpr int K = 320;
+
+    constexpr int StrideB = K; // packed column-major B, i.e. stored as B[n, k]
+    constexpr int StrideC = N; // packed row-major C[m, n]
+
+    for(int M : Ms)
+        this->Run(M, N, K, M, StrideB, StrideC); // StrideA = M for packed A[k, m]
+}
+
+TYPED_TEST(TestGemmSplitK_MK_KN, PaddK)
+{
+    std::vector<int> Ms{127};
+    constexpr int N = 512;
+    constexpr int K = 437; // odd K; presumably exercises K padding (per test name) - confirm
+
+    constexpr int StrideA = K; // packed row-major A[m, k]
+    constexpr int StrideB = N; // packed row-major B[k, n]
+    constexpr int StrideC = N; // packed row-major C[m, n]
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmSplitK_MK_NK, PaddK)
+{
+    std::vector<int> Ms{127};
+    constexpr int N = 512;
+    constexpr int K = 437; // odd K; presumably exercises K padding (per test name) - confirm
+
+    constexpr int StrideA = K; // packed row-major A[m, k]
+    constexpr int StrideB = K; // packed column-major B, i.e. stored as B[n, k]
+    constexpr int StrideC = N; // packed row-major C[m, n]
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmSplitK_KM_KN, PaddK)
+{
+    std::vector<int> Ms{127};
+    constexpr int N = 512;
+    constexpr int K = 437; // odd K; presumably exercises K padding (per test name) - confirm
+
+    constexpr int StrideB = N; // packed row-major B[k, n]
+    constexpr int StrideC = N; // packed row-major C[m, n]
+
+    for(int M : Ms)
+        this->Run(M, N, K, M, StrideB, StrideC); // StrideA = M for packed A[k, m]
+}
+
+TYPED_TEST(TestGemmSplitK_KM_NK, PaddK)
+{
+    std::vector<int> Ms{127};
+    constexpr int N = 512;
+    constexpr int K = 437; // odd K; presumably exercises K padding (per test name) - confirm
+
+    constexpr int StrideB = K; // packed column-major B, i.e. stored as B[n, k]
+    constexpr int StrideC = N; // packed row-major C[m, n]
+
+    for(int M : Ms)
+        this->Run(M, N, K, M, StrideB, StrideC); // StrideA = M for packed A[k, m]
+}
+
+TYPED_TEST(TestGemmSplitK_MK_KN, Regular)
+{
+    std::vector<int> Ms{512}; // with N and K below: a single 512 x 512 x 512 problem
+    constexpr int N = 512;
+    constexpr int K = 512;
+
+    constexpr int StrideA = K; // packed row-major A[m, k]
+    constexpr int StrideB = N; // packed row-major B[k, n]
+    constexpr int StrideC = N; // packed row-major C[m, n]
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmSplitK_MK_NK, Regular)
+{
+    std::vector<int> Ms{512}; // with N and K below: a single 512 x 512 x 512 problem
+    constexpr int N = 512;
+    constexpr int K = 512;
+
+    constexpr int StrideA = K; // packed row-major A[m, k]
+    constexpr int StrideB = K; // packed column-major B, i.e. stored as B[n, k]
+    constexpr int StrideC = N; // packed row-major C[m, n]
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmSplitK_KM_KN, Regular)
+{
+    std::vector<int> Ms{512}; // with N and K below: a single 512 x 512 x 512 problem
+    constexpr int N = 512;
+    constexpr int K = 512;
+
+    constexpr int StrideB = N; // packed row-major B[k, n]
+    constexpr int StrideC = N; // packed row-major C[m, n]
+
+    for(int M : Ms)
+        this->Run(M, N, K, M, StrideB, StrideC); // StrideA = M for packed A[k, m]
+}
+
+TYPED_TEST(TestGemmSplitK_KM_NK, Regular)
+{
+    std::vector<int> Ms{512}; // with N and K below: a single 512 x 512 x 512 problem
+    constexpr int N = 512;
+    constexpr int K = 512;
+
+    constexpr int StrideB = K; // packed column-major B, i.e. stored as B[n, k]
+    constexpr int StrideC = N; // packed row-major C[m, n]
+
+    for(int M : Ms)
+        this->Run(M, N, K, M, StrideB, StrideC); // StrideA = M for packed A[k, m]
+}
diff --git a/test/gemm_split_k/test_gemm_splitk_util.hpp b/test/gemm_split_k/test_gemm_splitk_util.hpp
new file mode 100644
index 000000000..8243747a6
--- /dev/null
+++ b/test/gemm_split_k/test_gemm_splitk_util.hpp
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <string>
+#include <sstream>
+#include <tuple>
+#include <vector>
+#include <gtest/gtest.h>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "include/ck/utility/data_type.hpp"
+#include "profiler/profile_gemm_splitk_impl.hpp"
+
+namespace ck {
+namespace test {
+
+// gtest fixture for the split-K GEMM profiler; Tuple = <ALayout, BLayout, AData, BData, CData>.
+template <typename Tuple>
+class TestGemmSplitK : public testing::Test
+{
+    using F32 = float;
+    using Row = ck::tensor_layout::gemm::RowMajor;
+
+    protected:
+    using ALayout   = std::tuple_element_t<0, Tuple>;
+    using BLayout   = std::tuple_element_t<1, Tuple>;
+    using ADataType = std::tuple_element_t<2, Tuple>;
+    using BDataType = std::tuple_element_t<3, Tuple>;
+    using CDataType = std::tuple_element_t<4, Tuple>;
+    using CLayout   = Row; // the output layout is fixed, not taken from Tuple
+
+    public:
+    static constexpr bool verify_     = true;
+    static constexpr int init_method_ = 1; // decimal value initialization
+    static constexpr bool log_        = false;
+    static constexpr bool bench_      = false; // measure kernel performance
+    std::vector<int> k_batches_;
+
+    void SetUp() override { k_batches_ = {1, 2, 3, 5, 8}; }
+    // Calls RunSingle() once per entry of k_batches_.
+    void Run(const int M,
+             const int N,
+             const int K,
+             const int StrideA,
+             const int StrideB,
+             const int StrideC)
+    {
+        for(const int kbatch : k_batches_)
+            RunSingle(M, N, K, StrideA, StrideB, StrideC, kbatch);
+    }
+
+    // Profiles a single problem and records a test failure if the profiler returns false.
+    void RunSingle(const int M,
+                   const int N,
+                   const int K,
+                   const int StrideA,
+                   const int StrideB,
+                   const int StrideC,
+                   int kbatch = 1)
+    {
+        const bool pass = ck::profiler::profile_gemm_splitk_impl<ADataType,
+                                                                 BDataType,
+                                                                 F32,
+                                                                 CDataType,
+                                                                 ALayout,
+                                                                 BLayout,
+                                                                 CLayout>(
+            verify_, init_method_, log_, bench_, M, N, K, StrideA, StrideB, StrideC, kbatch);
+        EXPECT_TRUE(pass);
+    }
+};
+
+} // namespace test
+} // namespace ck
diff --git a/test/grouped_gemm/CMakeLists.txt b/test/grouped_gemm/CMakeLists.txt
index a7619eac6..40f634d8b 100644
--- a/test/grouped_gemm/CMakeLists.txt
+++ b/test/grouped_gemm/CMakeLists.txt
@@ -1,5 +1,9 @@
 if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
-   add_test_executable(test_grouped_gemm_fp16 grouped_gemm_fp16.cpp)
-   target_link_libraries(test_grouped_gemm_fp16 PRIVATE utility)
-   target_link_libraries(test_grouped_gemm_fp16 PRIVATE device_grouped_gemm_instance)
+   add_custom_target(test_grouped_gemm)
+   add_gtest_executable(test_grouped_gemm_splitk test_grouped_gemm_splitk.cpp)
+   add_gtest_executable(test_grouped_gemm_interface test_grouped_gemm_interface.cpp)
+   target_link_libraries(test_grouped_gemm_splitk PRIVATE utility device_grouped_gemm_instance)
+   target_link_libraries(test_grouped_gemm_interface PRIVATE utility device_grouped_gemm_instance)
+   # Umbrella target: test_grouped_gemm depends on both test executables above.
+   add_dependencies(test_grouped_gemm test_grouped_gemm_splitk test_grouped_gemm_interface)
 endif()
diff --git a/test/grouped_gemm/grouped_gemm_fp16.cpp b/test/grouped_gemm/grouped_gemm_fp16.cpp
deleted file mode 100644
index f20d750d3..000000000
--- a/test/grouped_gemm/grouped_gemm_fp16.cpp
+++ /dev/null
@@ -1,69 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <iostream>
-#include <random>
-
-#include "profiler/profile_grouped_gemm_impl.hpp"
-
-namespace {
-
-using ADataType   = ck::half_t;
-using BDataType   = ck::half_t;
-using CDataType   = ck::half_t;
-using AccDataType = float;
-
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
-
-template <typename ALayout, typename BLayout, typename CLayout>
-bool TestGroupedGemm()
-{
-
-    std::mt19937 gen(19391);
-    std::uniform_int_distribution<> distrib(1, 10);
-    int group_count = distrib(gen);
-
-    // GEMM shape
-    std::vector<ck::tensor_operation::device::GemmDesc> gemm_descs;
-    std::vector<const void*> p_a, p_b;
-    std::vector<void*> p_c;
-
-    std::vector<int> Ms, Ns, Ks, StrideAs, StrideBs, StrideCs;
-
-    for(int i = 0; i < group_count; i++)
-    {
-        Ms.push_back(256 + 256 * distrib(gen));
-        Ns.push_back(256 + 256 * distrib(gen));
-        Ks.push_back(128 + 128 * distrib(gen));
-
-        StrideAs.push_back(std::is_same<Row, ALayout>::value ? Ks[i] : Ms[i]);
-        StrideBs.push_back(std::is_same<Row, BLayout>::value ? Ns[i] : Ks[i]);
-        StrideCs.push_back(std::is_same<Row, CLayout>::value ? Ns[i] : Ms[i]);
-    }
-
-    return ck::profiler::profile_grouped_gemm_impl<ADataType,
-                                                   BDataType,
-                                                   CDataType,
-                                                   AccDataType,
-                                                   ALayout,
-                                                   BLayout,
-                                                   CLayout>(
-        true, 1, false, 1, Ms, Ns, Ks, StrideAs, StrideBs, StrideCs);
-}
-
-} // anonymous namespace
-
-int main()
-{
-    bool res = true;
-
-    res = res && TestGroupedGemm<Row, Row, Row>();
-    res = res && TestGroupedGemm<Row, Col, Row>();
-    res = res && TestGroupedGemm<Col, Row, Row>();
-    res = res && TestGroupedGemm<Col, Col, Row>();
-
-    std::cout << "TestGroupedGemm ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
-
-    return res ? 0 : 1;
-}
diff --git a/test/grouped_gemm/test_grouped_gemm_interface.cpp b/test/grouped_gemm/test_grouped_gemm_interface.cpp
new file mode 100644
index 000000000..ffa8840fc
--- /dev/null
+++ b/test/grouped_gemm/test_grouped_gemm_interface.cpp
@@ -0,0 +1,202 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <stdexcept>
+#include <vector>
+#include "gtest/gtest.h"
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "test_grouped_gemm_util.hpp"
+
+class TestGGemmSplitKInterface_MKNKMN : public ::testing::Test
+{
+    protected:
+    using Row = ck::tensor_layout::gemm::RowMajor;
+    using Col = ck::tensor_layout::gemm::ColumnMajor;
+    // Layouts exercised by this fixture: A row-major, B column-major, E row-major.
+    using ALayout = Row;
+    using BLayout = Col;
+    using ELayout = Row;
+
+    static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+    // Wrapper alias with the layouts above fixed; the remaining parameters stay configurable.
+    template <ck::tensor_operation::device::GemmSpecialization GemmSpec,
+              ck::index_t KPerBlock,
+              ck::index_t K1,
+              ck::index_t ABlockTransferSrcScalarPerVector,
+              ck::index_t BBlockTransferSrcScalarPerVector,
+              ck::index_t CDEBlockTransferScalarPerVector_NPerBlock>
+    using GGemmInstance =
+        ck::test::DeviceGroupedGemmSplitkInstanceWrapper<ALayout,
+                                                         BLayout,
+                                                         ELayout,
+                                                         GemmSpec,
+                                                         KPerBlock,
+                                                         K1,
+                                                         ABlockTransferSrcScalarPerVector,
+                                                         BBlockTransferSrcScalarPerVector,
+                                                         CDEBlockTransferScalarPerVector_NPerBlock>;
+    // Default specialization, KPerBlock = 32, K1 = 8, A/B source vectors 4/8, CDE vector 8.
+    using DefaultGGemmInstance = GGemmInstance<GemmDefault, 32, 8, 4, 8, 8>;
+};
+
+TEST_F(TestGGemmSplitKInterface_MKNKMN, TileSize)
+{
+    // Four groups, all starting from N = 256 and K = 128.
+    std::vector<int> Ms{128, 256, 188, 512};
+    const auto group_count = Ms.size();
+
+    std::vector<int> Ns(group_count, 256);
+    std::vector<int> Ks(group_count, 128);
+    std::vector<int> StrideAs = Ks;
+    std::vector<int> StrideBs = Ks;
+    std::vector<int> StrideCs = Ns;
+
+    // M = 188 targets the M % MPerBlock check.
+    EXPECT_FALSE(DefaultGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs));
+
+    // N = 177 targets the N % NPerBlock check.
+    Ms = {256, 128, 128, 512};
+    Ns = {256, 177, 128, 512};
+    EXPECT_FALSE(DefaultGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs));
+}
+
+TEST_F(TestGGemmSplitKInterface_MKNKMN, VectorLoadWidth)
+{
+    // Padded instance with source vector widths A = 4, B = 8 and CDE vector width 8.
+    static constexpr auto GemmMNKPadding =
+        ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+    using PaddedGGemmInstance = GGemmInstance<GemmMNKPadding, 32, 8, 4, 8, 8>;
+
+    const std::vector<int> Ms{128, 256, 256, 512};
+    const auto group_count = Ms.size();
+
+    std::vector<int> Ns(group_count, 256);
+    std::vector<int> Ks(group_count, 512);
+    const std::vector<int> StrideAs = Ks;
+    const std::vector<int> StrideBs = Ks;
+    const std::vector<int> StrideCs = Ns;
+
+    // K = 177 targets the K % ABlockTransferSrcScalarPerVector check.
+    Ks = {256, 177, 128, 512};
+    EXPECT_FALSE(PaddedGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs));
+
+    // K = 164 is a multiple of 4 but not of 8: targets K % BBlockTransferSrcScalarPerVector.
+    Ks = {256, 164, 128, 512};
+    EXPECT_FALSE(PaddedGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs));
+
+    // N = 127 targets the N % CDEBlockTransferScalarPerVector_NPerBlock check.
+    Ks.assign(group_count, 128);
+    Ns = {256, 127, 128, 512};
+    EXPECT_FALSE(PaddedGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs));
+}
+
+TEST_F(TestGGemmSplitKInterface_MKNKMN, KLoops)
+{
+    // Split-K specific checks: every call below passes kbatch = 4 and differs only in
+    // the per-group K values.
+    constexpr int kbatch = 4;
+
+    const std::vector<int> Ms{128, 256, 256, 512};
+    const auto group_count = Ms.size();
+    const std::vector<int> Ns(group_count, 256);
+    const std::vector<int> StrideAs(group_count, 128);
+    const std::vector<int> StrideBs(group_count, 128);
+    const std::vector<int> StrideCs(group_count, 256);
+
+    // kloops % 2
+    std::vector<int> Ks{256, 512, 320, 768};
+    EXPECT_FALSE(
+        DefaultGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, kbatch));
+
+    // Not all gemms have same value for main_k0_block_loop!
+    Ks = {256, 512, 512, 512};
+    EXPECT_THROW(DefaultGGemmInstance{}.Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, kbatch),
+                 std::runtime_error);
+}
+
+class TestGGemmSplitKInterface_KMKNNM : public ::testing::Test
+{
+    protected:
+    using Row = ck::tensor_layout::gemm::RowMajor;
+    using Col = ck::tensor_layout::gemm::ColumnMajor;
+    // Layouts exercised by this fixture: A column-major, B row-major, E column-major.
+    using ALayout = Col;
+    using BLayout = Row;
+    using ELayout = Col;
+
+    static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+    // Wrapper alias with the layouts above fixed; the remaining parameters stay configurable.
+    template <ck::tensor_operation::device::GemmSpecialization GemmSpec,
+              ck::index_t KPerBlock,
+              ck::index_t K1,
+              ck::index_t ABlockTransferSrcScalarPerVector,
+              ck::index_t BBlockTransferSrcScalarPerVector,
+              ck::index_t CDEBlockTransferScalarPerVector_NPerBlock>
+    using GGemmInstance =
+        ck::test::DeviceGroupedGemmSplitkInstanceWrapper<ALayout,
+                                                         BLayout,
+                                                         ELayout,
+                                                         GemmSpec,
+                                                         KPerBlock,
+                                                         K1,
+                                                         ABlockTransferSrcScalarPerVector,
+                                                         BBlockTransferSrcScalarPerVector,
+                                                         CDEBlockTransferScalarPerVector_NPerBlock>;
+    // Default specialization, KPerBlock = 32, K1 = 8, A/B source vectors 4/8, CDE vector 4.
+    using DefaultGGemmInstance = GGemmInstance<GemmDefault, 32, 8, 4, 8, 4>;
+};
+
+TEST_F(TestGGemmSplitKInterface_KMKNNM, TileSize)
+{
+    // Four groups, all starting from N = 256 and K = 128.
+    std::vector<int> Ms{128, 256, 188, 512};
+    const auto group_count = Ms.size();
+
+    std::vector<int> Ns(group_count, 256);
+    std::vector<int> Ks(group_count, 128);
+    std::vector<int> StrideAs = Ks;
+    std::vector<int> StrideBs = Ks;
+    std::vector<int> StrideCs = Ns;
+
+    // M = 188 targets the M % MPerBlock check.
+    EXPECT_FALSE(DefaultGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs));
+
+    // N = 177 targets the N % NPerBlock check.
+    Ms = {128, 256, 256, 512};
+    Ns = {256, 177, 128, 512};
+    EXPECT_FALSE(DefaultGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs));
+}
+
+TEST_F(TestGGemmSplitKInterface_KMKNNM, VectorLoadWidth)
+{
+    // Padded instance with source vector widths A = 2, B = 8 and CDE vector width 4.
+    static constexpr auto GemmMNKPadding =
+        ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+    using PaddedGGemmInstance = GGemmInstance<GemmMNKPadding, 32, 8, 2, 8, 4>;
+
+    std::vector<int> Ms{128, 256, 256, 512};
+    const auto group_count = Ms.size();
+
+    std::vector<int> Ns(group_count, 256);
+    const std::vector<int> Ks(group_count, 512);
+    const std::vector<int> StrideAs = Ks;
+    const std::vector<int> StrideBs = Ks;
+    const std::vector<int> StrideCs = Ns;
+
+    // M = 177 targets the M % ABlockTransferSrcScalarPerVector check.
+    Ms = {256, 177, 128, 512};
+    EXPECT_FALSE(PaddedGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs));
+
+    // N = 164 targets the N % BBlockTransferSrcScalarPerVector check.
+    Ms = {128, 256, 256, 512};
+    Ns = {256, 164, 128, 512};
+    EXPECT_FALSE(PaddedGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs));
+
+    // M = 130 is even but not a multiple of 4: targets M % CDEBlockTransferScalarPerVector_NPerBlock.
+    Ns = {128, 256, 256, 512};
+    Ms = {256, 130, 128, 512};
+    EXPECT_FALSE(PaddedGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs));
+}
diff --git a/test/grouped_gemm/test_grouped_gemm_splitk.cpp b/test/grouped_gemm/test_grouped_gemm_splitk.cpp
new file mode 100644
index 000000000..d9282fa92
--- /dev/null
+++ b/test/grouped_gemm/test_grouped_gemm_splitk.cpp
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <tuple>
+#include <vector>
+
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/utility/data_type.hpp"
+
+#include "gtest/gtest.h"
+#include "test_grouped_gemm_util.hpp"
+
+using F16 = ck::half_t;
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+// Fixture tuple order: <ALayout, BLayout, ELayout, ADataType, BDataType, EDataType>.
+using RRR_F16_F16_F16 = ck::test::TestGroupedGemm<std::tuple<Row, Row, Row, F16, F16, F16>>;
+using RCR_F16_F16_F16 = ck::test::TestGroupedGemm<std::tuple<Row, Col, Row, F16, F16, F16>>;
+// The *_LargeK aliases name the same two types; they only provide separate suite names.
+using RRR_F16_F16_F16_LargeK = ck::test::TestGroupedGemm<std::tuple<Row, Row, Row, F16, F16, F16>>;
+using RCR_F16_F16_F16_LargeK = ck::test::TestGroupedGemm<std::tuple<Row, Col, Row, F16, F16, F16>>;
+
+const std::vector<int> KBATCH{1, 2, 3, 5, 8}; // test parameter values for the regular suites
+
+INSTANTIATE_TEST_SUITE_P(TestGroupedGemm_splitk_MK_KN, RRR_F16_F16_F16, testing::ValuesIn(KBATCH));
+INSTANTIATE_TEST_SUITE_P(TestGroupedGemm_splitk_MK_NK, RCR_F16_F16_F16, testing::ValuesIn(KBATCH));
+INSTANTIATE_TEST_SUITE_P(TestGroupedGemm_splitk_LargeK_MK_KN,
+                         RRR_F16_F16_F16_LargeK,
+                         testing::Values(32, 64));
+INSTANTIATE_TEST_SUITE_P(TestGroupedGemm_splitk_LargeK_MK_NK,
+                         RCR_F16_F16_F16_LargeK,
+                         testing::Values(32, 64));
+
+#include "test_grouped_gemm_ut_cases.inc"
diff --git a/test/grouped_gemm/test_grouped_gemm_ut_cases.inc b/test/grouped_gemm/test_grouped_gemm_ut_cases.inc
new file mode 100644
index 000000000..d94d140d9
--- /dev/null
+++ b/test/grouped_gemm/test_grouped_gemm_ut_cases.inc
@@ -0,0 +1,180 @@
+#pragma once
+
+TEST_P(RRR_F16_F16_F16, TinyCases)
+{
+    // Two groups with M = 0 and M = 1; N = 768 and K = 544 for both.
+    const std::vector<int> Ms{0, 1};
+    const auto group_count = Ms.size();
+
+    const std::vector<int> Ns(group_count, 768);
+    const std::vector<int> Ks(group_count, 544);
+    const std::vector<int> StrideAs(Ks); // A stride == K
+    const std::vector<int> StrideBs(Ns); // B stride == N
+    const std::vector<int> StrideCs(Ns); // C stride == N
+
+    this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam());
+}
+
+TEST_P(RRR_F16_F16_F16, SmallCases)
+{
+    // Six groups with M between 0 and 5; N = 768 and K = 544 for all.
+    const std::vector<int> Ms{2, 1, 3, 4, 5, 0};
+    const auto group_count = Ms.size();
+
+    const std::vector<int> Ns(group_count, 768);
+    const std::vector<int> Ks(group_count, 544);
+    const std::vector<int> StrideAs(Ks); // A stride == K
+    const std::vector<int> StrideBs(Ns); // B stride == N
+    const std::vector<int> StrideCs(Ns); // C stride == N
+
+    this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam());
+}
+
+TEST_P(RRR_F16_F16_F16, MidCases)
+{
+    // Six groups with M between 139 and 204; N = 768 and K = 544 for all.
+    const std::vector<int> Ms{167, 183, 177, 153, 139, 204};
+    const auto group_count = Ms.size();
+
+    const std::vector<int> Ns(group_count, 768);
+    const std::vector<int> Ks(group_count, 544);
+    const std::vector<int> StrideAs(Ks); // A stride == K
+    const std::vector<int> StrideBs(Ns); // B stride == N
+    const std::vector<int> StrideCs(Ns); // C stride == N
+
+    this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam());
+}
+
+TEST_P(RRR_F16_F16_F16, Regular)
+{
+    // Three groups with M = 64, 128, 256; N = 768 and K = 320 for all.
+    const std::vector<int> Ms{64, 128, 256};
+    const auto group_count = Ms.size();
+
+    const std::vector<int> Ns(group_count, 768);
+    const std::vector<int> Ks(group_count, 320);
+    const std::vector<int> StrideAs(Ks); // A stride == K
+    const std::vector<int> StrideBs(Ns); // B stride == N
+    const std::vector<int> StrideCs(Ns); // C stride == N
+
+    this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam());
+}
+
+TEST_P(RRR_F16_F16_F16, MNKPadded)
+{
+    // Four groups with M = 127, 150, 188, 210; N = 136 and K = 280 for all.
+    const std::vector<int> Ms{127, 150, 188, 210};
+    const auto group_count = Ms.size();
+
+    const std::vector<int> Ns(group_count, 136);
+    const std::vector<int> Ks(group_count, 280);
+    const std::vector<int> StrideAs(Ks); // A stride == K
+    const std::vector<int> StrideBs(Ns); // B stride == N
+    const std::vector<int> StrideCs(Ns); // C stride == N
+
+    this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam());
+}
+
+TEST_P(RCR_F16_F16_F16, TinyCases)
+{
+    // Two groups with M = 0 and M = 1; N = 768 and K = 544 for both.
+    const std::vector<int> Ms{0, 1};
+    const auto group_count = Ms.size();
+    const std::vector<int> Ns(group_count, 768);
+    const std::vector<int> Ks(group_count, 544);
+    const std::vector<int> StrideAs(Ks); // A stride == K
+    const std::vector<int> StrideBs(Ks); // B stride == K
+    const std::vector<int> StrideCs(Ns); // C stride == N
+
+    this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam());
+}
+
+TEST_P(RCR_F16_F16_F16, SmallCases)
+{
+    // Six groups with M between 0 and 5; N = 768 and K = 544 for all.
+    const std::vector<int> Ms{2, 1, 3, 4, 5, 0};
+    const auto group_count = Ms.size();
+
+    const std::vector<int> Ns(group_count, 768);
+    const std::vector<int> Ks(group_count, 544);
+    const std::vector<int> StrideAs(Ks); // A stride == K
+    const std::vector<int> StrideBs(Ks); // B stride == K
+    const std::vector<int> StrideCs(Ns); // C stride == N
+
+    this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam());
+}
+
+TEST_P(RCR_F16_F16_F16, MidCases)
+{
+    // Six groups with M between 139 and 204; N = 768 and K = 544 for all.
+    const std::vector<int> Ms{167, 183, 177, 153, 139, 204};
+    const auto group_count = Ms.size();
+
+    const std::vector<int> Ns(group_count, 768);
+    const std::vector<int> Ks(group_count, 544);
+    const std::vector<int> StrideAs(Ks); // A stride == K
+    const std::vector<int> StrideBs(Ks); // B stride == K
+    const std::vector<int> StrideCs(Ns); // C stride == N
+
+    this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam());
+}
+
+TEST_P(RCR_F16_F16_F16, Regular)
+{
+    // Four groups with M = 32, 64, 128, 256; N = 768 and K = 320 for all.
+    const std::vector<int> Ms{32, 64, 128, 256};
+    const auto group_count = Ms.size();
+
+    const std::vector<int> Ns(group_count, 768);
+    const std::vector<int> Ks(group_count, 320);
+    const std::vector<int> StrideAs(Ks); // A stride == K
+    const std::vector<int> StrideBs(Ks); // B stride == K
+    const std::vector<int> StrideCs(Ns); // C stride == N
+
+    this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam());
+}
+
+TEST_P(RCR_F16_F16_F16, MNKPadded)
+{
+    // Four groups with M = 127, 150, 188, 210; N = 136 and K = 280 for all.
+    const std::vector<int> Ms{127, 150, 188, 210};
+    const auto group_count = Ms.size();
+
+    const std::vector<int> Ns(group_count, 136);
+    const std::vector<int> Ks(group_count, 280);
+    const std::vector<int> StrideAs(Ks); // A stride == K
+    const std::vector<int> StrideBs(Ks); // B stride == K
+    const std::vector<int> StrideCs(Ns); // C stride == N
+
+    this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam());
+}
+
+TEST_P(RRR_F16_F16_F16_LargeK, TestLargeKBatch)
+{
+    // Two groups with M = 188 and M = 210; N = 768 and K = 4096 for both.
+    const std::vector<int> Ms{188, 210};
+    const auto group_count = Ms.size();
+
+    const std::vector<int> Ns(group_count, 768);
+    const std::vector<int> Ks(group_count, 4096);
+    const std::vector<int> StrideAs(Ks); // A stride == K
+    const std::vector<int> StrideBs(Ns); // B stride == N
+    const std::vector<int> StrideCs(Ns); // C stride == N
+
+    this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam());
+}
+
+TEST_P(RCR_F16_F16_F16_LargeK, TestLargeKBatch)
+{
+    // Two groups with M = 188 and M = 210; N = 768 and K = 4096 for both.
+    const std::vector<int> Ms{188, 210};
+    const auto group_count = Ms.size();
+
+    const std::vector<int> Ns(group_count, 768);
+    const std::vector<int> Ks(group_count, 4096);
+    const std::vector<int> StrideAs(Ks); // A stride == K
+    const std::vector<int> StrideBs(Ks); // B stride == K
+    const std::vector<int> StrideCs(Ns); // C stride == N
+
+    this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam());
+}
diff --git a/test/grouped_gemm/test_grouped_gemm_util.hpp b/test/grouped_gemm/test_grouped_gemm_util.hpp
new file mode 100644
index 000000000..b61118b51
--- /dev/null
+++ b/test/grouped_gemm/test_grouped_gemm_util.hpp
@@ -0,0 +1,249 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <array>
+#include <string>
+#include <sstream>
+#include <tuple>
+#include <vector>
+#include <gtest/gtest.h>
+
+#include "ck/ck.hpp"
+#include "ck/stream_config.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/utility/data_type.hpp"
+#include "ck/utility/sequence.hpp"
+#include "ck/utility/tuple.hpp"
+#include "ck/utility/number.hpp"
+#include "profiler/profile_grouped_gemm_impl.hpp"
+
+namespace ck {
+namespace test {
+
+// Joins the elements of `range` with ", "; an empty range yields an empty string.
+template <typename Range>
+std::string serialize_range(const Range& range)
+{
+    std::stringstream ss;
+    for(auto& r : range)
+        ss << r << ", ";
+    std::string str = ss.str();
+    // Trim the trailing ", " only when something was written (empty range -> "").
+    return str.empty() ? str : str.substr(0, str.size() - 2);
+}
+
+// Value-parameterized gtest fixture (parameter type int) for the grouped GEMM profiler.
+// Tuple = <ALayout, BLayout, ELayout, ADataType, BDataType, EDataType>.
+template <typename Tuple>
+class TestGroupedGemm : public testing::TestWithParam<int>
+{
+    protected:
+    using ALayout   = std::tuple_element_t<0, Tuple>;
+    using BLayout   = std::tuple_element_t<1, Tuple>;
+    using ELayout   = std::tuple_element_t<2, Tuple>;
+    using ADataType = std::tuple_element_t<3, Tuple>;
+    using BDataType = std::tuple_element_t<4, Tuple>;
+    using EDataType = std::tuple_element_t<5, Tuple>;
+
+    public:
+    static constexpr bool verify_     = true;
+    static constexpr int init_method_ = 1; // decimal value initialization
+    static constexpr bool log_        = false;
+    static constexpr bool bench_      = false; // measure kernel performance
+
+    void SetUp() override {}
+
+    // Profiles all groups in one call and records a test failure if the profiler returns false.
+    void Run(const std::vector<int>& Ms,
+             const std::vector<int>& Ns,
+             const std::vector<int>& Ks,
+             const std::vector<int>& StrideAs,
+             const std::vector<int>& StrideBs,
+             const std::vector<int>& StrideCs,
+             int kbatch = 1)
+    {
+        const bool pass =
+            ck::profiler::profile_grouped_gemm_impl<ADataType, BDataType, EDataType, float,
+                                                    ALayout, BLayout, ELayout>(
+                verify_, init_method_, log_, bench_,
+                Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, kbatch);
+        EXPECT_TRUE(pass);
+    }
+};
+
+template <typename ALayout,
+          typename BLayout,
+          typename ELayout,
+          tensor_operation::device::GemmSpecialization GemmSpec,
+          ck::index_t KPerBlock,
+          ck::index_t K1,
+          ck::index_t ABlockTransferSrcScalarPerVector,
+          ck::index_t BBlockTransferSrcScalarPerVector,
+          index_t CDEBlockTransferScalarPerVector_NPerBlock>
+struct DeviceGroupedGemmSplitkInstanceWrapper // test helper around one F16 split-K grouped GEMM op
+{
+    using F16         = half_t;
+    using F32         = float;
+    using Row         = ck::tensor_layout::gemm::RowMajor;
+    using Col         = ck::tensor_layout::gemm::ColumnMajor;
+    using PassThrough = tensor_operation::element_wise::PassThrough;
+
+    using EmptyTuple = ck::Tuple<>;
+
+    template <ck::index_t... Is>
+    using S = ck::Sequence<Is...>;
+
+    template <ck::index_t N>
+    using I = ck::Number<N>;
+    // A-side block-transfer settings, each selected by whether ALayout is Row.
+    using ABlockTransferThreadClusterArrageOrder =
+        std::conditional_t<std::is_same_v<ALayout, Row>, S<0, 2, 1, 3>, S<0, 1, 3, 2>>;
+    using ABlockTransferSrcAccessOrder =
+        std::conditional_t<std::is_same_v<ALayout, Row>, S<0, 2, 1, 3>, S<0, 1, 3, 2>>;
+    using ABlockTransferSrcVectorDim = std::conditional_t<std::is_same_v<ALayout, Row>, I<3>, I<2>>;
+    using ABlockTransferDstScalarPerVector_K1 =
+        std::conditional_t<std::is_same_v<ALayout, Row>, I<8>, I<2>>;
+    using ABlockLdsAddExtraM = std::conditional_t<std::is_same_v<ALayout, Row>, I<1>, I<0>>;
+    // B-side block-transfer settings; the first three are selected by BLayout.
+    using BBlockTransferThreadClusterArrageOrder =
+        std::conditional_t<std::is_same_v<BLayout, Row>, S<0, 1, 3, 2>, S<0, 2, 1, 3>>;
+    using BBlockTransferSrcAccessOrder =
+        std::conditional_t<std::is_same_v<BLayout, Row>, S<0, 1, 3, 2>, S<0, 2, 1, 3>>;
+    using BBlockTransferSrcVectorDim = std::conditional_t<std::is_same_v<BLayout, Row>, I<2>, I<3>>;
+    using BBlockTransferDstScalarPerVector_K1 =
+        std::conditional_t<std::is_same_v<ALayout, Row>, I<2>, I<8>>; // NOTE(review): keyed on ALayout, not BLayout - confirm intended
+    using BBlockLdsAddExtraM = std::conditional_t<std::is_same_v<ALayout, Row>, I<0>, I<1>>; // NOTE(review): also keyed on ALayout - confirm
+    // Device op under test; every template argument below is positional.
+    using DeviceGroupedGemmSplitKInstance =
+        tensor_operation::device::DeviceGroupedGemmXdlSplitKCShuffle<
+            ALayout,
+            BLayout,
+            EmptyTuple,
+            ELayout,
+            F16,
+            F16,
+            F32,
+            F16,
+            EmptyTuple,
+            F16,
+            PassThrough,
+            PassThrough,
+            PassThrough,
+            GemmSpec,
+            1,
+            128,
+            128,
+            128,
+            KPerBlock,
+            K1,
+            K1,
+            32,
+            32,
+            4,
+            2,
+            S<1, 4, 32, 1>,
+            ABlockTransferThreadClusterArrageOrder,
+            ABlockTransferSrcAccessOrder,
+            ABlockTransferSrcVectorDim::value,
+            ABlockTransferSrcScalarPerVector,
+            ABlockTransferDstScalarPerVector_K1::value,
+            ABlockLdsAddExtraM::value,
+            S<1, 4, 32, 1>,
+            BBlockTransferThreadClusterArrageOrder,
+            BBlockTransferSrcAccessOrder,
+            BBlockTransferSrcVectorDim::value,
+            BBlockTransferSrcScalarPerVector,
+            BBlockTransferDstScalarPerVector_K1::value,
+            BBlockLdsAddExtraM::value,
+            1,
+            1,
+            S<1, 16, 1, 8>,
+            CDEBlockTransferScalarPerVector_NPerBlock>;
+    // Builds an argument with null data pointers and returns IsSupportedArgument() for it.
+    bool IsSupported(const std::vector<int>& Ms,
+                     const std::vector<int>& Ns,
+                     const std::vector<int>& Ks,
+                     const std::vector<int>& StrideAs,
+                     const std::vector<int>& StrideBs,
+                     const std::vector<int>& StrideCs,
+                     int kbatch = 1) const
+    {
+        std::size_t n_groups = Ms.size();
+        EXPECT_TRUE(Ns.size() == n_groups && Ks.size() == n_groups && StrideAs.size() == n_groups &&
+                    StrideBs.size() == n_groups && StrideCs.size() == n_groups)
+            << "The number of groups is not consistent!"; // non-fatal: the loop below still indexes every vector
+
+        std::vector<tensor_operation::device::GemmDesc> gemm_descs;
+
+        for(std::size_t i = 0; i < n_groups; ++i)
+        {
+            gemm_descs.push_back(tensor_operation::device::GemmDesc{
+                Ms[i], Ns[i], Ks[i], StrideAs[i], StrideBs[i], StrideCs[i], {}});
+        }
+
+        std::vector<const void*> p_As(n_groups, nullptr);
+        std::vector<const void*> p_Bs(n_groups, nullptr);
+        std::vector<void*> p_Cs(n_groups, nullptr);
+        auto p_Ds = std::vector<std::array<const void*, 0>>{};
+
+        auto ggemm_instance = DeviceGroupedGemmSplitKInstance{};
+        auto argument       = ggemm_instance.MakeArgument(
+            p_As, p_Bs, p_Ds, p_Cs, gemm_descs, PassThrough{}, PassThrough{}, PassThrough{});
+        if(kbatch > 1) // SetKBatchSize() is only called for kbatch > 1
+        {
+            ggemm_instance.SetKBatchSize(argument, kbatch);
+        }
+
+        return ggemm_instance.IsSupportedArgument(argument);
+    }
+    // Same argument setup as IsSupported(), then sets the workspace pointer and runs the invoker.
+    float Run(const std::vector<int>& Ms,
+              const std::vector<int>& Ns,
+              const std::vector<int>& Ks,
+              const std::vector<int>& StrideAs,
+              const std::vector<int>& StrideBs,
+              const std::vector<int>& StrideCs,
+              int kbatch = 1) const
+    {
+        std::size_t n_groups = Ms.size();
+        EXPECT_TRUE(Ns.size() == n_groups && Ks.size() == n_groups && StrideAs.size() == n_groups &&
+                    StrideBs.size() == n_groups && StrideCs.size() == n_groups)
+            << "The number of groups is not consistent!"; // non-fatal: the loop below still indexes every vector
+
+        std::vector<tensor_operation::device::GemmDesc> gemm_descs;
+
+        for(std::size_t i = 0; i < n_groups; ++i)
+        {
+            gemm_descs.push_back(tensor_operation::device::GemmDesc{
+                Ms[i], Ns[i], Ks[i], StrideAs[i], StrideBs[i], StrideCs[i], {}});
+        }
+
+        std::vector<const void*> p_As(n_groups, nullptr);
+        std::vector<const void*> p_Bs(n_groups, nullptr);
+        std::vector<void*> p_Cs(n_groups, nullptr);
+        auto p_Ds = std::vector<std::array<const void*, 0>>{};
+
+        auto ggemm_instance = DeviceGroupedGemmSplitKInstance{};
+        auto argument       = ggemm_instance.MakeArgument(
+            p_As, p_Bs, p_Ds, p_Cs, gemm_descs, PassThrough{}, PassThrough{}, PassThrough{});
+        if(kbatch > 1) // SetKBatchSize() is only called for kbatch > 1
+        {
+            ggemm_instance.SetKBatchSize(argument, kbatch);
+        }
+
+        EXPECT_TRUE(ggemm_instance.IsSupportedArgument(argument)); // non-fatal: execution continues either way
+        auto invoker = ggemm_instance.MakeInvoker();
+        DeviceMem gemm_desc_workspace(ggemm_instance.GetWorkSpaceSize(&argument));
+        ggemm_instance.SetWorkSpacePointer(&argument, gemm_desc_workspace.GetDeviceBuffer());
+        return invoker.Run(argument, StreamConfig{nullptr, false});
+    }
+};
+
+} // namespace test
+} // namespace ck
-- 
GitLab


From 1344a0f25b5da1b74267cbe3adf97b084c05708f Mon Sep 17 00:00:00 2001
From: Po Yen Chen <PoYen.Chen@amd.com>
Date: Tue, 30 May 2023 20:09:55 +0800
Subject: [PATCH 45/71] Simplify kernel argument of device operator
 DeviceGemm_Xdl_CShuffle<> (#696)

* Remove M/N/KPad local variables

* Use M/N/KPad to name padded lengths

* Replace duplicated local variable by parameters

* Rename variables M/N/KRaw to M/N/K

* Move AK0/BK0 compute logic into GridwiseGemm

* Use macro to shorten code

* Move CalculateGridSize() logic into GridwiseGemm

* Add comment to credit the implementation source

* Reuse the existing implementation

* Remove no-longer used data members

* Remove elementwise-op objects from interfaces

* Reserve kernel arg as whole object in interfaces

* Remove redundant data member

* Make 3rd type parameter optional

* Remove unnecessary type parameters

* Remove no-longer used descriptor-creation methods

* Move kernel arg type definition into GridwiseGemm

* Add macro to switch between code sections

* Move argument field computing logic into device op side

* Make utility method 'static'

* Declare special methods

* Unify MakeArgument() usage

* Adapt the new GridwiseGemm interface

* Push-down class 'GridwiseGemm::Argument' fields

* Remove no-longer used methods

* Add unused parameters

* Force copying parameters in 'Embed' ctor

* Remove no-longer used descriptors

* Fallback change on BaseArgument

* Remove macro 'INTEGER_DIVIDE_CEIL'

* Make variable naming more consistent

* Make sure methods are only invoked on right place

* Remove trailing underscore in public attribute name

* Remove necessary methods

* Hide computing logic of derived attributes

* Make new 'Embed' ctor only available for device code

* Make sure 'Embed' type args are not references

* Move check for karg.K into CheckValidity()

* Remove more integer division logic from device code

* Undo changes on Embed

* Separate 'Problem' concept out from 'Argument'

* Share same name for kernel interfaces

* Reject unsupported argument

---------

Co-authored-by: zjing14 <zhangjing14@gmail.com>
---
 .../impl/device_cgemm_4gemm_xdl_cshuffle.hpp  | 757 +++++-------------
 .../device/impl/device_gemm_xdl_cshuffle.hpp  | 488 +----------
 .../gpu/grid/block_to_ctile_map.hpp           |  68 +-
 .../grid/gridwise_gemm_xdl_cshuffle_v1.hpp    | 628 ++++++++++++---
 4 files changed, 802 insertions(+), 1139 deletions(-)

diff --git a/include/ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp
index 29978458b..0c9cce97f 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp
@@ -118,277 +118,11 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
         return PadDescriptor_M_1d(desc_m, gridSize, blockSize);
     }
 
-    static auto MakeAGridDescriptor_AK0_M_AK1(index_t MRaw, index_t KRaw, index_t StrideA)
-    {
-        const auto a_grid_desc_mraw_kraw = [&]() {
-            if constexpr(is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
-            {
-                return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
-                                                    make_tuple(StrideA, I1));
-            }
-            else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
-            {
-                return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
-                                                    make_tuple(I1, StrideA));
-            }
-        }();
-
-        const auto M = math::integer_divide_ceil(MRaw, MPerBlock) * MPerBlock;
-        const auto K = math::integer_divide_ceil(KRaw, KPerBlock) * KPerBlock;
-
-        const auto MPad = M - MRaw;
-        const auto KPad = K - KRaw;
-
-        if constexpr(GemmSpec == GemmSpecialization::MKPadding ||
-                     GemmSpec == GemmSpecialization::MNKPadding)
-        {
-            // pad both M and K
-            assert(K % AK1 == 0);
-
-            const auto AK0 = K / AK1;
-
-            const auto a_grid_desc_m_k =
-                transform_tensor_descriptor(a_grid_desc_mraw_kraw,
-                                            make_tuple(make_right_pad_transform(MRaw, MPad),
-                                                       make_right_pad_transform(KRaw, KPad)),
-                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-            const auto a_grid_desc_ak0_m_ak1 =
-                transform_tensor_descriptor(a_grid_desc_m_k,
-                                            make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
-                                                       make_pass_through_transform(M)),
-                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
-                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-            return a_grid_desc_ak0_m_ak1;
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::MPadding ||
-                          GemmSpec == GemmSpecialization::MNPadding)
-        {
-            // pad M, but not K
-            assert(KRaw % AK1 == 0);
-
-            const auto AK0 = KRaw / AK1;
-
-            const auto a_grid_desc_ak0_m_ak1 =
-                transform_tensor_descriptor(a_grid_desc_mraw_kraw,
-                                            make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
-                                                       make_right_pad_transform(MRaw, MPad)),
-                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
-                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-            return a_grid_desc_ak0_m_ak1;
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::KPadding ||
-                          GemmSpec == GemmSpecialization::NKPadding)
-        {
-            // pad K, but not M
-            assert(K % AK1 == 0);
-
-            const auto AK0 = K / AK1;
-
-            const auto a_grid_desc_m_k = transform_tensor_descriptor(
-                a_grid_desc_mraw_kraw,
-                make_tuple(make_pass_through_transform(MRaw), make_right_pad_transform(KRaw, KPad)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-            const auto a_grid_desc_ak0_m_ak1 =
-                transform_tensor_descriptor(a_grid_desc_m_k,
-                                            make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
-                                                       make_pass_through_transform(MRaw)),
-                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
-                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-            return a_grid_desc_ak0_m_ak1;
-        }
-        else
-        {
-            // not pad M or K
-            assert(KRaw % AK1 == 0);
-
-            const auto AK0 = KRaw / AK1;
-
-            const auto a_grid_desc_ak0_m_ak1 =
-                transform_tensor_descriptor(a_grid_desc_mraw_kraw,
-                                            make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
-                                                       make_pass_through_transform(MRaw)),
-                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
-                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-            return a_grid_desc_ak0_m_ak1;
-        }
-    }
-
-    static auto MakeBGridDescriptor_BK0_N_BK1(index_t KRaw, index_t NRaw, index_t StrideB)
-    {
-        const auto b_grid_desc_nraw_kraw = [&]() {
-            if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
-                                                    make_tuple(I1, StrideB));
-            }
-            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
-                                                    make_tuple(StrideB, I1));
-            }
-        }();
-
-        const auto N = math::integer_divide_ceil(NRaw, NPerBlock) * NPerBlock;
-        const auto K = math::integer_divide_ceil(KRaw, KPerBlock) * KPerBlock;
-
-        const auto NPad = N - NRaw;
-        const auto KPad = K - KRaw;
-
-        if constexpr(GemmSpec == GemmSpecialization::NKPadding ||
-                     GemmSpec == GemmSpecialization::MNKPadding)
-        {
-            // pad both N and K
-            assert(K % BK1 == 0);
-
-            const auto BK0 = K / BK1;
-
-            const auto b_grid_desc_n_k =
-                transform_tensor_descriptor(b_grid_desc_nraw_kraw,
-                                            make_tuple(make_right_pad_transform(NRaw, NPad),
-                                                       make_right_pad_transform(KRaw, KPad)),
-                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-            const auto b_grid_desc_bk0_n_bk1 =
-                transform_tensor_descriptor(b_grid_desc_n_k,
-                                            make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
-                                                       make_pass_through_transform(N)),
-                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
-                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-            return b_grid_desc_bk0_n_bk1;
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::NPadding ||
-                          GemmSpec == GemmSpecialization::MNPadding)
-        {
-            // pad N, but not K
-            assert(KRaw % BK1 == 0);
-
-            const auto BK0 = KRaw / BK1;
-
-            const auto b_grid_desc_bk0_n_bk1 =
-                transform_tensor_descriptor(b_grid_desc_nraw_kraw,
-                                            make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
-                                                       make_right_pad_transform(NRaw, NPad)),
-                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
-                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-            return b_grid_desc_bk0_n_bk1;
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::KPadding ||
-                          GemmSpec == GemmSpecialization::MKPadding)
-        {
-            // pad K, but not N
-            assert(K % BK1 == 0);
-
-            const auto BK0 = K / BK1;
-
-            const auto b_grid_desc_n_k = transform_tensor_descriptor(
-                b_grid_desc_nraw_kraw,
-                make_tuple(make_pass_through_transform(NRaw), make_right_pad_transform(KRaw, KPad)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-            const auto b_grid_desc_bk0_n_bk1 =
-                transform_tensor_descriptor(b_grid_desc_n_k,
-                                            make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
-                                                       make_pass_through_transform(NRaw)),
-                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
-                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-            return b_grid_desc_bk0_n_bk1;
-        }
-        else
-        {
-            // not pad N or K
-            assert(KRaw % BK1 == 0);
-
-            const auto BK0 = KRaw / BK1;
-
-            const auto b_grid_desc_bk0_n_bk1 =
-                transform_tensor_descriptor(b_grid_desc_nraw_kraw,
-                                            make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
-                                                       make_pass_through_transform(NRaw)),
-                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
-                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-            return b_grid_desc_bk0_n_bk1;
-        }
-    }
-
-    static auto MakeCGridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t StrideC)
-    {
-        const auto c_grid_desc_mraw_nraw = [&]() {
-            if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw),
-                                                    make_tuple(StrideC, I1));
-            }
-            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, CLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw),
-                                                    make_tuple(I1, StrideC));
-            }
-        }();
-
-        const auto M = math::integer_divide_ceil(MRaw, MPerBlock) * MPerBlock;
-        const auto N = math::integer_divide_ceil(NRaw, NPerBlock) * NPerBlock;
-
-        const auto MPad = M - MRaw;
-        const auto NPad = N - NRaw;
-
-        if constexpr(GemmSpec == GemmSpecialization::MNPadding ||
-                     GemmSpec == GemmSpecialization::MNKPadding)
-        {
-            // pad M and N
-            return transform_tensor_descriptor(c_grid_desc_mraw_nraw,
-                                               make_tuple(make_right_pad_transform(MRaw, MPad),
-                                                          make_right_pad_transform(NRaw, NPad)),
-                                               make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                               make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::MPadding ||
-                          GemmSpec == GemmSpecialization::MKPadding)
-        {
-            // pad M, but not N
-            return transform_tensor_descriptor(
-                c_grid_desc_mraw_nraw,
-                make_tuple(make_right_pad_transform(MRaw, MPad), make_pass_through_transform(NRaw)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::NPadding ||
-                          GemmSpec == GemmSpecialization::NKPadding)
-        {
-            // pad N, but not M
-            return transform_tensor_descriptor(
-                c_grid_desc_mraw_nraw,
-                make_tuple(make_pass_through_transform(MRaw), make_right_pad_transform(NRaw, NPad)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else
-        {
-            // not pad M or N
-            return c_grid_desc_mraw_nraw;
-        }
-    }
-
-    using AGridDesc_AK0_M_AK1 = decltype(MakeAGridDescriptor_AK0_M_AK1(1, 1, 1));
-    using BGridDesc_BK0_N_BK1 = decltype(MakeBGridDescriptor_BK0_N_BK1(1, 1, 1));
-    using CGridDesc_M_N       = decltype(MakeCGridDescriptor_M_N(1, 1, 1));
-    using CGridDesc_M         = decltype(MakeDescriptor_M({1, 1}, {1, 1}, 1, 1));
-
     // GridwiseGemm
     using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1<
+        ALayout,
+        BLayout,
+        CLayout,
         ADataType, // TODO: distinguish A/B datatype
         GemmAccDataType,
         CShuffleDataType,
@@ -396,10 +130,8 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
         AElementwiseOperation,
         BElementwiseOperation,
         CElementwiseOperation,
+        GemmSpec,
         InMemoryDataOperationEnum::Set,
-        AGridDesc_AK0_M_AK1,
-        BGridDesc_BK0_N_BK1,
-        CGridDesc_M_N,
         NumGemmKPrefetchStage,
         BlockSize,
         MPerBlock,
@@ -433,108 +165,82 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
         CShuffleBlockTransferScalarPerVector_NPerBlock,
         LoopSched>;
 
+    using CGridDesc_M = decltype(MakeDescriptor_M({1, 1}, {1, 1}, 1, 1));
+
     // Argument
-    struct Argument : public BaseArgument
+    struct Argument : public tensor_operation::device::BaseArgument, public GridwiseGemm::Problem
     {
-        Argument(const ADataType* p_a_grid_real,
-                 const ADataType* p_a_grid_imag,
-                 const BDataType* p_b_grid_real,
-                 const BDataType* p_b_grid_imag,
-                 CDataType* p_c_grid_real,
-                 CDataType* p_c_grid_imag,
+        using Problem = typename GridwiseGemm::Problem;
+
+        Argument(const ADataType* p_a_grid_real_,
+                 const ADataType* p_a_grid_imag_,
+                 const BDataType* p_b_grid_real_,
+                 const BDataType* p_b_grid_imag_,
+                 CDataType* p_c_grid_real_,
+                 CDataType* p_c_grid_imag_,
                  CDataType* p_workspace,
-                 index_t MRaw,
-                 index_t NRaw,
-                 index_t KRaw,
-                 index_t StrideA,
-                 index_t StrideB,
-                 index_t StrideC,
-                 AElementwiseOperation a_element_op,
-                 BElementwiseOperation b_element_op,
-                 CElementwiseOperation c_element_op)
-            : p_a_grid_real_{p_a_grid_real},
-              p_a_grid_imag_{p_a_grid_imag},
-              p_b_grid_real_{p_b_grid_real},
-              p_b_grid_imag_{p_b_grid_imag},
-              p_c_grid_real_{p_c_grid_real},
-              p_c_grid_imag_{p_c_grid_imag},
-              p_aux_grid_{p_workspace},
-              a_grid_desc_ak0_m_ak1_{DeviceOp::MakeAGridDescriptor_AK0_M_AK1(MRaw, KRaw, StrideA)},
-              b_grid_desc_bk0_n_bk1_{DeviceOp::MakeBGridDescriptor_BK0_N_BK1(KRaw, NRaw, StrideB)},
-              c_grid_desc_m_n_{DeviceOp::MakeCGridDescriptor_M_N(MRaw, NRaw, StrideC)},
-              c_grid_desc_mblock_mperblock_nblock_nperblock_{},
-              block_2_ctile_map_{GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_)},
-              a_element_op_{a_element_op},
-              b_element_op_{b_element_op},
-              c_element_op_{c_element_op}
+                 index_t M_,
+                 index_t N_,
+                 index_t K_,
+                 index_t StrideA_,
+                 index_t StrideB_,
+                 index_t StrideC_)
+            : Problem{M_, N_, K_, StrideA_, StrideB_, StrideC_},
+              p_a_grid_real{p_a_grid_real_},
+              p_a_grid_imag{p_a_grid_imag_},
+              p_b_grid_real{p_b_grid_real_},
+              p_b_grid_imag{p_b_grid_imag_},
+              p_c_grid_real{p_c_grid_real_},
+              p_c_grid_imag{p_c_grid_imag_},
+              p_aux_grid{p_workspace}
         {
-            if(GridwiseGemm::CheckValidity(a_grid_desc_ak0_m_ak1_,
-                                           b_grid_desc_bk0_n_bk1_,
-                                           c_grid_desc_m_n_,
-                                           block_2_ctile_map_))
-            {
-                c_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                    GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                        c_grid_desc_m_n_);
-            }
-
-            const index_t grid_size = block_2_ctile_map_.CalculateGridSize(c_grid_desc_m_n_);
+            const index_t grid_size = std::get<1>(GridwiseGemm::CalculateGridSize(M_, N_));
 
             if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
             {
-                c_grid_desc_m_ =
-                    DeviceOp::MakeDescriptor_M({MRaw, NRaw}, {StrideC, I1}, grid_size, BlockSize);
+                c_grid_desc_m =
+                    DeviceOp::MakeDescriptor_M({M_, N_}, {StrideC_, I1}, grid_size, BlockSize);
             }
             else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, CLayout>::value)
             {
-                c_grid_desc_m_ =
-                    DeviceOp::MakeDescriptor_M({MRaw, NRaw}, {I1, StrideC}, grid_size, BlockSize);
+                c_grid_desc_m =
+                    DeviceOp::MakeDescriptor_M({M_, N_}, {I1, StrideC_}, grid_size, BlockSize);
             }
 
-            p_aux_2_grid_ = p_workspace + c_grid_desc_m_n_.GetElementSpaceSize();
+            p_aux_2_grid = p_workspace + GetCElementSpaceSize(M_, N_, StrideC_);
         }
 
         //  private:
-        const ADataType* p_a_grid_real_;
-        const ADataType* p_a_grid_imag_;
-        const BDataType* p_b_grid_real_;
-        const BDataType* p_b_grid_imag_;
-        CDataType* p_c_grid_real_;
-        CDataType* p_c_grid_imag_;
-        CDataType* p_aux_grid_;
-        CDataType* p_aux_2_grid_;
-        AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_;
-        BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_;
-        CGridDesc_M_N c_grid_desc_m_n_;
-        CGridDesc_M c_grid_desc_m_;
-        typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-            c_grid_desc_mblock_mperblock_nblock_nperblock_;
-        typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_;
-        AElementwiseOperation a_element_op_;
-        BElementwiseOperation b_element_op_;
-        CElementwiseOperation c_element_op_;
+        const ADataType* p_a_grid_real;
+        const ADataType* p_a_grid_imag;
+        const BDataType* p_b_grid_real;
+        const BDataType* p_b_grid_imag;
+        CDataType* p_c_grid_real;
+        CDataType* p_c_grid_imag;
+        CDataType* p_aux_grid;
+        CDataType* p_aux_2_grid;
+        CGridDesc_M c_grid_desc_m;
     };
 
     // Invoker
     struct Invoker : public BaseInvoker
     {
-        using Argument = DeviceOp::Argument;
-
         float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
-            if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
-                                            arg.b_grid_desc_bk0_n_bk1_,
-                                            arg.c_grid_desc_m_n_,
-                                            arg.block_2_ctile_map_))
+            if(stream_config.log_level_ > 0)
+            {
+                arg.Print();
+            }
+
+            if(!GridwiseGemm::CheckValidity(arg))
             {
                 throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
             }
 
-            const index_t grid_size =
-                arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_);
+            index_t gdx, gdy, gdz;
+            std::tie(gdx, gdy, gdz) = GridwiseGemm::CalculateGridSize(arg.M, arg.N);
 
-            const auto K =
-                arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2);
+            const auto K = GridwiseGemm::CalculateAK0(arg.K) * AK1;
 
             float ave_time = 0;
 
@@ -578,224 +284,148 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
 
             if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
             {
-                const auto kernel = kernel_gemm_xdl_cshuffle_v1<
-                    GridwiseGemm,
-                    ADataType, // TODO: distiguish A/B datatype
-                    CDataType,
-                    AElementwiseOperation,
-                    BElementwiseOperation,
-                    CElementwiseOperation,
-                    DeviceOp::AGridDesc_AK0_M_AK1,
-                    DeviceOp::BGridDesc_BK0_N_BK1,
-                    typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
-                    typename GridwiseGemm::DefaultBlock2CTileMap,
-                    true>;
-
-                ave_time +=
-                    launch_and_time_kernel(stream_config,
-                                           kernel,
-                                           dim3(grid_size),
-                                           dim3(BlockSize),
-                                           0,
-                                           arg.p_a_grid_real_,
-                                           arg.p_b_grid_real_,
-                                           arg.p_aux_grid_,
-                                           arg.a_element_op_,
-                                           arg.b_element_op_,
-                                           arg.c_element_op_,
-                                           arg.a_grid_desc_ak0_m_ak1_,
-                                           arg.b_grid_desc_bk0_n_bk1_,
-                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                           arg.block_2_ctile_map_);
-
-                ave_time +=
-                    launch_and_time_kernel(stream_config,
-                                           kernel,
-                                           dim3(grid_size),
-                                           dim3(BlockSize),
-                                           0,
-                                           arg.p_a_grid_imag_,
-                                           arg.p_b_grid_imag_,
-                                           arg.p_aux_2_grid_,
-                                           arg.a_element_op_,
-                                           arg.b_element_op_,
-                                           arg.c_element_op_,
-                                           arg.a_grid_desc_ak0_m_ak1_,
-                                           arg.b_grid_desc_bk0_n_bk1_,
-                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                           arg.block_2_ctile_map_);
+                const auto kernel =
+                    kernel_gemm_xdl_cshuffle_v1<GridwiseGemm, ADataType, CDataType, true>;
+
+                ave_time += launch_and_time_kernel(stream_config,
+                                                   kernel,
+                                                   dim3(gdx, gdy, gdz),
+                                                   dim3(BlockSize),
+                                                   0,
+                                                   arg.p_a_grid_real,
+                                                   arg.p_b_grid_real,
+                                                   arg.p_aux_grid,
+                                                   arg);
+
+                ave_time += launch_and_time_kernel(stream_config,
+                                                   kernel,
+                                                   dim3(gdx, gdy, gdz),
+                                                   dim3(BlockSize),
+                                                   0,
+                                                   arg.p_a_grid_imag,
+                                                   arg.p_b_grid_imag,
+                                                   arg.p_aux_2_grid,
+                                                   arg);
 
                 // c_real = aux - aux_2
                 ave_time += launch_and_time_kernel(
                     stream_config,
                     subtract_kernel,
-                    dim3(grid_size),
+                    dim3(gdx, gdy, gdz),
                     dim3(BlockSize),
                     0,
-                    make_tuple(arg.c_grid_desc_m_, arg.c_grid_desc_m_),
-                    make_tuple(arg.c_grid_desc_m_),
-                    make_tuple(const_cast<const CDataType*>(arg.p_aux_grid_),
-                               const_cast<const CDataType*>(arg.p_aux_2_grid_)),
-                    make_tuple(arg.p_c_grid_real_),
+                    make_tuple(arg.c_grid_desc_m, arg.c_grid_desc_m),
+                    make_tuple(arg.c_grid_desc_m),
+                    make_tuple(const_cast<const CDataType*>(arg.p_aux_grid),
+                               const_cast<const CDataType*>(arg.p_aux_2_grid)),
+                    make_tuple(arg.p_c_grid_real),
                     Subtract{});
 
-                ave_time +=
-                    launch_and_time_kernel(stream_config,
-                                           kernel,
-                                           dim3(grid_size),
-                                           dim3(BlockSize),
-                                           0,
-                                           arg.p_a_grid_real_,
-                                           arg.p_b_grid_imag_,
-                                           arg.p_aux_grid_,
-                                           arg.a_element_op_,
-                                           arg.b_element_op_,
-                                           arg.c_element_op_,
-                                           arg.a_grid_desc_ak0_m_ak1_,
-                                           arg.b_grid_desc_bk0_n_bk1_,
-                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                           arg.block_2_ctile_map_);
-
-                ave_time +=
-                    launch_and_time_kernel(stream_config,
-                                           kernel,
-                                           dim3(grid_size),
-                                           dim3(BlockSize),
-                                           0,
-                                           arg.p_a_grid_imag_,
-                                           arg.p_b_grid_real_,
-                                           arg.p_aux_2_grid_,
-                                           arg.a_element_op_,
-                                           arg.b_element_op_,
-                                           arg.c_element_op_,
-                                           arg.a_grid_desc_ak0_m_ak1_,
-                                           arg.b_grid_desc_bk0_n_bk1_,
-                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                           arg.block_2_ctile_map_);
+                ave_time += launch_and_time_kernel(stream_config,
+                                                   kernel,
+                                                   dim3(gdx, gdy, gdz),
+                                                   dim3(BlockSize),
+                                                   0,
+                                                   arg.p_a_grid_real,
+                                                   arg.p_b_grid_imag,
+                                                   arg.p_aux_grid,
+                                                   arg);
+
+                ave_time += launch_and_time_kernel(stream_config,
+                                                   kernel,
+                                                   dim3(gdx, gdy, gdz),
+                                                   dim3(BlockSize),
+                                                   0,
+                                                   arg.p_a_grid_imag,
+                                                   arg.p_b_grid_real,
+                                                   arg.p_aux_2_grid,
+                                                   arg);
 
                 // c_imag = aux + aux_2
                 ave_time += launch_and_time_kernel(
                     stream_config,
                     add_kernel,
-                    dim3(grid_size),
+                    dim3(gdx, gdy, gdz),
                     dim3(BlockSize),
                     0,
-                    make_tuple(arg.c_grid_desc_m_, arg.c_grid_desc_m_),
-                    make_tuple(arg.c_grid_desc_m_),
-                    make_tuple(const_cast<const CDataType*>(arg.p_aux_grid_),
-                               const_cast<const CDataType*>(arg.p_aux_2_grid_)),
-                    make_tuple(arg.p_c_grid_imag_),
+                    make_tuple(arg.c_grid_desc_m, arg.c_grid_desc_m),
+                    make_tuple(arg.c_grid_desc_m),
+                    make_tuple(const_cast<const CDataType*>(arg.p_aux_grid),
+                               const_cast<const CDataType*>(arg.p_aux_2_grid)),
+                    make_tuple(arg.p_c_grid_imag),
                     Add{});
             }
             else
             {
-                const auto kernel = kernel_gemm_xdl_cshuffle_v1<
-                    GridwiseGemm,
-                    ADataType, // TODO: distiguish A/B datatype
-                    CDataType,
-                    AElementwiseOperation,
-                    BElementwiseOperation,
-                    CElementwiseOperation,
-                    DeviceOp::AGridDesc_AK0_M_AK1,
-                    DeviceOp::BGridDesc_BK0_N_BK1,
-                    typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
-                    typename GridwiseGemm::DefaultBlock2CTileMap,
-                    false>;
-
-                ave_time +=
-                    launch_and_time_kernel(stream_config,
-                                           kernel,
-                                           dim3(grid_size),
-                                           dim3(BlockSize),
-                                           0,
-                                           arg.p_a_grid_real_,
-                                           arg.p_b_grid_real_,
-                                           arg.p_aux_grid_,
-                                           arg.a_element_op_,
-                                           arg.b_element_op_,
-                                           arg.c_element_op_,
-                                           arg.a_grid_desc_ak0_m_ak1_,
-                                           arg.b_grid_desc_bk0_n_bk1_,
-                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                           arg.block_2_ctile_map_);
-
-                ave_time +=
-                    launch_and_time_kernel(stream_config,
-                                           kernel,
-                                           dim3(grid_size),
-                                           dim3(BlockSize),
-                                           0,
-                                           arg.p_a_grid_imag_,
-                                           arg.p_b_grid_imag_,
-                                           arg.p_aux_2_grid_,
-                                           arg.a_element_op_,
-                                           arg.b_element_op_,
-                                           arg.c_element_op_,
-                                           arg.a_grid_desc_ak0_m_ak1_,
-                                           arg.b_grid_desc_bk0_n_bk1_,
-                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                           arg.block_2_ctile_map_);
+                const auto kernel =
+                    kernel_gemm_xdl_cshuffle_v1<GridwiseGemm, ADataType, CDataType, false>;
+
+                ave_time += launch_and_time_kernel(stream_config,
+                                                   kernel,
+                                                   dim3(gdx, gdy, gdz),
+                                                   dim3(BlockSize),
+                                                   0,
+                                                   arg.p_a_grid_real,
+                                                   arg.p_b_grid_real,
+                                                   arg.p_aux_grid,
+                                                   arg);
+
+                ave_time += launch_and_time_kernel(stream_config,
+                                                   kernel,
+                                                   dim3(gdx, gdy, gdz),
+                                                   dim3(BlockSize),
+                                                   0,
+                                                   arg.p_a_grid_imag,
+                                                   arg.p_b_grid_imag,
+                                                   arg.p_aux_2_grid,
+                                                   arg);
 
                 // c_real = aux - aux_2
                 ave_time += launch_and_time_kernel(
                     stream_config,
                     subtract_kernel,
-                    dim3(grid_size),
+                    dim3(gdx, gdy, gdz),
                     dim3(BlockSize),
                     0,
-                    make_tuple(arg.c_grid_desc_m_, arg.c_grid_desc_m_),
-                    make_tuple(arg.c_grid_desc_m_),
-                    make_tuple(const_cast<const CDataType*>(arg.p_aux_grid_),
-                               const_cast<const CDataType*>(arg.p_aux_2_grid_)),
-                    make_tuple(arg.p_c_grid_real_),
+                    make_tuple(arg.c_grid_desc_m, arg.c_grid_desc_m),
+                    make_tuple(arg.c_grid_desc_m),
+                    make_tuple(const_cast<const CDataType*>(arg.p_aux_grid),
+                               const_cast<const CDataType*>(arg.p_aux_2_grid)),
+                    make_tuple(arg.p_c_grid_real),
                     Subtract{});
 
-                ave_time +=
-                    launch_and_time_kernel(stream_config,
-                                           kernel,
-                                           dim3(grid_size),
-                                           dim3(BlockSize),
-                                           0,
-                                           arg.p_a_grid_real_,
-                                           arg.p_b_grid_imag_,
-                                           arg.p_aux_grid_,
-                                           arg.a_element_op_,
-                                           arg.b_element_op_,
-                                           arg.c_element_op_,
-                                           arg.a_grid_desc_ak0_m_ak1_,
-                                           arg.b_grid_desc_bk0_n_bk1_,
-                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                           arg.block_2_ctile_map_);
-
-                ave_time +=
-                    launch_and_time_kernel(stream_config,
-                                           kernel,
-                                           dim3(grid_size),
-                                           dim3(BlockSize),
-                                           0,
-                                           arg.p_a_grid_imag_,
-                                           arg.p_b_grid_real_,
-                                           arg.p_aux_2_grid_,
-                                           arg.a_element_op_,
-                                           arg.b_element_op_,
-                                           arg.c_element_op_,
-                                           arg.a_grid_desc_ak0_m_ak1_,
-                                           arg.b_grid_desc_bk0_n_bk1_,
-                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                           arg.block_2_ctile_map_);
+                ave_time += launch_and_time_kernel(stream_config,
+                                                   kernel,
+                                                   dim3(gdx, gdy, gdz),
+                                                   dim3(BlockSize),
+                                                   0,
+                                                   arg.p_a_grid_real,
+                                                   arg.p_b_grid_imag,
+                                                   arg.p_aux_grid,
+                                                   arg);
+
+                ave_time += launch_and_time_kernel(stream_config,
+                                                   kernel,
+                                                   dim3(gdx, gdy, gdz),
+                                                   dim3(BlockSize),
+                                                   0,
+                                                   arg.p_a_grid_imag,
+                                                   arg.p_b_grid_real,
+                                                   arg.p_aux_2_grid,
+                                                   arg);
 
                 // c_imag = aux + aux_2
                 ave_time += launch_and_time_kernel(
                     stream_config,
                     add_kernel,
-                    dim3(grid_size),
+                    dim3(gdx, gdy, gdz),
                     dim3(BlockSize),
                     0,
-                    make_tuple(arg.c_grid_desc_m_, arg.c_grid_desc_m_),
-                    make_tuple(arg.c_grid_desc_m_),
-                    make_tuple(const_cast<const CDataType*>(arg.p_aux_grid_),
-                               const_cast<const CDataType*>(arg.p_aux_2_grid_)),
-                    make_tuple(arg.p_c_grid_imag_),
+                    make_tuple(arg.c_grid_desc_m, arg.c_grid_desc_m),
+                    make_tuple(arg.c_grid_desc_m),
+                    make_tuple(const_cast<const CDataType*>(arg.p_aux_grid),
+                               const_cast<const CDataType*>(arg.p_aux_2_grid)),
+                    make_tuple(arg.p_c_grid_imag),
                     Add{});
             }
 
@@ -818,10 +448,7 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        return GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
-                                           arg.b_grid_desc_bk0_n_bk1_,
-                                           arg.c_grid_desc_m_n_,
-                                           arg.block_2_ctile_map_);
+        return GridwiseGemm::CheckValidity(arg);
     }
 
     // polymorphic
@@ -837,15 +464,15 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
                              CDataType* p_c_real,
                              CDataType* p_c_imag,
                              CDataType* p_workspace,
-                             index_t MRaw,
-                             index_t NRaw,
-                             index_t KRaw,
+                             index_t M,
+                             index_t N,
+                             index_t K,
                              index_t StrideA,
                              index_t StrideB,
                              index_t StrideC,
-                             AElementwiseOperation a_element_op,
-                             BElementwiseOperation b_element_op,
-                             CElementwiseOperation c_element_op)
+                             AElementwiseOperation,
+                             BElementwiseOperation,
+                             CElementwiseOperation)
     {
         return Argument{p_a_real,
                         p_a_imag,
@@ -854,15 +481,12 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
                         p_c_real,
                         p_c_imag,
                         p_workspace,
-                        MRaw,
-                        NRaw,
-                        KRaw,
+                        M,
+                        N,
+                        K,
                         StrideA,
                         StrideB,
-                        StrideC,
-                        a_element_op,
-                        b_element_op,
-                        c_element_op};
+                        StrideC};
     }
 
     static auto MakeInvoker() { return Invoker{}; }
@@ -875,15 +499,15 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
                                                       void* p_c_real,
                                                       void* p_c_imag,
                                                       void* p_workspace,
-                                                      index_t MRaw,
-                                                      index_t NRaw,
-                                                      index_t KRaw,
+                                                      index_t M,
+                                                      index_t N,
+                                                      index_t K,
                                                       index_t StrideA,
                                                       index_t StrideB,
                                                       index_t StrideC,
-                                                      AElementwiseOperation a_element_op,
-                                                      BElementwiseOperation b_element_op,
-                                                      CElementwiseOperation c_element_op,
+                                                      AElementwiseOperation,
+                                                      BElementwiseOperation,
+                                                      CElementwiseOperation,
                                                       index_t /* KBatch */ = 1) override
     {
         return std::make_unique<Argument>(static_cast<const ADataType*>(p_a_real),
@@ -893,15 +517,12 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
                                           static_cast<CDataType*>(p_c_real),
                                           static_cast<CDataType*>(p_c_imag),
                                           static_cast<CDataType*>(p_workspace),
-                                          MRaw,
-                                          NRaw,
-                                          KRaw,
+                                          M,
+                                          N,
+                                          K,
                                           StrideA,
                                           StrideB,
-                                          StrideC,
-                                          a_element_op,
-                                          b_element_op,
-                                          c_element_op);
+                                          StrideC);
     }
 
     // polymorphic
@@ -930,16 +551,22 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
         return str.str();
     }
 
-    std::size_t GetWorkspaceSize(index_t MRaw,
-                                 index_t NRaw,
-                                 [[maybe_unused]] index_t KRaw,
+    static std::size_t GetCElementSpaceSize(index_t M, index_t N, index_t StrideC)
+    {
+        const auto c_grid_desc_m_n = GridwiseGemm::MakeCGridDescriptor_M_N(
+            M, GridwiseGemm::CalculateMPadded(M), N, GridwiseGemm::CalculateNPadded(N), StrideC);
+
+        return c_grid_desc_m_n.GetElementSpaceSize();
+    }
+
+    std::size_t GetWorkspaceSize(index_t M,
+                                 index_t N,
+                                 [[maybe_unused]] index_t K,
                                  [[maybe_unused]] index_t StrideA,
                                  [[maybe_unused]] index_t StrideB,
                                  index_t StrideC) override
     {
-        const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N(MRaw, NRaw, StrideC);
-
-        return 2 * sizeof(CDataType) * c_grid_desc_m_n.GetElementSpaceSize();
+        return 2 * sizeof(CDataType) * GetCElementSpaceSize(M, N, StrideC);
     }
 };
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp
index 7cd0ff72e..13a30911a 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp
@@ -82,276 +82,11 @@ struct DeviceGemm_Xdl_CShuffle : public DeviceGemm<ALayout,
     static constexpr auto I1 = Number<1>{};
     static constexpr auto I2 = Number<2>{};
 
-    static auto MakeAGridDescriptor_AK0_M_AK1(index_t MRaw, index_t KRaw, index_t StrideA)
-    {
-        const auto a_grid_desc_mraw_kraw = [&]() {
-            if constexpr(is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
-            {
-                return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
-                                                    make_tuple(StrideA, I1));
-            }
-            else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
-            {
-                return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
-                                                    make_tuple(I1, StrideA));
-            }
-        }();
-
-        const auto M = math::integer_divide_ceil(MRaw, MPerBlock) * MPerBlock;
-        const auto K = math::integer_divide_ceil(KRaw, KPerBlock) * KPerBlock;
-
-        const auto MPad = M - MRaw;
-        const auto KPad = K - KRaw;
-
-        if constexpr(GemmSpec == GemmSpecialization::MKPadding ||
-                     GemmSpec == GemmSpecialization::MNKPadding)
-        {
-            // pad both M and K
-            assert(K % AK1 == 0);
-
-            const auto AK0 = K / AK1;
-
-            const auto a_grid_desc_m_k =
-                transform_tensor_descriptor(a_grid_desc_mraw_kraw,
-                                            make_tuple(make_right_pad_transform(MRaw, MPad),
-                                                       make_right_pad_transform(KRaw, KPad)),
-                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-            const auto a_grid_desc_ak0_m_ak1 =
-                transform_tensor_descriptor(a_grid_desc_m_k,
-                                            make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
-                                                       make_pass_through_transform(M)),
-                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
-                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-            return a_grid_desc_ak0_m_ak1;
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::MPadding ||
-                          GemmSpec == GemmSpecialization::MNPadding)
-        {
-            // pad M, but not K
-            assert(KRaw % AK1 == 0);
-
-            const auto AK0 = KRaw / AK1;
-
-            const auto a_grid_desc_ak0_m_ak1 =
-                transform_tensor_descriptor(a_grid_desc_mraw_kraw,
-                                            make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
-                                                       make_right_pad_transform(MRaw, MPad)),
-                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
-                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-            return a_grid_desc_ak0_m_ak1;
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::KPadding ||
-                          GemmSpec == GemmSpecialization::NKPadding)
-        {
-            // pad K, but not M
-            assert(K % AK1 == 0);
-
-            const auto AK0 = K / AK1;
-
-            const auto a_grid_desc_m_k = transform_tensor_descriptor(
-                a_grid_desc_mraw_kraw,
-                make_tuple(make_pass_through_transform(MRaw), make_right_pad_transform(KRaw, KPad)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-            const auto a_grid_desc_ak0_m_ak1 =
-                transform_tensor_descriptor(a_grid_desc_m_k,
-                                            make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
-                                                       make_pass_through_transform(MRaw)),
-                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
-                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-            return a_grid_desc_ak0_m_ak1;
-        }
-        else
-        {
-            // not pad M or K
-            assert(KRaw % AK1 == 0);
-
-            const auto AK0 = KRaw / AK1;
-
-            const auto a_grid_desc_ak0_m_ak1 =
-                transform_tensor_descriptor(a_grid_desc_mraw_kraw,
-                                            make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
-                                                       make_pass_through_transform(MRaw)),
-                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
-                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-            return a_grid_desc_ak0_m_ak1;
-        }
-    }
-
-    static auto MakeBGridDescriptor_BK0_N_BK1(index_t KRaw, index_t NRaw, index_t StrideB)
-    {
-        const auto b_grid_desc_nraw_kraw = [&]() {
-            if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
-                                                    make_tuple(I1, StrideB));
-            }
-            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
-                                                    make_tuple(StrideB, I1));
-            }
-        }();
-
-        const auto N = math::integer_divide_ceil(NRaw, NPerBlock) * NPerBlock;
-        const auto K = math::integer_divide_ceil(KRaw, KPerBlock) * KPerBlock;
-
-        const auto NPad = N - NRaw;
-        const auto KPad = K - KRaw;
-
-        if constexpr(GemmSpec == GemmSpecialization::NKPadding ||
-                     GemmSpec == GemmSpecialization::MNKPadding)
-        {
-            // pad both N and K
-            assert(K % BK1 == 0);
-
-            const auto BK0 = K / BK1;
-
-            const auto b_grid_desc_n_k =
-                transform_tensor_descriptor(b_grid_desc_nraw_kraw,
-                                            make_tuple(make_right_pad_transform(NRaw, NPad),
-                                                       make_right_pad_transform(KRaw, KPad)),
-                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-            const auto b_grid_desc_bk0_n_bk1 =
-                transform_tensor_descriptor(b_grid_desc_n_k,
-                                            make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
-                                                       make_pass_through_transform(N)),
-                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
-                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-            return b_grid_desc_bk0_n_bk1;
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::NPadding ||
-                          GemmSpec == GemmSpecialization::MNPadding)
-        {
-            // pad N, but not K
-            assert(KRaw % BK1 == 0);
-
-            const auto BK0 = KRaw / BK1;
-
-            const auto b_grid_desc_bk0_n_bk1 =
-                transform_tensor_descriptor(b_grid_desc_nraw_kraw,
-                                            make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
-                                                       make_right_pad_transform(NRaw, NPad)),
-                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
-                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-            return b_grid_desc_bk0_n_bk1;
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::KPadding ||
-                          GemmSpec == GemmSpecialization::MKPadding)
-        {
-            // pad K, but not N
-            assert(K % BK1 == 0);
-
-            const auto BK0 = K / BK1;
-
-            const auto b_grid_desc_n_k = transform_tensor_descriptor(
-                b_grid_desc_nraw_kraw,
-                make_tuple(make_pass_through_transform(NRaw), make_right_pad_transform(KRaw, KPad)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-            const auto b_grid_desc_bk0_n_bk1 =
-                transform_tensor_descriptor(b_grid_desc_n_k,
-                                            make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
-                                                       make_pass_through_transform(NRaw)),
-                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
-                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-            return b_grid_desc_bk0_n_bk1;
-        }
-        else
-        {
-            // not pad N or K
-            assert(KRaw % BK1 == 0);
-
-            const auto BK0 = KRaw / BK1;
-
-            const auto b_grid_desc_bk0_n_bk1 =
-                transform_tensor_descriptor(b_grid_desc_nraw_kraw,
-                                            make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
-                                                       make_pass_through_transform(NRaw)),
-                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
-                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-            return b_grid_desc_bk0_n_bk1;
-        }
-    }
-
-    static auto MakeCGridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t StrideC)
-    {
-        const auto c_grid_desc_mraw_nraw = [&]() {
-            if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw),
-                                                    make_tuple(StrideC, I1));
-            }
-            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, CLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw),
-                                                    make_tuple(I1, StrideC));
-            }
-        }();
-
-        const auto M = math::integer_divide_ceil(MRaw, MPerBlock) * MPerBlock;
-        const auto N = math::integer_divide_ceil(NRaw, NPerBlock) * NPerBlock;
-
-        const auto MPad = M - MRaw;
-        const auto NPad = N - NRaw;
-
-        if constexpr(GemmSpec == GemmSpecialization::MNPadding ||
-                     GemmSpec == GemmSpecialization::MNKPadding)
-        {
-            // pad M and N
-            return transform_tensor_descriptor(c_grid_desc_mraw_nraw,
-                                               make_tuple(make_right_pad_transform(MRaw, MPad),
-                                                          make_right_pad_transform(NRaw, NPad)),
-                                               make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                               make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::MPadding ||
-                          GemmSpec == GemmSpecialization::MKPadding)
-        {
-            // pad M, but not N
-            return transform_tensor_descriptor(
-                c_grid_desc_mraw_nraw,
-                make_tuple(make_right_pad_transform(MRaw, MPad), make_pass_through_transform(NRaw)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::NPadding ||
-                          GemmSpec == GemmSpecialization::NKPadding)
-        {
-            // pad N, but not M
-            return transform_tensor_descriptor(
-                c_grid_desc_mraw_nraw,
-                make_tuple(make_pass_through_transform(MRaw), make_right_pad_transform(NRaw, NPad)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else
-        {
-            // not pad M or N
-            return c_grid_desc_mraw_nraw;
-        }
-    }
-
-    using AGridDesc_AK0_M_AK1 = decltype(MakeAGridDescriptor_AK0_M_AK1(1, 1, 1));
-    using BGridDesc_BK0_N_BK1 = decltype(MakeBGridDescriptor_BK0_N_BK1(1, 1, 1));
-    using CGridDesc_M_N       = decltype(MakeCGridDescriptor_M_N(1, 1, 1));
-
     // GridwiseGemm
     using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1<
+        ALayout,
+        BLayout,
+        CLayout,
         ADataType, // TODO: distinguish A/B datatype
         GemmAccDataType,
         CShuffleDataType,
@@ -359,10 +94,8 @@ struct DeviceGemm_Xdl_CShuffle : public DeviceGemm<ALayout,
         AElementwiseOperation,
         BElementwiseOperation,
         CElementwiseOperation,
+        GemmSpec,
         InMemoryDataOperationEnum::Set,
-        AGridDesc_AK0_M_AK1,
-        BGridDesc_BK0_N_BK1,
-        CGridDesc_M_N,
         NumGemmKPrefetchStage,
         BlockSize,
         MPerBlock,
@@ -397,162 +130,43 @@ struct DeviceGemm_Xdl_CShuffle : public DeviceGemm<ALayout,
         LoopSched,
         PipelineVer>;
 
-    // Argument
-    struct Argument : public BaseArgument
-    {
-        Argument(const ADataType* p_a_grid,
-                 const BDataType* p_b_grid,
-                 CDataType* p_c_grid,
-                 index_t MRaw,
-                 index_t NRaw,
-                 index_t KRaw,
-                 index_t StrideA,
-                 index_t StrideB,
-                 index_t StrideC,
-                 AElementwiseOperation a_element_op,
-                 BElementwiseOperation b_element_op,
-                 CElementwiseOperation c_element_op)
-            : p_a_grid_{p_a_grid},
-              p_b_grid_{p_b_grid},
-              p_c_grid_{p_c_grid},
-              a_grid_desc_ak0_m_ak1_{DeviceOp::MakeAGridDescriptor_AK0_M_AK1(MRaw, KRaw, StrideA)},
-              b_grid_desc_bk0_n_bk1_{DeviceOp::MakeBGridDescriptor_BK0_N_BK1(KRaw, NRaw, StrideB)},
-              c_grid_desc_m_n_{DeviceOp::MakeCGridDescriptor_M_N(MRaw, NRaw, StrideC)},
-              c_grid_desc_mblock_mperblock_nblock_nperblock_{},
-              block_2_ctile_map_{GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_)},
-              a_element_op_{a_element_op},
-              b_element_op_{b_element_op},
-              c_element_op_{c_element_op},
-              kraw_{KRaw}
-        {
-            if(GridwiseGemm::CheckValidity(a_grid_desc_ak0_m_ak1_,
-                                           b_grid_desc_bk0_n_bk1_,
-                                           c_grid_desc_m_n_,
-                                           block_2_ctile_map_))
-            {
-                c_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                    GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                        c_grid_desc_m_n_);
-            }
-        }
-
-        //  private:
-        const ADataType* p_a_grid_;
-        const BDataType* p_b_grid_;
-        CDataType* p_c_grid_;
-        AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_;
-        BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_;
-        CGridDesc_M_N c_grid_desc_m_n_;
-        typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-            c_grid_desc_mblock_mperblock_nblock_nperblock_;
-        typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_;
-        AElementwiseOperation a_element_op_;
-        BElementwiseOperation b_element_op_;
-        CElementwiseOperation c_element_op_;
-        index_t kraw_;
-    };
+    using Argument = typename GridwiseGemm::Argument;
 
     // Invoker
     struct Invoker : public BaseInvoker
     {
-        using Argument = DeviceOp::Argument;
-
         float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
-#if DEBUG_LOG
+            if(stream_config.log_level_ > 0)
             {
-                std::cout << "arg.a_grid_desc_ak0_m_ak1_{"
-                          << arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) << ", "
-                          << arg.a_grid_desc_ak0_m_ak1_.GetLength(I1) << ", "
-                          << arg.a_grid_desc_ak0_m_ak1_.GetLength(I2) << "}" << std::endl;
-
-                std::cout << "arg.b_grid_desc_bk0_n_bk1_{"
-                          << arg.b_grid_desc_bk0_n_bk1_.GetLength(I0) << ", "
-                          << arg.b_grid_desc_bk0_n_bk1_.GetLength(I1) << ", "
-                          << arg.b_grid_desc_bk0_n_bk1_.GetLength(I2) << "}" << std::endl;
-
-                std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", "
-                          << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
+                arg.Print();
             }
-#endif
 
-            if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
-                                            arg.b_grid_desc_bk0_n_bk1_,
-                                            arg.c_grid_desc_m_n_,
-                                            arg.block_2_ctile_map_))
+            if(!GridwiseGemm::CheckValidity(arg))
             {
                 throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
             }
 
-            const index_t grid_size =
-                arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_);
-            const auto K =
-                arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2);
+            index_t gdx, gdy, gdz;
+            std::tie(gdx, gdy, gdz) = GridwiseGemm::CalculateGridSize(arg.M, arg.N);
+
+            const auto K = GridwiseGemm::CalculateAK0(arg.K) * AK1;
 
             float ave_time = 0;
 
             if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
             {
-                const auto kernel = kernel_gemm_xdl_cshuffle_v1<
-                    GridwiseGemm,
-                    ADataType, // TODO: distiguish A/B datatype
-                    CDataType,
-                    AElementwiseOperation,
-                    BElementwiseOperation,
-                    CElementwiseOperation,
-                    DeviceOp::AGridDesc_AK0_M_AK1,
-                    DeviceOp::BGridDesc_BK0_N_BK1,
-                    typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
-                    typename GridwiseGemm::DefaultBlock2CTileMap,
-                    true>;
-
-                ave_time =
-                    launch_and_time_kernel(stream_config,
-                                           kernel,
-                                           dim3(grid_size),
-                                           dim3(BlockSize),
-                                           0,
-                                           arg.p_a_grid_,
-                                           arg.p_b_grid_,
-                                           arg.p_c_grid_,
-                                           arg.a_element_op_,
-                                           arg.b_element_op_,
-                                           arg.c_element_op_,
-                                           arg.a_grid_desc_ak0_m_ak1_,
-                                           arg.b_grid_desc_bk0_n_bk1_,
-                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                           arg.block_2_ctile_map_);
+                const auto kernel = kernel_gemm_xdl_cshuffle_v1<GridwiseGemm, true>;
+
+                ave_time = launch_and_time_kernel(
+                    stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, arg);
             }
             else
             {
-                const auto kernel = kernel_gemm_xdl_cshuffle_v1<
-                    GridwiseGemm,
-                    ADataType, // TODO: distiguish A/B datatype
-                    CDataType,
-                    AElementwiseOperation,
-                    BElementwiseOperation,
-                    CElementwiseOperation,
-                    DeviceOp::AGridDesc_AK0_M_AK1,
-                    DeviceOp::BGridDesc_BK0_N_BK1,
-                    typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
-                    typename GridwiseGemm::DefaultBlock2CTileMap,
-                    false>;
-                ave_time =
-                    launch_and_time_kernel(stream_config,
-                                           kernel,
-                                           dim3(grid_size),
-                                           dim3(BlockSize),
-                                           0,
-                                           arg.p_a_grid_,
-                                           arg.p_b_grid_,
-                                           arg.p_c_grid_,
-                                           arg.a_element_op_,
-                                           arg.b_element_op_,
-                                           arg.c_element_op_,
-                                           arg.a_grid_desc_ak0_m_ak1_,
-                                           arg.b_grid_desc_bk0_n_bk1_,
-                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                           arg.block_2_ctile_map_);
+                const auto kernel = kernel_gemm_xdl_cshuffle_v1<GridwiseGemm, false>;
+
+                ave_time = launch_and_time_kernel(
+                    stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, arg);
             }
 
             return ave_time;
@@ -580,19 +194,15 @@ struct DeviceGemm_Xdl_CShuffle : public DeviceGemm<ALayout,
             return false;
         }
 
-        if((arg.kraw_ % AK1 != 0 || arg.kraw_ % BK1 != 0) &&
-           !(GemmSpec == GemmSpecialization::MKPadding ||
-             GemmSpec == GemmSpecialization::NKPadding ||
-             GemmSpec == GemmSpecialization::MNKPadding ||
-             GemmSpec == GemmSpecialization::KPadding))
+        if((arg.K % AK1 != 0 || arg.K % BK1 != 0) && !(GemmSpec == GemmSpecialization::MKPadding ||
+                                                       GemmSpec == GemmSpecialization::NKPadding ||
+                                                       GemmSpec == GemmSpecialization::MNKPadding ||
+                                                       GemmSpec == GemmSpecialization::KPadding))
         {
             return false;
         }
 
-        return GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
-                                           arg.b_grid_desc_bk0_n_bk1_,
-                                           arg.c_grid_desc_m_n_,
-                                           arg.block_2_ctile_map_);
+        return GridwiseGemm::CheckValidity(arg);
     }
 
     // polymorphic
@@ -604,28 +214,17 @@ struct DeviceGemm_Xdl_CShuffle : public DeviceGemm<ALayout,
     static auto MakeArgument(const ADataType* p_a,
                              const BDataType* p_b,
                              CDataType* p_c,
-                             index_t MRaw,
-                             index_t NRaw,
-                             index_t KRaw,
+                             index_t M,
+                             index_t N,
+                             index_t K,
                              index_t StrideA,
                              index_t StrideB,
                              index_t StrideC,
-                             AElementwiseOperation a_element_op,
-                             BElementwiseOperation b_element_op,
-                             CElementwiseOperation c_element_op)
+                             AElementwiseOperation,
+                             BElementwiseOperation,
+                             CElementwiseOperation)
     {
-        return Argument{p_a,
-                        p_b,
-                        p_c,
-                        MRaw,
-                        NRaw,
-                        KRaw,
-                        StrideA,
-                        StrideB,
-                        StrideC,
-                        a_element_op,
-                        b_element_op,
-                        c_element_op};
+        return Argument{p_a, p_b, p_c, M, N, K, StrideA, StrideB, StrideC};
     }
 
     static auto MakeInvoker() { return Invoker{}; }
@@ -634,28 +233,25 @@ struct DeviceGemm_Xdl_CShuffle : public DeviceGemm<ALayout,
     std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
                                                       const void* p_b,
                                                       void* p_c,
-                                                      index_t MRaw,
-                                                      index_t NRaw,
-                                                      index_t KRaw,
+                                                      index_t M,
+                                                      index_t N,
+                                                      index_t K,
                                                       index_t StrideA,
                                                       index_t StrideB,
                                                       index_t StrideC,
-                                                      AElementwiseOperation a_element_op,
-                                                      BElementwiseOperation b_element_op,
-                                                      CElementwiseOperation c_element_op) override
+                                                      AElementwiseOperation,
+                                                      BElementwiseOperation,
+                                                      CElementwiseOperation) override
     {
         return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
                                           static_cast<const BDataType*>(p_b),
                                           static_cast<CDataType*>(p_c),
-                                          MRaw,
-                                          NRaw,
-                                          KRaw,
+                                          M,
+                                          N,
+                                          K,
                                           StrideA,
                                           StrideB,
-                                          StrideC,
-                                          a_element_op,
-                                          b_element_op,
-                                          c_element_op);
+                                          StrideC);
     }
 
     // polymorphic
diff --git a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
index 9bd860f39..ad91c3c68 100644
--- a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
+++ b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
@@ -109,30 +109,37 @@ struct BlockToCTileMap_M00_N0_M01
 
 // Rows of column-vectors
 // This C-tile map dynamically adjusts M01 when C-tile index is out of range
-template <index_t MPerBlock, index_t NPerBlock, typename CGridDesc_M_N>
-struct BlockToCTileMap_M00_N0_M01Adapt
+template <index_t MPerBlock, index_t NPerBlock, typename CGridDesc_M_N = void>
+struct BlockToCTileMap_M00_N0_M01Adapt;
+
+template <index_t MPerBlock, index_t NPerBlock>
+struct BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, void>
 {
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};
-    static constexpr auto I2 = Number<2>{};
-    static constexpr auto I3 = Number<3>{};
 
     __host__ __device__ BlockToCTileMap_M00_N0_M01Adapt() = default;
 
-    __host__ __device__ BlockToCTileMap_M00_N0_M01Adapt(const CGridDesc_M_N& c_grid_desc_m_n,
-                                                        index_t M01 = 8)
-        : M01_(M01), c_grid_desc_m_n_(c_grid_desc_m_n)
+    __host__ __device__ BlockToCTileMap_M00_N0_M01Adapt(const BlockToCTileMap_M00_N0_M01Adapt&) =
+        default;
+    __host__ __device__ BlockToCTileMap_M00_N0_M01Adapt(BlockToCTileMap_M00_N0_M01Adapt&&) =
+        default;
+    __host__ __device__ BlockToCTileMap_M00_N0_M01Adapt&
+    operator=(const BlockToCTileMap_M00_N0_M01Adapt&) = default;
+    __host__ __device__ BlockToCTileMap_M00_N0_M01Adapt&
+    operator=(BlockToCTileMap_M00_N0_M01Adapt&&) = default;
+
+    __host__ __device__ BlockToCTileMap_M00_N0_M01Adapt(index_t M, index_t N, index_t M01 = 8)
+        : M_(M), N_(N), M01_(M01)
     {
     }
 
-    __host__ constexpr index_t CalculateGridSize(const CGridDesc_M_N& c_grid_desc_m_n) const
+    __host__ static constexpr index_t CalculateGridSize(index_t M, index_t N)
     {
-        const auto M0 = math::integer_divide_ceil(c_grid_desc_m_n.GetLength(I0), MPerBlock);
-        const auto N0 = math::integer_divide_ceil(c_grid_desc_m_n.GetLength(I1), NPerBlock);
-
-        const index_t grid_size = M0 * N0;
+        const auto M0 = math::integer_divide_ceil(M, MPerBlock);
+        const auto N0 = math::integer_divide_ceil(N, NPerBlock);
 
-        return grid_size;
+        return M0 * N0;
     }
 
     template <typename TopIdx>
@@ -140,8 +147,8 @@ struct BlockToCTileMap_M00_N0_M01Adapt
     {
         auto block_1d_id = idx_top[I0];
 
-        const auto M0 = math::integer_divide_ceil(c_grid_desc_m_n_.GetLength(I0), MPerBlock);
-        const auto N0 = math::integer_divide_ceil(c_grid_desc_m_n_.GetLength(I1), NPerBlock);
+        const auto M0 = math::integer_divide_ceil(M_, MPerBlock);
+        const auto N0 = math::integer_divide_ceil(N_, NPerBlock);
 
         block_1d_id = block_1d_id % (M0 * N0); // swallow batch index
 
@@ -209,11 +216,36 @@ struct BlockToCTileMap_M00_N0_M01Adapt
         return true; // always valid provided that user gets grid size from CalculateGridSize()
     }
 
-    __host__ bool CheckValidity(const CGridDesc_M_N& /* c_grid_desc_m_n */) const { return true; }
-
     private:
+    index_t M_;
+    index_t N_;
     index_t M01_;
-    CGridDesc_M_N c_grid_desc_m_n_;
+};
+
+template <index_t MPerBlock, index_t NPerBlock, typename CGridDesc_M_N>
+struct BlockToCTileMap_M00_N0_M01Adapt : BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, void>
+{ // descriptor-taking front end for the <MPerBlock, NPerBlock, void> specialization
+    using Parent = BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, void>;
+
+    using Parent::I0;
+    using Parent::I1;
+
+    using Parent::Parent; // brings in the Parent(M, N, M01) constructor
+    using Parent::operator=;
+
+    __host__ __device__ BlockToCTileMap_M00_N0_M01Adapt(const CGridDesc_M_N& c_grid_desc_m_n,
+                                                        index_t M01 = 8)
+        : Parent(c_grid_desc_m_n.GetLength(I0), c_grid_desc_m_n.GetLength(I1), M01)
+    { // only the two lengths are forwarded; the descriptor itself is not stored
+    }
+
+    __host__ static constexpr index_t CalculateGridSize(const CGridDesc_M_N& c_grid_desc_m_n)
+    { // forwards lengths I0 and I1 of the descriptor to Parent::CalculateGridSize(M, N)
+        return Parent::CalculateGridSize(c_grid_desc_m_n.GetLength(I0),
+                                         c_grid_desc_m_n.GetLength(I1));
+    }
+
+    __host__ bool CheckValidity(const CGridDesc_M_N& /* c_grid_desc_m_n */) const { return true; }
 };
 
 // 2D slices of column-vectors in 3D space
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
index 1213cdc26..e6303d76c 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
@@ -17,17 +17,25 @@
 
 namespace ck {
 
-template <typename GridwiseGemm,
-          typename FloatAB,
-          typename FloatC,
-          typename AElementwiseOperation,
-          typename BElementwiseOperation,
-          typename CElementwiseOperation,
-          typename AGridDesc_AK0_M_AK1,
-          typename BGridDesc_BK0_N_BK1,
-          typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
-          typename Block2CTileMap,
-          bool HasMainKBlockLoop>
+template <typename GridwiseGemm, bool HasMainKBlockLoop>
+__global__ void
+#if CK_USE_LAUNCH_BOUNDS
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+#endif
+        kernel_gemm_xdl_cshuffle_v1(typename GridwiseGemm::Argument karg)
+{ // kernel overload that receives the whole GridwiseGemm::Argument by value
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
+    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; // size set by GridwiseGemm
+
+    GridwiseGemm::template Run<HasMainKBlockLoop>(
+        karg.p_a_grid, karg.p_b_grid, karg.p_c_grid, p_shared, karg);
+#else // device compilation for a target other than gfx908 / gfx90a / gfx940: body compiled out
+    ignore = karg; // keeps the otherwise unused parameter referenced
+#endif // !__HIP_DEVICE_COMPILE__ || gfx908 || gfx90a || gfx940
+}
+
+template <typename GridwiseGemm, typename FloatAB, typename FloatC, bool HasMainKBlockLoop>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
     __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
@@ -35,55 +43,33 @@ __global__ void
         kernel_gemm_xdl_cshuffle_v1(const FloatAB* __restrict__ p_a_grid,
                                     const FloatAB* __restrict__ p_b_grid,
                                     FloatC* __restrict__ p_c_grid,
-                                    const AElementwiseOperation a_element_op,
-                                    const BElementwiseOperation b_element_op,
-                                    const CElementwiseOperation c_element_op,
-                                    const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
-                                    const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
-                                    const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-                                        c_grid_desc_mblock_mperblock_nblock_nperblock,
-                                    const Block2CTileMap block_2_ctile_map)
+                                    typename GridwiseGemm::Problem problem)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
     defined(__gfx940__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
-                                                  p_b_grid,
-                                                  p_c_grid,
-                                                  p_shared,
-                                                  a_element_op,
-                                                  b_element_op,
-                                                  c_element_op,
-                                                  a_grid_desc_ak0_m_ak1,
-                                                  b_grid_desc_bk0_n_bk1,
-                                                  c_grid_desc_mblock_mperblock_nblock_nperblock,
-                                                  block_2_ctile_map);
+    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid, p_b_grid, p_c_grid, p_shared, problem);
 #else
     ignore = p_a_grid;
     ignore = p_b_grid;
     ignore = p_c_grid;
-    ignore = a_element_op;
-    ignore = b_element_op;
-    ignore = c_element_op;
-    ignore = a_grid_desc_ak0_m_ak1;
-    ignore = b_grid_desc_bk0_n_bk1;
-    ignore = c_grid_desc_mblock_mperblock_nblock_nperblock;
-    ignore = block_2_ctile_map;
+    ignore = problem;
 #endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
 }
 
-template <typename FloatAB,
+template <typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename FloatAB,
           typename FloatGemmAcc,
           typename FloatCShuffle,
           typename FloatC,
           typename AElementwiseOperation,
           typename BElementwiseOperation,
           typename CElementwiseOperation,
+          tensor_operation::device::GemmSpecialization GemmSpec,
           InMemoryDataOperationEnum CGlobalMemoryDataOperation,
-          typename AGridDesc_AK0_M_AK1,
-          typename BGridDesc_BK0_N_BK1,
-          typename CGridDesc_M_N,
           index_t NumGemmKPrefetchStage,
           index_t BlockSize,
           index_t MPerBlock,
@@ -129,35 +115,396 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
     static constexpr auto I7 = Number<7>{};
 
     // K1 should be Number<...>
-    static constexpr auto AK0 = Number<KPerBlock / AK1Value>{};
-    static constexpr auto BK0 = Number<KPerBlock / BK1Value>{};
-    static constexpr auto AK1 = Number<AK1Value>{};
-    static constexpr auto BK1 = Number<BK1Value>{};
+    static constexpr auto AK0Number = Number<KPerBlock / AK1Value>{};
+    static constexpr auto BK0Number = Number<KPerBlock / BK1Value>{};
+    static constexpr auto AK1Number = Number<AK1Value>{};
+    static constexpr auto BK1Number = Number<BK1Value>{};
 
     using ThisThreadBlock = ThisThreadBlock<BlockSize>;
 
+    __host__ static auto CalculateGridSize(index_t M, index_t N)
+    { // returns (x, y, z): x comes from Block2CTileMap::CalculateGridSize, y and z are fixed at 1
+        return std::make_tuple(Block2CTileMap::CalculateGridSize(M, N), 1, 1);
+    }
+
+    __host__ static auto CalculateMPadded(index_t M)
+    { // M rounded up to the next multiple of MPerBlock
+        return math::integer_divide_ceil(M, MPerBlock) * MPerBlock;
+    }
+
+    __host__ static auto CalculateNPadded(index_t N)
+    { // N rounded up to the next multiple of NPerBlock
+        return math::integer_divide_ceil(N, NPerBlock) * NPerBlock;
+    }
+
+    __host__ static auto CalculateKPadded(index_t K)
+    { // K rounded up to the next multiple of KPerBlock
+        return math::integer_divide_ceil(K, KPerBlock) * KPerBlock;
+    }
+
+    __host__ static auto CalculateAK0(index_t K)
+    { // K (padded when GemmSpec pads K) divided by AK1Value
+        using GemmSpecialization = tensor_operation::device::GemmSpecialization;
+
+        if constexpr(GemmSpec == GemmSpecialization::MKPadding ||
+                     GemmSpec == GemmSpecialization::MNKPadding ||
+                     GemmSpec == GemmSpecialization::KPadding ||
+                     GemmSpec == GemmSpecialization::NKPadding)
+        {
+            return CalculateKPadded(K) / AK1Value; // K is first rounded up to a KPerBlock multiple
+        }
+        else
+        {
+            return K / AK1Value; // unpadded K is divided as-is
+        }
+    }
+
+    __host__ static auto CalculateBK0(index_t K)
+    { // K (padded when GemmSpec pads K) divided by BK1Value
+        using GemmSpecialization = tensor_operation::device::GemmSpecialization;
+
+        if constexpr(GemmSpec == GemmSpecialization::NKPadding ||
+                     GemmSpec == GemmSpecialization::MNKPadding ||
+                     GemmSpec == GemmSpecialization::KPadding ||
+                     GemmSpec == GemmSpecialization::MKPadding)
+        {
+            return CalculateKPadded(K) / BK1Value; // K is first rounded up to a KPerBlock multiple
+        }
+        else
+        {
+            return K / BK1Value; // unpadded K is divided as-is
+        }
+    }
+
+    __host__ static auto CalculateMBlock(index_t M)
+    { // NOTE(review): floors while CalculateMPadded uses ceil; presumably fed a padded M - confirm
+        return math::integer_divide_floor(M, MPerBlock);
+    }
+
+    __host__ static auto CalculateNBlock(index_t N)
+    { // NOTE(review): floors while CalculateNPadded uses ceil; presumably fed a padded N - confirm
+        return math::integer_divide_floor(N, NPerBlock);
+    }
+
+    __device__ static auto MakeAGridDescriptor_AK0_M_AK1(
+        index_t M, index_t MPad, index_t K, index_t KPad, index_t StrideA, index_t AK0)
+    { // MPad and KPad are padded totals; the pad amounts used below are MPad - M and KPad - K
+        const auto a_grid_desc_mraw_kraw = [&]() { // unpadded (M, K) view; strides follow ALayout
+            if constexpr(is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1));
+            }
+            else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA));
+            }
+        }(); // no else branch: any other ALayout leaves the lambda without a return value
+
+        using GemmSpecialization = tensor_operation::device::GemmSpecialization;
+
+        if constexpr(GemmSpec == GemmSpecialization::MKPadding ||
+                     GemmSpec == GemmSpecialization::MNKPadding)
+        {
+            // pad both M and K
+            const auto a_grid_desc_m_k =
+                transform_tensor_descriptor(a_grid_desc_mraw_kraw,
+                                            make_tuple(make_right_pad_transform(M, MPad - M),
+                                                       make_right_pad_transform(K, KPad - K)),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor(
+                a_grid_desc_m_k,
+                make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)),
+                           make_pass_through_transform(MPad)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}), // inputs: K is dim 1, M is dim 0
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{})); // outputs: K -> dims (0, 2), M -> dim 1
+
+            return a_grid_desc_ak0_m_ak1;
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::MPadding ||
+                          GemmSpec == GemmSpecialization::MNPadding)
+        {
+            // pad M, but not K
+            const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor(
+                a_grid_desc_mraw_kraw,
+                make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)),
+                           make_right_pad_transform(M, MPad - M)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return a_grid_desc_ak0_m_ak1;
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::KPadding ||
+                          GemmSpec == GemmSpecialization::NKPadding)
+        {
+            // pad K, but not M
+            const auto a_grid_desc_m_k = transform_tensor_descriptor(
+                a_grid_desc_mraw_kraw,
+                make_tuple(make_pass_through_transform(M), make_right_pad_transform(K, KPad - K)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor(
+                a_grid_desc_m_k,
+                make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)),
+                           make_pass_through_transform(M)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return a_grid_desc_ak0_m_ak1;
+        }
+        else
+        {
+            // not pad M or K
+            const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor(
+                a_grid_desc_mraw_kraw,
+                make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)),
+                           make_pass_through_transform(M)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return a_grid_desc_ak0_m_ak1;
+        }
+    }
+
+    __device__ static auto MakeBGridDescriptor_BK0_N_BK1(
+        index_t K, index_t KPad, index_t N, index_t NPad, index_t StrideB, index_t BK0)
+    { // NPad and KPad are padded totals; the pad amounts used below are NPad - N and KPad - K
+        const auto b_grid_desc_nraw_kraw = [&]() { // unpadded (N, K) view; strides follow BLayout
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(N, K), make_tuple(I1, StrideB));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(N, K), make_tuple(StrideB, I1));
+            }
+        }(); // no else branch: any other BLayout leaves the lambda without a return value
+
+        using GemmSpecialization = tensor_operation::device::GemmSpecialization;
+
+        if constexpr(GemmSpec == GemmSpecialization::NKPadding ||
+                     GemmSpec == GemmSpecialization::MNKPadding)
+        {
+            // pad both N and K
+            const auto b_grid_desc_n_k =
+                transform_tensor_descriptor(b_grid_desc_nraw_kraw,
+                                            make_tuple(make_right_pad_transform(N, NPad - N),
+                                                       make_right_pad_transform(K, KPad - K)),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor(
+                b_grid_desc_n_k,
+                make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)),
+                           make_pass_through_transform(NPad)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}), // inputs: K is dim 1, N is dim 0
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{})); // outputs: K -> dims (0, 2), N -> dim 1
+
+            return b_grid_desc_bk0_n_bk1;
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::NPadding ||
+                          GemmSpec == GemmSpecialization::MNPadding)
+        {
+            // pad N, but not K
+            const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor(
+                b_grid_desc_nraw_kraw,
+                make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)),
+                           make_right_pad_transform(N, NPad - N)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return b_grid_desc_bk0_n_bk1;
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::KPadding ||
+                          GemmSpec == GemmSpecialization::MKPadding)
+        {
+            // pad K, but not N
+            const auto b_grid_desc_n_k = transform_tensor_descriptor(
+                b_grid_desc_nraw_kraw,
+                make_tuple(make_pass_through_transform(N), make_right_pad_transform(K, KPad - K)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor(
+                b_grid_desc_n_k,
+                make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)),
+                           make_pass_through_transform(N)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return b_grid_desc_bk0_n_bk1;
+        }
+        else
+        {
+            // not pad N or K
+            const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor(
+                b_grid_desc_nraw_kraw,
+                make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)),
+                           make_pass_through_transform(N)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return b_grid_desc_bk0_n_bk1;
+        }
+    }
+
+    __host__ __device__ static auto
+    MakeCGridDescriptor_M_N(index_t M, index_t MPad, index_t N, index_t NPad, index_t StrideC)
+    {
+        const auto c_grid_desc_mraw_nraw = [&]() {
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, CLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideC));
+            }
+        }();
+
+        using GemmSpecialization = tensor_operation::device::GemmSpecialization;
+        // GemmSpec decides at compile time which of M and N get right-padded (to MPad / NPad)
+        if constexpr(GemmSpec == GemmSpecialization::MNPadding ||
+                     GemmSpec == GemmSpecialization::MNKPadding)
+        {
+            // pad both M and N on the right, up to MPad and NPad
+            return transform_tensor_descriptor(c_grid_desc_mraw_nraw,
+                                               make_tuple(make_right_pad_transform(M, MPad - M),
+                                                          make_right_pad_transform(N, NPad - N)),
+                                               make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                               make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::MPadding ||
+                          GemmSpec == GemmSpecialization::MKPadding)
+        {
+            // pad M up to MPad; N is passed through unchanged
+            return transform_tensor_descriptor(
+                c_grid_desc_mraw_nraw,
+                make_tuple(make_right_pad_transform(M, MPad - M), make_pass_through_transform(N)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::NPadding ||
+                          GemmSpec == GemmSpecialization::NKPadding)
+        {
+            // pad N up to NPad; M is passed through unchanged
+            return transform_tensor_descriptor(
+                c_grid_desc_mraw_nraw,
+                make_tuple(make_pass_through_transform(M), make_right_pad_transform(N, NPad - N)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+        else
+        {
+            // neither M nor N is padded: return the raw (M, N) descriptor
+            return c_grid_desc_mraw_nraw;
+        }
+    }
+
+    struct Problem
+    {
+        __host__ Problem(index_t M_,
+                         index_t N_,
+                         index_t K_,
+                         index_t StrideA_,
+                         index_t StrideB_,
+                         index_t StrideC_)
+            : M{M_},
+              N{N_},
+              K{K_},
+              StrideA{StrideA_},
+              StrideB{StrideB_},
+              StrideC{StrideC_},
+              MPadded{CalculateMPadded(M_)},
+              NPadded{CalculateNPadded(N_)},
+              KPadded{CalculateKPadded(K_)},
+              AK0{CalculateAK0(K_)},
+              BK0{CalculateBK0(K_)},
+              MBlock{CalculateMBlock(M_)},
+              NBlock{CalculateNBlock(N_)}
+        {
+        }
+        // Writes all fields to std::cout on one line
+        __host__ void Print() const
+        {
+            std::cout << "problem {"
+                      << "M:" << M << ", "
+                      << "N:" << N << ", "
+                      << "K:" << K << ", "
+                      << "SA:" << StrideA << ", "
+                      << "SB:" << StrideB << ", "
+                      << "SC:" << StrideC << ", "
+                      << "MP:" << MPadded << ", "
+                      << "NP:" << NPadded << ", "
+                      << "KP:" << KPadded << ", "
+                      << "AK0:" << AK0 << ", "
+                      << "BK0:" << BK0 << ", "
+                      << "MBlock: " << MBlock << ", "
+                      << "NBlock: " << NBlock << "}" << std::endl;
+        }
+        // M/N/K and strides as passed in; fields from MPadded on are computed in the constructor
+        index_t M;
+        index_t N;
+        index_t K;
+        index_t StrideA;
+        index_t StrideB;
+        index_t StrideC;
+        index_t MPadded;
+        index_t NPadded;
+        index_t KPadded;
+        index_t AK0;
+        index_t BK0;
+        index_t MBlock;
+        index_t NBlock;
+    };
+
+    // Argument: the Problem fields plus the A, B and C grid pointers
+    struct Argument : public tensor_operation::device::BaseArgument, public Problem
+    {
+        __host__ Argument(const FloatAB* p_a_grid_,
+                          const FloatAB* p_b_grid_,
+                          FloatC* p_c_grid_,
+                          index_t M_,
+                          index_t N_,
+                          index_t K_,
+                          index_t StrideA_,
+                          index_t StrideB_,
+                          index_t StrideC_)
+            : Problem{M_, N_, K_, StrideA_, StrideB_, StrideC_},
+              p_a_grid{p_a_grid_},
+              p_b_grid{p_b_grid_},
+              p_c_grid{p_c_grid_}
+        {
+        }
+
+        const FloatAB* p_a_grid;
+        const FloatAB* p_b_grid;
+        FloatC* p_c_grid;
+    };
+
     // FIXME: pass GridwiseGemmPipe as a template arguement into GridwiseGemm
     using GridwiseGemmPipe = remove_cvref_t<decltype(
         GridwiseGemmPipeline_Selector<PipelineVer, NumGemmKPrefetchStage, LoopSched>())>;
 
-    __host__ __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
+    __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
     {
         // A matrix in LDS memory, dst of blockwise copy
         return make_naive_tensor_descriptor(
-            make_tuple(AK0, Number<MPerBlock>{}, AK1),
-            make_tuple(Number<MPerBlock + ABlockLdsExtraM>{} * AK1, AK1, I1));
+            make_tuple(AK0Number, Number<MPerBlock>{}, AK1Number),
+            make_tuple(Number<MPerBlock + ABlockLdsExtraM>{} * AK1Number, AK1Number, I1));
     }
 
-    __host__ __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1()
+    __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1()
     {
         // B matrix in LDS memory, dst of blockwise copy
         return make_naive_tensor_descriptor(
-            make_tuple(BK0, Number<NPerBlock>{}, BK1),
-            make_tuple(Number<NPerBlock + BBlockLdsExtraN>{} * BK1, BK1, I1));
+            make_tuple(BK0Number, Number<NPerBlock>{}, BK1Number),
+            make_tuple(Number<NPerBlock + BBlockLdsExtraN>{} * BK1Number, BK1Number, I1));
     }
 
-    __host__ __device__ static constexpr auto
-    GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock()
+    __device__ static constexpr auto GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock()
     {
         constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl);
         constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl);
@@ -172,14 +519,14 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
         return c_shuffle_block_desc_mblock_mperblock_nblock_nperblock;
     }
 
-    __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
+    __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
     {
         // LDS allocation for A and B: be careful of alignment
         constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1();
         constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1();
 
         // lds max alignment
-        constexpr auto max_lds_align = math::lcm(AK1, BK1);
+        constexpr auto max_lds_align = math::lcm(AK1Number, BK1Number);
 
         constexpr auto a_block_space_size_aligned = math::integer_least_multiple(
             a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);
@@ -200,36 +547,102 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
     }
 
     // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
-    template <typename Block2CTileMap>
-    __host__ __device__ static constexpr bool
-    CheckValidity(const AGridDesc_AK0_M_AK1& a_grid_desc_ak0_m_ak1,
-                  const BGridDesc_BK0_N_BK1& b_grid_desc_bk0_n_bk1,
-                  const CGridDesc_M_N& c_grid_desc_m_n,
-                  const Block2CTileMap& block_2_ctile_map)
+    __host__ static constexpr bool CheckValidity(const Problem& problem)
     {
         static_assert((MPerBlock % (MPerXdl * MXdlPerWave) == 0) &&
                           (NPerBlock % (NXdlPerWave * NPerXdl)) == 0,
                       "Invalid tuning param!");
 
-        const auto M = a_grid_desc_ak0_m_ak1.GetLength(I1);
-        const auto N = b_grid_desc_bk0_n_bk1.GetLength(I1);
-        const auto K = a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2);
+        if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::MPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding))
+        {
+            if(!(problem.M % MPerBlock == 0))
+            {
+                return false;
+            }
+        }
 
-        if(!(M == c_grid_desc_m_n.GetLength(I0) && N == c_grid_desc_m_n.GetLength(I1)))
-            return false;
+        if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::NPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::NKPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding))
+        {
+            if(!(problem.N % NPerBlock == 0))
+            {
+                return false;
+            }
+        }
 
-        if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K % KPerBlock == 0))
-            return false;
+        if constexpr(GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding ||
+                     GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding ||
+                     GemmSpec == tensor_operation::device::GemmSpecialization::KPadding ||
+                     GemmSpec == tensor_operation::device::GemmSpecialization::NKPadding)
+        {
+            if(!(CalculateKPadded(problem.K) % AK1Value == 0) ||
+               !(CalculateKPadded(problem.K) % BK1Value == 0))
+            {
+                return false;
+            }
+        }
+        else
+        {
+            if(!(problem.K % AK1Value == 0) || !(problem.K % BK1Value == 0))
+            {
+                return false;
+            }
+        }
 
-        // check gridwise gemm pipeline
-        const auto num_k_loop = K / KPerBlock;
+        if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
+        {
+            if(problem.K % ABlockTransferSrcScalarPerVector != 0)
+            {
+                return false;
+            }
+        }
+        else
+        {
+            if(problem.M % ABlockTransferSrcScalarPerVector != 0)
+            {
+                return false;
+            }
+        }
 
-        if(!GridwiseGemmPipe::IsSupported(num_k_loop))
+        if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
         {
-            return false;
+            if(problem.N % BBlockTransferSrcScalarPerVector != 0)
+            {
+                return false;
+            }
+        }
+        else
+        {
+            if(problem.K % BBlockTransferSrcScalarPerVector != 0)
+            {
+                return false;
+            }
         }
 
-        if(!block_2_ctile_map.CheckValidity(c_grid_desc_m_n))
+        if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
+        {
+            if(problem.N % CShuffleBlockTransferScalarPerVector_NPerBlock != 0)
+            {
+                return false;
+            }
+        }
+        else
+        {
+            if(problem.M % CShuffleBlockTransferScalarPerVector_NPerBlock != 0)
+            {
+                return false;
+            }
+        }
+
+        // check gridwise gemm pipeline
+        const auto num_k_loop = (CalculateAK0(problem.K) * AK1Value) / KPerBlock;
+
+        if(!GridwiseGemmPipe::IsSupported(num_k_loop))
         {
             return false;
         }
@@ -238,22 +651,17 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
         return true;
     }
 
-    __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K)
+    __host__ static constexpr bool CalculateHasMainKBlockLoop(index_t K)
     {
         const index_t num_loop = K / KPerBlock;
 
         return GridwiseGemmPipe::CalculateHasMainLoop(num_loop);
     }
 
-    __host__ __device__ static constexpr auto
-    MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const CGridDesc_M_N& c_grid_desc_m_n)
+    template <typename CGridDesc>
+    __device__ static constexpr auto MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+        const CGridDesc& c_grid_desc_m_n, index_t MBlock, index_t NBlock)
     {
-        const auto M = c_grid_desc_m_n.GetLength(I0);
-        const auto N = c_grid_desc_m_n.GetLength(I1);
-
-        const auto MBlock = M / MPerBlock;
-        const auto NBlock = N / NPerBlock;
-
         const auto c_grid_desc_mblock_mperblock_nblock_nperblock = transform_tensor_descriptor(
             c_grid_desc_m_n,
             make_tuple(make_unmerge_transform(make_tuple(MBlock, Number<MPerBlock>{})),
@@ -265,33 +673,26 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
     }
 
     // return block_id to C matrix tile idx (m0, n0) mapping
-    __host__ __device__ static constexpr auto
-    MakeDefaultBlock2CTileMap(const CGridDesc_M_N& c_grid_desc_m_n)
-    {
-        return BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, CGridDesc_M_N>(
-            c_grid_desc_m_n);
-    }
-
-    using CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<decltype(
-        MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(CGridDesc_M_N{}))>;
-
-    using DefaultBlock2CTileMap =
-        remove_cvref_t<decltype(MakeDefaultBlock2CTileMap(CGridDesc_M_N{}))>;
+    using Block2CTileMap = BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock>;
 
-    template <bool HasMainKBlockLoop, typename Block2CTileMap>
+    template <bool HasMainKBlockLoop>
     __device__ static void Run(const FloatAB* __restrict__ p_a_grid,
                                const FloatAB* __restrict__ p_b_grid,
                                FloatC* __restrict__ p_c_grid,
                                void* __restrict__ p_shared,
-                               const AElementwiseOperation& a_element_op,
-                               const BElementwiseOperation& b_element_op,
-                               const CElementwiseOperation& c_element_op,
-                               const AGridDesc_AK0_M_AK1& a_grid_desc_ak0_m_ak1,
-                               const BGridDesc_BK0_N_BK1& b_grid_desc_bk0_n_bk1,
-                               const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock&
-                                   c_grid_desc_mblock_mperblock_nblock_nperblock,
-                               const Block2CTileMap& block_2_ctile_map)
+                               const Problem& problem)
     {
+        const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(
+            problem.M, problem.MPadded, problem.K, problem.KPadded, problem.StrideA, problem.AK0);
+        const auto b_grid_desc_bk0_n_bk1 = MakeBGridDescriptor_BK0_N_BK1(
+            problem.K, problem.KPadded, problem.N, problem.NPadded, problem.StrideB, problem.BK0);
+        const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N(
+            problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideC);
+
+        const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
+            MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                c_grid_desc_m_n, problem.MBlock, problem.NBlock);
+
         const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
         const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
@@ -299,7 +700,13 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
         auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
 
+        const AElementwiseOperation a_element_op{};
+        const BElementwiseOperation b_element_op{};
+        const CElementwiseOperation c_element_op{};
+
         // divide block work by [M, N]
+        const auto block_2_ctile_map = Block2CTileMap{problem.M, problem.N};
+
         const auto block_work_idx =
             block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id()));
 
@@ -319,7 +726,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
             __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock);
 
         // lds max alignment
-        constexpr auto max_lds_align = math::lcm(AK1, BK1);
+        constexpr auto max_lds_align = math::lcm(AK1Number, BK1Number);
 
         // A matrix in LDS memory, dst of blockwise copy
         constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1();
@@ -333,7 +740,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
                                                 AElementwiseOperation,
                                                 ck::tensor_operation::element_wise::PassThrough,
                                                 InMemoryDataOperationEnum::Set,
-                                                Sequence<AK0, MPerBlock, AK1>,
+                                                Sequence<AK0Number, MPerBlock, AK1Number>,
                                                 ABlockTransferThreadClusterLengths_AK0_M_AK1,
                                                 ABlockTransferThreadClusterArrangeOrder,
                                                 FloatAB,
@@ -364,7 +771,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
                                                 BElementwiseOperation,
                                                 ck::tensor_operation::element_wise::PassThrough,
                                                 InMemoryDataOperationEnum::Set,
-                                                Sequence<BK0, NPerBlock, BK1>,
+                                                Sequence<BK0Number, NPerBlock, BK1Number>,
                                                 BBlockTransferThreadClusterLengths_BK0_N_BK1,
                                                 BBlockTransferThreadClusterArrangeOrder,
                                                 FloatAB,
@@ -396,8 +803,9 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
         //     c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
         //       register
         // sanity check
-        constexpr index_t KPack = math::max(
-            math::lcm(AK1, BK1), MfmaSelector<FloatAB, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
+        constexpr index_t KPack =
+            math::max(math::lcm(AK1Number, BK1Number),
+                      MfmaSelector<FloatAB, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
 
         auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<
             BlockSize,
@@ -425,8 +833,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
             static_cast<FloatAB*>(p_shared) + a_block_space_size_aligned,
             b_block_desc_bk0_n_bk1.GetElementSpaceSize());
 
-        constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1, 0, 0);
-        constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1, 0, 0);
+        constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1Number, 0, 0);
+        constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1Number, 0, 0);
 
         // gridwise GEMM pipeline
         static_assert(std::is_default_constructible_v<GridwiseGemmPipe>);
-- 
GitLab


From 6eef0755c923c8dbf7860e31fb8c2e1b8859bc6e Mon Sep 17 00:00:00 2001
From: Haocong WANG <haocwang@amd.com>
Date: Tue, 30 May 2023 20:18:53 +0800
Subject: [PATCH 46/71] fix wmma gemm int8; add grouped conv int8 example
 (#716)

---
 .../CMakeLists.txt                            |  1 +
 ...ouped_conv_fwd_bias_relu_add_wmma_int8.cpp | 26 +++++++++++++++++++
 .../tensor_operation/gpu/warp/wmma_gemm.hpp   |  8 +++---
 3 files changed, 31 insertions(+), 4 deletions(-)
 create mode 100644 example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_wmma_int8.cpp

diff --git a/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt b/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
index 047299685..9780a64cc 100644
--- a/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
+++ b/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
@@ -20,4 +20,5 @@ if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS M
 endif()
 if(GPU_TARGETS MATCHES "gfx1100" OR GPU_TARGETS MATCHES "gfx1101" OR GPU_TARGETS MATCHES "gfx1102")
   add_example_executable(example_grouped_conv_fwd_bias_relu_add_wmma_fp16 grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp)
+  add_example_executable(example_grouped_conv_fwd_bias_relu_add_wmma_int8 grouped_conv_fwd_bias_relu_add_wmma_int8.cpp)
 endif()
diff --git a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_wmma_int8.cpp b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_wmma_int8.cpp
new file mode 100644
index 000000000..793324970
--- /dev/null
+++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_wmma_int8.cpp
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common_wmma.hpp"
+
+// kernel data types
+using InKernelDataType       = I8;
+using WeiKernelDataType      = I8;
+using AccDataType            = I32;
+using CShuffleDataType       = I8;
+using BiasKernelDataType     = I8;
+using ResidualKernelDataType = I8;
+using OutKernelDataType      = I8;
+
+// tensor data types
+using InUserDataType  = InKernelDataType;
+using WeiUserDataType = WeiKernelDataType;
+using OutUserDataType = OutKernelDataType;
+
+using InElementOp  = PassThrough;
+using WeiElementOp = PassThrough;
+using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
+
+#include "run_grouped_conv_fwd_bias_relu_add_wmma_example.inc"
+
+int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv); }
diff --git a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp
index 0672bf8e5..24efeb2de 100644
--- a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp
@@ -262,12 +262,12 @@ struct wmma_type<WmmaInstr::wmma_i32_16x16x16_iu8,
 
     template <index_t MPerWmma,
               index_t NPerWmma,
-              bool neg_a,
-              bool neg_b,
-              bool clamp,
               class FloatA,
               class FloatB,
-              class FloatC>
+              class FloatC,
+              bool neg_a = false,
+              bool neg_b = false,
+              bool clamp = false>
     __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
     {
         if constexpr(wave_size == 32)
-- 
GitLab


From 582e31e88d185f7bb8b24086a0d8d3804d17501f Mon Sep 17 00:00:00 2001
From: Po Yen Chen <PoYen.Chen@amd.com>
Date: Wed, 31 May 2023 23:25:25 +0800
Subject: [PATCH 47/71] Add class type support for
 __builtin_amdgcn_readfirstlane() (#711)

* Add overloaded version of __builtin_amdgcn_readfirstlane()

* Remove 'static' specifiers

* Remove more 'static' specifier

* Replace unsigned char by std::byte

* Add 'const' specifier to never changing variable

* Add 'inline' specifier to function definition

* Fix wrong boundary calculation logic

* Rename type trait

* Remove std:: qualifier from standard types

* Replace 'size_t' by 'unsigned'

* Use type alias to hint usage

* Replace static_for<> by ordinary 'for' loop

* Rename readfirstlane() to amd_wave_read_first_lane()

* Rename file readfirstlance.hpp as amd_wave_read_first_lane.hpp

* Reorder statements
---
 .../ck/utility/amd_wave_read_first_lane.hpp   | 83 +++++++++++++++++++
 include/ck/utility/common_header.hpp          |  1 +
 2 files changed, 84 insertions(+)
 create mode 100644 include/ck/utility/amd_wave_read_first_lane.hpp

diff --git a/include/ck/utility/amd_wave_read_first_lane.hpp b/include/ck/utility/amd_wave_read_first_lane.hpp
new file mode 100644
index 000000000..4652ce7a7
--- /dev/null
+++ b/include/ck/utility/amd_wave_read_first_lane.hpp
@@ -0,0 +1,83 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/ck.hpp"
+#include "ck/utility/functional2.hpp"
+#include "ck/utility/math.hpp"
+
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+namespace ck {
+namespace detail {
+
+template <unsigned Size>
+struct get_unsigned_int;
+
+template <>
+struct get_unsigned_int<1>
+{
+    using type = uint8_t;
+};
+
+template <>
+struct get_unsigned_int<2>
+{
+    using type = uint16_t;
+};
+
+template <>
+struct get_unsigned_int<4>
+{
+    using type = uint32_t;
+};
+
+template <unsigned Size>
+using get_unsigned_int_t = typename get_unsigned_int<Size>::type;
+
+} // namespace detail
+
+__device__ inline int32_t amd_wave_read_first_lane(int32_t value)
+{
+    return __builtin_amdgcn_readfirstlane(value);
+}
+
+// Applies amd_wave_read_first_lane() chunk by chunk to a trivially copyable class object and
+// returns the reassembled copy; the int32_t overload above performs each register read.
+template <
+    typename Object,
+    typename = std::enable_if_t<std::is_class_v<Object> && std::is_trivially_copyable_v<Object>>>
+__device__ auto amd_wave_read_first_lane(const Object& obj)
+{
+    using Size                = unsigned;
+    constexpr Size SgprSize   = 4;
+    constexpr Size ObjectSize = sizeof(Object);
+
+    auto* const from_obj = reinterpret_cast<const std::byte*>(&obj);
+    alignas(Object) std::byte to_obj[ObjectSize];
+    // Copy every complete SgprSize-byte chunk first.
+    constexpr Size RemainedSize             = ObjectSize % SgprSize;
+    constexpr Size CompleteSgprCopyBoundary = ObjectSize - RemainedSize;
+    for(Size offset = 0; offset < CompleteSgprCopyBoundary; offset += SgprSize)
+    {
+        using Sgpr = detail::get_unsigned_int_t<SgprSize>;
+        *reinterpret_cast<Sgpr*>(to_obj + offset) =
+            amd_wave_read_first_lane(*reinterpret_cast<const Sgpr*>(from_obj + offset));
+    }
+    // Tail of 1 or 2 bytes; a 3-byte tail has no detail::get_unsigned_int specialization.
+    if constexpr(0 < RemainedSize)
+    {
+        using Carrier = detail::get_unsigned_int_t<RemainedSize>;
+        *reinterpret_cast<Carrier*>(to_obj + CompleteSgprCopyBoundary) = amd_wave_read_first_lane(
+            *reinterpret_cast<const Carrier*>(from_obj + CompleteSgprCopyBoundary));
+    }
+
+    /// NOTE: Implicitly start object lifetime. It's better to use std::start_lifetime_as() in this
+    /// scenario
+    return *reinterpret_cast<Object*>(to_obj);
+}
+
+} // namespace ck
diff --git a/include/ck/utility/common_header.hpp b/include/ck/utility/common_header.hpp
index 1378bbe44..8da87c876 100644
--- a/include/ck/utility/common_header.hpp
+++ b/include/ck/utility/common_header.hpp
@@ -33,6 +33,7 @@
 #include "ck/utility/debug.hpp"
 
 #include "ck/utility/amd_buffer_addressing.hpp"
+#include "ck/utility/amd_wave_read_first_lane.hpp"
 #include "ck/utility/generic_memory_space_atomic.hpp"
 #include "ck/utility/get_id.hpp"
 #include "ck/utility/thread_group.hpp"
-- 
GitLab


From b94fd0b2279c6476c6e109e99dc5d0e6d8ce313c Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Wed, 31 May 2023 16:46:57 -0700
Subject: [PATCH 48/71] update copyright headers (#726)

---
 CONTRIBUTORS.md                                                 | 2 +-
 LICENSE                                                         | 2 +-
 client_example/01_gemm/gemm.cpp                                 | 2 +-
 .../02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp          | 2 +-
 client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu.cpp   | 2 +-
 client_example/02_gemm_add_add_fastgelu/gemm_fastgelu.cpp       | 2 +-
 .../03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp          | 2 +-
 .../03_gemm_layernorm/gemm_add_relu_add_layernorm_welford.cpp   | 2 +-
 client_example/04_contraction/contraction_bilinear_fp32.cpp     | 2 +-
 client_example/04_contraction/contraction_bilinear_fp64.cpp     | 2 +-
 .../04_contraction/contraction_g1m2n3k1_add_xdl_fp16.cpp        | 2 +-
 client_example/04_contraction/contraction_scale_fp32.cpp        | 2 +-
 client_example/04_contraction/contraction_scale_fp64.cpp        | 2 +-
 client_example/05_layernorm/layernorm2d.cpp                     | 2 +-
 client_example/06_softmax/softmax4d.cpp                         | 2 +-
 client_example/07_grouped_convnd_fwd/grouped_conv1d_fwd.cpp     | 2 +-
 client_example/07_grouped_convnd_fwd/grouped_conv2d_fwd.cpp     | 2 +-
 client_example/08_fused_attention/fused_attention.cpp           | 2 +-
 client_example/08_fused_attention/fused_attention_bias.cpp      | 2 +-
 .../conv2d_fwd_bias_relu_perchannel_quantization.cpp            | 2 +-
 .../conv2d_fwd_bias_relu_perlayer_quantization.cpp              | 2 +-
 .../conv2d_fwd_bias_tanh_perchannel_quantization.cpp            | 2 +-
 .../conv2d_fwd_bias_tanh_perlayer_quantization.cpp              | 2 +-
 .../09_quantization/conv2d_fwd_perchannel_quantization.cpp      | 2 +-
 .../09_quantization/conv2d_fwd_perlayer_quantization.cpp        | 2 +-
 client_example/09_quantization/gemm_quantization.cpp            | 2 +-
 .../10_grouped_conv2d_bwd_data/grouped_conv2d_bwd_data.cpp      | 2 +-
 client_example/11_grouped_conv_bwd_weight/common.hpp            | 2 +-
 .../12_elementwise_normalization/elementwise_layernorm2d.cpp    | 2 +-
 client_example/13_batchnorm/batchnorm_bwd_nhwc.cpp              | 2 +-
 client_example/13_batchnorm/batchnorm_fwd_nhwc.cpp              | 2 +-
 client_example/13_batchnorm/batchnorm_infer_nhwc.cpp            | 2 +-
 client_example/14_instance_id/batchnorm_fwd_instance_id.cpp     | 2 +-
 client_example/15_gemm_add_multiply/gemm_add_multiply.cpp       | 2 +-
 client_example/15_reduce/reduce_nhwc_c.cpp                      | 2 +-
 .../17_grouped_gemm_fastgelu/grouped_gemm_fastgelu.cpp          | 2 +-
 client_example/18_groupnorm/groupnorm_swish.cpp                 | 2 +-
 client_example/19_pool_fwd/avg_pool3d_fwd.cpp                   | 2 +-
 client_example/19_pool_fwd/max_pool2d_fwd.cpp                   | 2 +-
 example/01_gemm/common.hpp                                      | 2 +-
 example/01_gemm/gemm_dl_fp16.cpp                                | 2 +-
 example/01_gemm/gemm_dl_fp32.cpp                                | 2 +-
 example/01_gemm/gemm_dl_int4.cpp                                | 2 +-
 example/01_gemm/gemm_dl_int8.cpp                                | 2 +-
 example/01_gemm/gemm_wmma_fp16.cpp                              | 2 +-
 example/01_gemm/gemm_xdl_bf16.cpp                               | 2 +-
 example/01_gemm/gemm_xdl_fp16.cpp                               | 2 +-
 example/01_gemm/gemm_xdl_fp64.cpp                               | 2 +-
 example/01_gemm/gemm_xdl_int4.cpp                               | 2 +-
 example/01_gemm/gemm_xdl_int8.cpp                               | 2 +-
 example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp                    | 2 +-
 example/01_gemm/gemm_xdl_wavelet_fp16.cpp                       | 2 +-
 example/01_gemm/run_gemm_example.inc                            | 2 +-
 example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp            | 2 +-
 example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp             | 2 +-
 example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp           | 2 +-
 example/04_gemm_add_add_fastgelu/common.hpp                     | 2 +-
 .../04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp | 2 +-
 .../04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp | 2 +-
 .../04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp32.cpp | 2 +-
 .../04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int4.cpp | 2 +-
 .../04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int8.cpp | 2 +-
 example/09_convnd_fwd/convnd_fwd_common.hpp                     | 2 +-
 example/09_convnd_fwd/convnd_fwd_dl_common.hpp                  | 2 +-
 example/09_convnd_fwd/convnd_fwd_dl_fp16.cpp                    | 2 +-
 example/09_convnd_fwd/convnd_fwd_dl_fp32.cpp                    | 2 +-
 example/09_convnd_fwd/convnd_fwd_dl_int8.cpp                    | 2 +-
 example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp                   | 2 +-
 example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp                   | 2 +-
 example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp                   | 2 +-
 example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp                   | 2 +-
 example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp                   | 2 +-
 example/09_convnd_fwd/run_convnd_fwd_dl_example.inc             | 2 +-
 example/09_convnd_fwd/run_convnd_fwd_example.inc                | 2 +-
 example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp     | 2 +-
 .../convnd_fwd_max_xdl_bf16.cpp                                 | 2 +-
 .../convnd_fwd_max_xdl_fp16.cpp                                 | 2 +-
 .../convnd_fwd_max_xdl_fp32.cpp                                 | 2 +-
 .../convnd_fwd_max_xdl_int4.cpp                                 | 2 +-
 .../convnd_fwd_max_xdl_int8.cpp                                 | 2 +-
 .../run_convnd_fwd_max_example.inc                              | 2 +-
 example/12_reduce/reduce_blockwise.cpp                          | 2 +-
 example/12_reduce/reduce_blockwise_impl.hpp                     | 2 +-
 example/12_reduce/reduce_blockwise_two_call.cpp                 | 2 +-
 example/12_reduce/reduce_example_common.hpp                     | 2 +-
 example/12_reduce/reduce_multiblock_atomic_add.cpp              | 2 +-
 example/12_reduce/reduce_multiblock_atomic_add_impl.hpp         | 2 +-
 example/13_pool2d_fwd/pool2d_fwd_common.hpp                     | 2 +-
 example/13_pool2d_fwd/pool2d_fwd_fp16.cpp                       | 2 +-
 example/13_pool2d_fwd/pool2d_fwd_fp32.cpp                       | 2 +-
 example/14_gemm_quantization/gemm_dl_quantization_int8.cpp      | 2 +-
 .../gemm_xdl_bias_relu_quantization_int8.cpp                    | 2 +-
 example/14_gemm_quantization/gemm_xdl_quantization_int8.cpp     | 2 +-
 example/15_grouped_gemm/grouped_gemm_multiple_d_dl_fp16.cpp     | 2 +-
 example/15_grouped_gemm/grouped_gemm_xdl_bfp16.cpp              | 2 +-
 example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp               | 2 +-
 example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp               | 2 +-
 example/15_grouped_gemm/grouped_gemm_xdl_int4.cpp               | 2 +-
 example/15_grouped_gemm/grouped_gemm_xdl_int8.cpp               | 2 +-
 example/15_grouped_gemm/grouped_gemm_xdl_splitk_fp16.cpp        | 2 +-
 .../gemm_add_add_mean_meansquare_xdl_fp16.cpp                   | 2 +-
 .../gemm_add_addsquare_xdl_int8.cpp                             | 2 +-
 example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp     | 2 +-
 example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp     | 2 +-
 example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp32.cpp     | 2 +-
 example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int4.cpp     | 2 +-
 example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int8.cpp     | 2 +-
 .../gemm_mean_meansquare_xdl_bf16.cpp                           | 2 +-
 .../gemm_mean_meansquare_xdl_fp16.cpp                           | 2 +-
 .../gemm_mean_meansquare_xdl_fp32.cpp                           | 2 +-
 .../16_gemm_multi_d_multi_reduces/gemm_reduce_xdl_common.hpp    | 2 +-
 example/17_convnd_bwd_data/convnd_bwd_data_common.hpp           | 2 +-
 example/17_convnd_bwd_data/convnd_bwd_data_dl_fp16.cpp          | 2 +-
 example/17_convnd_bwd_data/convnd_bwd_data_xdl_fp16.cpp         | 2 +-
 example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp | 2 +-
 example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp       | 2 +-
 example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp      | 2 +-
 example/19_binary_elementwise/elementwise_add_1d.cpp            | 2 +-
 example/19_binary_elementwise/elementwise_add_4d.cpp            | 2 +-
 example/20_grouped_conv_bwd_weight/common.hpp                   | 2 +-
 .../grouped_conv_bwd_weight_xdl_bf16.cpp                        | 2 +-
 .../grouped_conv_bwd_weight_xdl_fp16.cpp                        | 2 +-
 .../run_grouped_conv_bwd_weight_example.inc                     | 2 +-
 .../gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp             | 2 +-
 .../gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp           | 2 +-
 example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp     | 2 +-
 .../gemm_xdl_layernorm_naive_single_kernel_fp16.cpp             | 2 +-
 example/22_cgemm/cgemm_xdl_bf16.cpp                             | 2 +-
 example/22_cgemm/cgemm_xdl_common.hpp                           | 2 +-
 example/22_cgemm/cgemm_xdl_fp16.cpp                             | 2 +-
 example/22_cgemm/cgemm_xdl_fp32.cpp                             | 2 +-
 example/22_cgemm/cgemm_xdl_int4.cpp                             | 2 +-
 example/22_cgemm/cgemm_xdl_int8.cpp                             | 2 +-
 example/23_softmax/softmax_blockwise.cpp                        | 2 +-
 .../gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp                   | 2 +-
 .../gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp                   | 2 +-
 example/26_contraction/contraction_bilinear_xdl_fp32.cpp        | 2 +-
 example/26_contraction/contraction_bilinear_xdl_fp64.cpp        | 2 +-
 example/26_contraction/contraction_scale_xdl_fp32.cpp           | 2 +-
 example/26_contraction/contraction_scale_xdl_fp64.cpp           | 2 +-
 example/27_layernorm/common.hpp                                 | 2 +-
 example/27_layernorm/layernorm_fp16.cpp                         | 2 +-
 example/27_layernorm/layernorm_splitk_fp16.cpp                  | 2 +-
 example/27_layernorm/run_layernorm_example.inc                  | 2 +-
 .../grouped_gemm_bias_e_permute_xdl_fp16.cpp                    | 2 +-
 .../batched_gemm_bias_e_permute_wmma_fp16.cpp                   | 2 +-
 .../batched_gemm_bias_e_permute_xdl_fp16.cpp                    | 2 +-
 example/30_grouped_conv_fwd_multiple_d/common.hpp               | 2 +-
 example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp          | 2 +-
 .../grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp                | 2 +-
 .../grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp                 | 2 +-
 .../grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp                 | 2 +-
 .../grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp                 | 2 +-
 .../grouped_conv_fwd_bias_relu_add_xdl_int4.cpp                 | 2 +-
 .../grouped_conv_fwd_bias_relu_add_xdl_int8.cpp                 | 2 +-
 .../run_grouped_conv_fwd_bias_relu_add_example.inc              | 2 +-
 .../run_grouped_conv_fwd_bias_relu_add_wmma_example.inc         | 2 +-
 .../run_grouped_conv_fwd_example.inc                            | 2 +-
 example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_bf16.cpp     | 2 +-
 example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp16.cpp     | 2 +-
 example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp32.cpp     | 2 +-
 example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int4.cpp     | 2 +-
 example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int8.cpp     | 2 +-
 example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc  | 2 +-
 ..._gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp | 2 +-
 .../batched_gemm_scale_softmax_gemm_permute_xdl_bf16.cpp        | 2 +-
 .../batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp        | 2 +-
 .../batched_gemm_scale_softmax_gemm_xdl_bf16.cpp                | 2 +-
 .../batched_gemm_scale_softmax_gemm_xdl_fp16.cpp                | 2 +-
 ..._gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp | 2 +-
 .../grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp        | 2 +-
 .../run_batched_gemm_scale_softmax_gemm.inc                     | 2 +-
 .../run_batched_gemm_scale_softmax_gemm_permute.inc             | 2 +-
 .../run_grouped_gemm_scale_softmax_gemm_permute.inc             | 2 +-
 example/33_multiple_reduce/dual_reduce_common.hpp               | 2 +-
 example/33_multiple_reduce/dual_reduce_multiblock.cpp           | 2 +-
 example/33_multiple_reduce/dual_reduce_threadwise.cpp           | 2 +-
 example/34_batchnorm/batchnorm_backward_nhwc.cpp                | 2 +-
 example/34_batchnorm/batchnorm_common.hpp                       | 2 +-
 example/34_batchnorm/batchnorm_forward_inferring_nhwc.cpp       | 2 +-
 example/34_batchnorm/batchnorm_forward_training_nhwc.cpp        | 2 +-
 example/34_batchnorm/batchnorm_infer_impl.hpp                   | 2 +-
 example/35_splitK_gemm/splitK_gemm_xdl_bfp16.cpp                | 2 +-
 example/35_splitK_gemm/splitK_gemm_xdl_fp16.cpp                 | 2 +-
 example/35_splitK_gemm/splitK_gemm_xdl_fp32.cpp                 | 2 +-
 example/35_splitK_gemm/splitK_gemm_xdl_int4.cpp                 | 2 +-
 example/35_splitK_gemm/splitK_gemm_xdl_int8.cpp                 | 2 +-
 .../36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp | 2 +-
 .../batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp             | 2 +-
 example/38_grouped_conv_bwd_data_multiple_d/common.hpp          | 2 +-
 .../grouped_conv_bwd_data_bias_relu_fp16.cpp                    | 2 +-
 .../grouped_conv_bwd_data_fp16.cpp                              | 2 +-
 .../run_grouped_conv_bwd_data_bias_relu_example.inc             | 2 +-
 .../run_grouped_conv_bwd_data_example.inc                       | 2 +-
 example/39_permute/common.hpp                                   | 2 +-
 example/39_permute/permute_1xHxW_fp16.cpp                       | 2 +-
 example/39_permute/permute_HxWx4_fp16.cpp                       | 2 +-
 example/39_permute/permute_NxHxW_fp16.cpp                       | 2 +-
 example/39_permute/run_permute_bundle_example.inc               | 2 +-
 example/39_permute/run_permute_element_example.inc              | 2 +-
 example/40_conv2d_fwd_quantization/common.hpp                   | 2 +-
 .../conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp    | 2 +-
 .../conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp      | 2 +-
 .../conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp    | 2 +-
 .../conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp      | 2 +-
 .../conv2d_fwd_dl_perchannel_quantization_int8.cpp              | 2 +-
 .../conv2d_fwd_dl_perlayer_quantization_int8.cpp                | 2 +-
 .../conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp   | 2 +-
 .../conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp     | 2 +-
 .../conv2d_fwd_xdl_perchannel_quantization_int8.cpp             | 2 +-
 .../conv2d_fwd_xdl_perlayer_quantization_int8.cpp               | 2 +-
 .../run_conv2d_fwd_bias_perchannel_quantization_example.inc     | 2 +-
 .../run_conv2d_fwd_bias_perlayer_quantization_example.inc       | 2 +-
 .../run_conv2d_fwd_perchannel_quantization_example.inc          | 2 +-
 .../run_conv2d_fwd_perlayer_quantization_example.inc            | 2 +-
 .../41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_bf16.cpp | 2 +-
 .../41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp16.cpp | 2 +-
 .../41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp32.cpp | 2 +-
 .../41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int4.cpp | 2 +-
 .../41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int8.cpp | 2 +-
 .../run_grouped_conv_conv_fwd_example.inc                       | 2 +-
 example/42_groupnorm/common.hpp                                 | 2 +-
 example/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp             | 2 +-
 example/42_groupnorm/groupnorm_splitk_fp16.cpp                  | 2 +-
 example/42_groupnorm/groupnorm_swish_fp16.cpp                   | 2 +-
 example/42_groupnorm/run_groupnorm_example.inc                  | 2 +-
 .../splitk_gemm_bias_e_permute_xdl_fp16.cpp                     | 2 +-
 .../splitk_gemm_bias_e_permute_xdl_fp32.cpp                     | 2 +-
 .../elementwise_layernorm_blockwise.cpp                         | 2 +-
 example/46_gemm_add_multiply/common.hpp                         | 2 +-
 example/46_gemm_add_multiply/gemm_add_multiply_dl_fp16.cpp      | 2 +-
 example/46_gemm_add_multiply/gemm_add_multiply_xdl_fp16.cpp     | 2 +-
 .../gemm_bias_softmax_gemm_permute.cpp                          | 2 +-
 example/48_pool3d_fwd/pool3d_fwd_common.hpp                     | 2 +-
 example/48_pool3d_fwd/pool3d_fwd_fp16.cpp                       | 2 +-
 include/ck/ck.hpp                                               | 2 +-
 include/ck/host_utility/device_prop.hpp                         | 2 +-
 include/ck/host_utility/hip_check_error.hpp                     | 2 +-
 include/ck/host_utility/io.hpp                                  | 2 +-
 include/ck/host_utility/kernel_launch.hpp                       | 2 +-
 ...forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk.hpp | 2 +-
 include/ck/stream_config.hpp                                    | 2 +-
 include/ck/tensor/static_tensor.hpp                             | 2 +-
 include/ck/tensor_description/cluster_descriptor.hpp            | 2 +-
 include/ck/tensor_description/multi_index_transform.hpp         | 2 +-
 include/ck/tensor_description/multi_index_transform_helper.hpp  | 2 +-
 include/ck/tensor_description/tensor_adaptor.hpp                | 2 +-
 include/ck/tensor_description/tensor_descriptor.hpp             | 2 +-
 include/ck/tensor_description/tensor_descriptor_helper.hpp      | 2 +-
 include/ck/tensor_description/tensor_space_filling_curve.hpp    | 2 +-
 .../ck/tensor_operation/gpu/block/blockwise_gemm_dl_v2r3.hpp    | 2 +-
 .../ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v2r2.hpp | 2 +-
 .../ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v3.hpp   | 2 +-
 include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp   | 2 +-
 include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp | 2 +-
 .../gpu/block/blockwise_gemm_xdlops_skip_b_lds.hpp              | 2 +-
 include/ck/tensor_operation/gpu/block/blockwise_softmax.hpp     | 2 +-
 .../gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp          | 2 +-
 include/ck/tensor_operation/gpu/block/blockwise_welford.hpp     | 2 +-
 .../gpu/block/reduction_functions_blockwise.hpp                 | 2 +-
 .../gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp       | 2 +-
 .../gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp       | 2 +-
 .../gpu/block/thread_group_tensor_slice_transfer_v6r2.hpp       | 2 +-
 .../gpu/block/thread_group_tensor_slice_transfer_v6r3.hpp       | 2 +-
 .../gpu/block/thread_group_tensor_slice_transfer_v7.hpp         | 2 +-
 .../gpu/device/convolution_backward_data_specialization.hpp     | 2 +-
 .../gpu/device/convolution_backward_weight_specialization.hpp   | 2 +-
 .../gpu/device/convolution_forward_specialization.hpp           | 2 +-
 include/ck/tensor_operation/gpu/device/device_base.hpp          | 2 +-
 .../gpu/device/device_batched_contraction_multiple_d.hpp        | 2 +-
 include/ck/tensor_operation/gpu/device/device_batched_gemm.hpp  | 2 +-
 .../ck/tensor_operation/gpu/device/device_batched_gemm_gemm.hpp | 2 +-
 .../tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp | 2 +-
 .../device/device_batched_gemm_multiple_d_gemm_multiple_d.hpp   | 2 +-
 .../gpu/device/device_batched_gemm_softmax_gemm.hpp             | 2 +-
 .../gpu/device/device_batched_gemm_softmax_gemm_permute.hpp     | 2 +-
 .../tensor_operation/gpu/device/device_batchnorm_backward.hpp   | 2 +-
 .../ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp | 2 +-
 .../ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp   | 2 +-
 include/ck/tensor_operation/gpu/device/device_cgemm.hpp         | 2 +-
 .../gpu/device/device_contraction_multiple_d.hpp                | 2 +-
 include/ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp | 2 +-
 include/ck/tensor_operation/gpu/device/device_conv_fwd.hpp      | 2 +-
 .../gpu/device/device_conv_fwd_bias_activation.hpp              | 2 +-
 .../gpu/device/device_conv_fwd_bias_activation_add.hpp          | 2 +-
 include/ck/tensor_operation/gpu/device/device_elementwise.hpp   | 2 +-
 .../gpu/device/device_elementwise_normalization.hpp             | 2 +-
 include/ck/tensor_operation/gpu/device/device_gemm.hpp          | 2 +-
 .../tensor_operation/gpu/device/device_gemm_bias_e_permute.hpp  | 2 +-
 .../ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp   | 2 +-
 .../gpu/device/device_gemm_multiple_d_layernorm.hpp             | 2 +-
 .../gpu/device/device_gemm_multiple_d_multiple_r.hpp            | 2 +-
 include/ck/tensor_operation/gpu/device/device_gemm_reduce.hpp   | 2 +-
 include/ck/tensor_operation/gpu/device/device_gemm_splitk.hpp   | 2 +-
 .../gpu/device/device_gemm_xdl_waveletmodel_cshuffle.hpp        | 2 +-
 .../gpu/device/device_grouped_contraction_multiple_d.hpp        | 2 +-
 .../gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp      | 2 +-
 .../gpu/device/device_grouped_conv_bwd_weight.hpp               | 2 +-
 .../ck/tensor_operation/gpu/device/device_grouped_conv_fwd.hpp  | 2 +-
 .../device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp    | 2 +-
 .../gpu/device/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp    | 2 +-
 .../gpu/device/device_grouped_gemm_softmax_gemm_permute.hpp     | 2 +-
 .../device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp   | 2 +-
 .../ck/tensor_operation/gpu/device/device_multiple_reduce.hpp   | 2 +-
 include/ck/tensor_operation/gpu/device/device_normalization.hpp | 2 +-
 include/ck/tensor_operation/gpu/device/device_permute.hpp       | 2 +-
 include/ck/tensor_operation/gpu/device/device_pool_fwd.hpp      | 2 +-
 include/ck/tensor_operation/gpu/device/device_reduce.hpp        | 2 +-
 include/ck/tensor_operation/gpu/device/device_softmax.hpp       | 2 +-
 .../gpu/device/device_splitk_contraction_multiple_d.hpp         | 2 +-
 .../device_splitk_contraction_multiple_d_xdl_cshuffle.hpp       | 2 +-
 include/ck/tensor_operation/gpu/device/gemm_specialization.hpp  | 2 +-
 .../device_batched_contraction_multiple_d_wmma_cshuffle.hpp     | 2 +-
 .../impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp | 2 +-
 .../gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp   | 2 +-
 .../gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp         | 2 +-
 ...ice_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp | 2 +-
 .../gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp | 2 +-
 .../device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp   | 2 +-
 .../impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp      | 2 +-
 .../gpu/device/impl/device_batched_gemm_xdl.hpp                 | 2 +-
 .../gpu/device/impl/device_batchnorm_backward_impl.hpp          | 2 +-
 .../gpu/device/impl/device_batchnorm_forward_impl.hpp           | 2 +-
 .../gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp         | 2 +-
 .../device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp  | 2 +-
 ...vice_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp | 2 +-
 .../device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp   | 2 +-
 ...v2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp | 2 +-
 ..._conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp | 2 +-
 .../impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp     | 2 +-
 .../gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp    | 2 +-
 .../device/impl/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp   | 2 +-
 .../gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp | 2 +-
 .../gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp   | 2 +-
 .../gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp  | 2 +-
 .../gpu/device/impl/device_elementwise_2d_impl.hpp              | 2 +-
 .../gpu/device/impl/device_elementwise_impl.hpp                 | 2 +-
 .../gpu/device/impl/device_elementwise_normalization_impl.hpp   | 2 +-
 .../device/impl/device_gemm_bias_add_reduce_xdl_cshuffle.hpp    | 2 +-
 include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp  | 2 +-
 .../gpu/device/impl/device_gemm_multiple_d_dl.hpp               | 2 +-
 .../impl/device_gemm_multiple_d_layernorm_xdl_cshuffle.hpp      | 2 +-
 .../impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp     | 2 +-
 .../gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp    | 2 +-
 .../gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp     | 2 +-
 .../gpu/device/impl/device_gemm_reduce_xdl_cshuffle.hpp         | 2 +-
 .../ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp    | 2 +-
 include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp | 2 +-
 .../gpu/device/impl/device_gemm_xdl_cshuffle.hpp                | 2 +-
 .../gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp      | 2 +-
 .../gpu/device/impl/device_gemm_xdl_skip_b_lds.hpp              | 2 +-
 .../gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp        | 2 +-
 .../impl/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp | 2 +-
 .../device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp | 2 +-
 ...vice_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp | 2 +-
 .../impl/device_grouped_conv_fwd_multiple_d_multiple_r.hpp      | 2 +-
 ...vice_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp | 2 +-
 .../impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp   | 2 +-
 .../gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp       | 2 +-
 .../gpu/device/impl/device_grouped_gemm_xdl.hpp                 | 2 +-
 .../gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp | 2 +-
 .../gpu/device/impl/device_multiple_reduce_multiblock.hpp       | 2 +-
 .../gpu/device/impl/device_multiple_reduce_threadwise.hpp       | 2 +-
 .../gpu/device/impl/device_normalization_impl.hpp               | 2 +-
 .../gpu/device/impl/device_normalization_splitk_impl.hpp        | 2 +-
 .../ck/tensor_operation/gpu/device/impl/device_permute_impl.hpp | 2 +-
 .../gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp             | 2 +-
 .../gpu/device/impl/device_pool3d_fwd_ndhwc_ndhwc.hpp           | 2 +-
 .../tensor_operation/gpu/device/impl/device_reduce_common.hpp   | 2 +-
 .../gpu/device/impl/device_reduce_multiblock.hpp                | 2 +-
 .../gpu/device/impl/device_reduce_threadwise.hpp                | 2 +-
 .../ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp | 2 +-
 .../device/impl/device_sparse_embeddings_forward_layernorm.hpp  | 2 +-
 .../ck/tensor_operation/gpu/device/masking_specialization.hpp   | 2 +-
 include/ck/tensor_operation/gpu/device/matrix_padder.hpp        | 2 +-
 .../tensor_operation/gpu/device/reduction_operator_mapping.hpp  | 2 +-
 include/ck/tensor_operation/gpu/device/tensor_layout.hpp        | 2 +-
 .../ck/tensor_operation/gpu/device/tensor_specialization.hpp    | 2 +-
 include/ck/tensor_operation/gpu/device/welford_helper.hpp       | 2 +-
 .../gpu/element/binary_element_wise_operation.hpp               | 2 +-
 .../ck/tensor_operation/gpu/element/element_wise_operation.hpp  | 2 +-
 .../gpu/element/unary_element_wise_operation.hpp                | 2 +-
 ...e_multiblock_reduce_second_half_batchnorm_backward_final.hpp | 2 +-
 .../gridwise_multiblock_welford_first_half.hpp                  | 2 +-
 ...e_multiblock_welford_second_half_batchnorm_forward_final.hpp | 2 +-
 ...tiblock_welford_second_half_multiblock_reduce_first_half.hpp | 2 +-
 include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp     | 2 +-
 ...gridwise_gemm_multiple_d_welford_first_half_xdl_cshuffle.hpp | 2 +-
 .../gemm_layernorm/gridwise_welford_second_half_layernorm2d.hpp | 2 +-
 .../gpu/grid/gridwise_2d_multiple_reduction_multiblock.hpp      | 2 +-
 .../gpu/grid/gridwise_2d_multiple_reduction_threadwise.hpp      | 2 +-
 .../gpu/grid/gridwise_2d_reduction_multiblock.hpp               | 2 +-
 .../gpu/grid/gridwise_2d_reduction_threadwise.hpp               | 2 +-
 .../gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp     | 2 +-
 ..._batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp | 2 +-
 ...ise_batched_gemm_multiple_d_softmax_gemm_xdl_cshuffle_v1.hpp | 2 +-
 .../grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp | 2 +-
 .../gpu/grid/gridwise_batchnorm_backward_blockwise_welford.hpp  | 2 +-
 .../gpu/grid/gridwise_batchnorm_forward_blockwise_welford.hpp   | 2 +-
 .../ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp    | 2 +-
 .../ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp    | 2 +-
 .../grid/gridwise_elementwise_layernorm_welford_variance.hpp    | 2 +-
 .../gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp  | 2 +-
 .../tensor_operation/gpu/grid/gridwise_gemm_dl_multiple_d.hpp   | 2 +-
 .../grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp   | 2 +-
 .../gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp         | 2 +-
 .../gpu/grid/gridwise_gemm_pipeline_selector.hpp                | 2 +-
 .../ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp  | 2 +-
 .../gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp           | 2 +-
 .../gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle.hpp  | 2 +-
 .../ck/tensor_operation/gpu/grid/gridwise_gemm_waveletmodel.hpp | 2 +-
 include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp     | 2 +-
 .../tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp | 2 +-
 .../gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp        | 2 +-
 .../gpu/grid/gridwise_gemm_xdl_waveletmodel_cshuffle.hpp        | 2 +-
 .../gpu/grid/gridwise_gemm_xdlops_skip_b_lds_v1.hpp             | 2 +-
 .../ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp  | 2 +-
 .../tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp   | 2 +-
 .../ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp  | 2 +-
 .../ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp  | 2 +-
 .../ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp  | 2 +-
 include/ck/tensor_operation/gpu/grid/gridwise_permute.hpp       | 2 +-
 .../ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp  | 2 +-
 .../gpu/grid/gridwise_set_multiple_buffer_value.hpp             | 2 +-
 include/ck/tensor_operation/gpu/grid/gridwise_softmax.hpp       | 2 +-
 .../gpu/grid/gridwise_sparse_embeddings_forward_layernorm.hpp   | 2 +-
 .../normalization/gridwise_normalization_naive_variance.hpp     | 2 +-
 .../gpu/grid/normalization/gridwise_normalization_selector.hpp  | 2 +-
 .../grid/normalization/gridwise_normalization_splitk_1st.hpp    | 2 +-
 .../grid/normalization/gridwise_normalization_splitk_2nd.hpp    | 2 +-
 .../normalization/gridwise_normalization_welford_variance.hpp   | 2 +-
 .../gpu/thread/reduction_functions_threadwise.hpp               | 2 +-
 .../tensor_operation/gpu/thread/threadwise_contraction_dl.hpp   | 2 +-
 .../ck/tensor_operation/gpu/thread/threadwise_gemm_dlops_v3.hpp | 2 +-
 .../tensor_operation/gpu/thread/threadwise_tensor_slice_set.hpp | 2 +-
 .../gpu/thread/threadwise_tensor_slice_transfer.hpp             | 2 +-
 .../gpu/thread/threadwise_tensor_slice_transfer_v4r1.hpp        | 2 +-
 .../gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp        | 2 +-
 .../gpu/thread/threadwise_tensor_slice_transfer_v6r1.hpp        | 2 +-
 .../gpu/thread/threadwise_tensor_slice_transfer_v6r2.hpp        | 2 +-
 .../gpu/thread/threadwise_tensor_slice_transfer_v6r3.hpp        | 2 +-
 .../gpu/thread/threadwise_tensor_slice_transfer_v7.hpp          | 2 +-
 include/ck/tensor_operation/gpu/thread/threadwise_welford.hpp   | 2 +-
 include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp              | 2 +-
 include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp            | 2 +-
 .../operator_transform/transform_contraction_to_gemm.hpp        | 2 +-
 .../operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp   | 2 +-
 .../operator_transform/transform_conv_fwd_to_gemm.hpp           | 2 +-
 include/ck/utility/amd_address_space.hpp                        | 2 +-
 include/ck/utility/amd_buffer_addressing.hpp                    | 2 +-
 include/ck/utility/amd_inline_asm.hpp                           | 2 +-
 include/ck/utility/amd_wmma.hpp                                 | 2 +-
 include/ck/utility/amd_xdlops.hpp                               | 2 +-
 include/ck/utility/array.hpp                                    | 2 +-
 include/ck/utility/array_multi_index.hpp                        | 2 +-
 include/ck/utility/c_style_pointer_cast.hpp                     | 2 +-
 include/ck/utility/common_header.hpp                            | 2 +-
 include/ck/utility/container_element_picker.hpp                 | 2 +-
 include/ck/utility/container_helper.hpp                         | 2 +-
 include/ck/utility/debug.hpp                                    | 2 +-
 include/ck/utility/dynamic_buffer.hpp                           | 2 +-
 include/ck/utility/enable_if.hpp                                | 2 +-
 include/ck/utility/functional.hpp                               | 2 +-
 include/ck/utility/functional2.hpp                              | 2 +-
 include/ck/utility/functional3.hpp                              | 2 +-
 include/ck/utility/functional4.hpp                              | 2 +-
 include/ck/utility/generic_memory_space_atomic.hpp              | 2 +-
 include/ck/utility/get_id.hpp                                   | 2 +-
 include/ck/utility/ignore.hpp                                   | 2 +-
 include/ck/utility/inner_product.hpp                            | 2 +-
 include/ck/utility/integral_constant.hpp                        | 2 +-
 include/ck/utility/is_known_at_compile_time.hpp                 | 2 +-
 include/ck/utility/magic_division.hpp                           | 2 +-
 include/ck/utility/math.hpp                                     | 2 +-
 include/ck/utility/math_v2.hpp                                  | 2 +-
 include/ck/utility/multi_index.hpp                              | 2 +-
 include/ck/utility/number.hpp                                   | 2 +-
 include/ck/utility/reduction_common.hpp                         | 2 +-
 include/ck/utility/reduction_enums.hpp                          | 2 +-
 include/ck/utility/reduction_functions_accumulate.hpp           | 2 +-
 include/ck/utility/reduction_operator.hpp                       | 2 +-
 include/ck/utility/sequence.hpp                                 | 2 +-
 include/ck/utility/sequence_helper.hpp                          | 2 +-
 include/ck/utility/span.hpp                                     | 2 +-
 include/ck/utility/static_buffer.hpp                            | 2 +-
 include/ck/utility/statically_indexed_array.hpp                 | 2 +-
 include/ck/utility/statically_indexed_array_multi_index.hpp     | 2 +-
 include/ck/utility/synchronization.hpp                          | 2 +-
 include/ck/utility/thread_group.hpp                             | 2 +-
 include/ck/utility/transpose_vectors.hpp                        | 2 +-
 include/ck/utility/tuple.hpp                                    | 2 +-
 include/ck/utility/tuple_helper.hpp                             | 2 +-
 include/ck/utility/type.hpp                                     | 2 +-
 .../reference_tensor_operation/cpu/reference_batched_gemm.hpp   | 2 +-
 .../cpu/reference_batchnorm_backward.hpp                        | 2 +-
 .../cpu/reference_batchnorm_forward.hpp                         | 2 +-
 .../cpu/reference_batchnorm_infer.hpp                           | 2 +-
 .../library/reference_tensor_operation/cpu/reference_cgemm.hpp  | 2 +-
 .../reference_tensor_operation/cpu/reference_conv_bwd_data.hpp  | 2 +-
 .../cpu/reference_conv_bwd_weight.hpp                           | 2 +-
 .../reference_tensor_operation/cpu/reference_conv_fwd.hpp       | 2 +-
 .../cpu/reference_conv_fwd_bias_activation.hpp                  | 2 +-
 .../cpu/reference_conv_fwd_bias_activation_add.hpp              | 2 +-
 .../library/reference_tensor_operation/cpu/reference_gemm.hpp   | 2 +-
 .../reference_tensor_operation/cpu/reference_gemm_layernorm.hpp | 2 +-
 .../reference_tensor_operation/cpu/reference_groupnorm.hpp      | 2 +-
 .../reference_tensor_operation/cpu/reference_layernorm.hpp      | 2 +-
 .../reference_tensor_operation/cpu/reference_pool_fwd.hpp       | 2 +-
 .../library/reference_tensor_operation/cpu/reference_reduce.hpp | 2 +-
 .../reference_tensor_operation/cpu/reference_softmax.hpp        | 2 +-
 .../cpu/reference_sparse_embedding3_forward_layernorm.hpp       | 2 +-
 .../library/reference_tensor_operation/gpu/naive_conv_fwd.hpp   | 2 +-
 .../tensor_operation_instance/add_device_operation_instance.hpp | 2 +-
 .../device_operation_instance_factory.hpp                       | 2 +-
 .../ck/library/tensor_operation_instance/gpu/batched_gemm.hpp   | 2 +-
 .../gpu/batched_gemm_add_relu_gemm_add.hpp                      | 2 +-
 .../tensor_operation_instance/gpu/batched_gemm_bias_permute.hpp | 2 +-
 .../gpu/batched_gemm_bias_softmax_gemm_permute.hpp              | 2 +-
 .../library/tensor_operation_instance/gpu/batched_gemm_gemm.hpp | 2 +-
 .../tensor_operation_instance/gpu/batched_gemm_softmax_gemm.hpp | 2 +-
 .../gpu/batched_gemm_softmax_gemm_permute.hpp                   | 2 +-
 .../tensor_operation_instance/gpu/batchnorm_backward.hpp        | 2 +-
 .../library/tensor_operation_instance/gpu/batchnorm_forward.hpp | 2 +-
 .../library/tensor_operation_instance/gpu/batchnorm_infer.hpp   | 2 +-
 .../tensor_operation_instance/gpu/contraction_bilinear.hpp      | 2 +-
 .../library/tensor_operation_instance/gpu/contraction_scale.hpp | 2 +-
 .../tensor_operation_instance/gpu/convolution_backward_data.hpp | 2 +-
 .../tensor_operation_instance/gpu/convolution_forward.hpp       | 2 +-
 .../gpu/device_elementwise_instance.hpp                         | 2 +-
 .../gpu/device_gemm_mean_squaremean_instance.hpp                | 2 +-
 .../tensor_operation_instance/gpu/elementwise_normalization.hpp | 2 +-
 .../include/ck/library/tensor_operation_instance/gpu/gemm.hpp   | 2 +-
 .../tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp     | 2 +-
 .../library/tensor_operation_instance/gpu/gemm_add_fastgelu.hpp | 2 +-
 .../library/tensor_operation_instance/gpu/gemm_add_multiply.hpp | 2 +-
 .../gpu/gemm_add_relu_add_layernorm.hpp                         | 2 +-
 .../ck/library/tensor_operation_instance/gpu/gemm_bilinear.hpp  | 2 +-
 .../ck/library/tensor_operation_instance/gpu/gemm_fastgelu.hpp  | 2 +-
 .../ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp    | 2 +-
 .../gpu/grouped_convolution_backward_data.hpp                   | 2 +-
 .../gpu/grouped_convolution_backward_weight.hpp                 | 2 +-
 .../gpu/grouped_convolution_forward.hpp                         | 2 +-
 .../ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp   | 2 +-
 .../ck/library/tensor_operation_instance/gpu/normalization.hpp  | 2 +-
 .../tensor_operation_instance/gpu/normalization_swish.hpp       | 2 +-
 .../ck/library/tensor_operation_instance/gpu/pool2d_fwd.hpp     | 2 +-
 .../ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp     | 2 +-
 .../gpu/quantization/gemm_quantization.hpp                      | 2 +-
 ...grouped_convolution_bias_forward_perchannel_quantization.hpp | 2 +-
 .../grouped_convolution_bias_forward_perlayer_quantization.hpp  | 2 +-
 .../grouped_convolution_forward_perchannel_quantization.hpp     | 2 +-
 .../grouped_convolution_forward_perlayer_quantization.hpp       | 2 +-
 .../gpu/reduce/device_reduce_instance.hpp                       | 2 +-
 .../gpu/reduce/device_reduce_instance_blockwise.hpp             | 2 +-
 .../reduce/device_reduce_instance_blockwise_b16_f32_b16_add.hpp | 2 +-
 .../device_reduce_instance_blockwise_b16_f32_b16_amax.hpp       | 2 +-
 .../reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.hpp | 2 +-
 .../reduce/device_reduce_instance_blockwise_b16_f32_b16_max.hpp | 2 +-
 .../reduce/device_reduce_instance_blockwise_b16_f32_b16_min.hpp | 2 +-
 .../device_reduce_instance_blockwise_b16_f32_b16_norm2.hpp      | 2 +-
 .../device_reduce_instance_blockwise_f16_f16_f16_amax.hpp       | 2 +-
 .../reduce/device_reduce_instance_blockwise_f16_f16_f16_max.hpp | 2 +-
 .../reduce/device_reduce_instance_blockwise_f16_f16_f16_min.hpp | 2 +-
 .../reduce/device_reduce_instance_blockwise_f16_f32_f16_add.hpp | 2 +-
 .../reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.hpp | 2 +-
 .../device_reduce_instance_blockwise_f16_f32_f16_norm2.hpp      | 2 +-
 .../reduce/device_reduce_instance_blockwise_f32_f32_f32_add.hpp | 2 +-
 .../device_reduce_instance_blockwise_f32_f32_f32_amax.hpp       | 2 +-
 .../reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.hpp | 2 +-
 .../reduce/device_reduce_instance_blockwise_f32_f32_f32_max.hpp | 2 +-
 .../reduce/device_reduce_instance_blockwise_f32_f32_f32_min.hpp | 2 +-
 .../device_reduce_instance_blockwise_f32_f32_f32_norm2.hpp      | 2 +-
 .../reduce/device_reduce_instance_blockwise_f32_f64_f32_add.hpp | 2 +-
 .../reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.hpp | 2 +-
 .../device_reduce_instance_blockwise_f32_f64_f32_norm2.hpp      | 2 +-
 .../reduce/device_reduce_instance_blockwise_f64_f64_f64_add.hpp | 2 +-
 .../device_reduce_instance_blockwise_f64_f64_f64_amax.hpp       | 2 +-
 .../reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.hpp | 2 +-
 .../reduce/device_reduce_instance_blockwise_f64_f64_f64_max.hpp | 2 +-
 .../reduce/device_reduce_instance_blockwise_f64_f64_f64_min.hpp | 2 +-
 .../device_reduce_instance_blockwise_f64_f64_f64_norm2.hpp      | 2 +-
 .../reduce/device_reduce_instance_blockwise_i8_i32_i8_add.hpp   | 2 +-
 .../reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.hpp   | 2 +-
 .../reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.hpp   | 2 +-
 .../reduce/device_reduce_instance_blockwise_i8_i8_i8_max.hpp    | 2 +-
 .../reduce/device_reduce_instance_blockwise_i8_i8_i8_min.hpp    | 2 +-
 .../gpu/reduce/device_reduce_instance_impl_common.hpp           | 2 +-
 .../gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp | 2 +-
 ...ce_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.hpp | 2 +-
 ...ce_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.hpp | 2 +-
 ...ce_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.hpp | 2 +-
 ...ce_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.hpp | 2 +-
 ...ce_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.hpp | 2 +-
 ...ce_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.hpp | 2 +-
 ...ce_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.hpp | 2 +-
 ...ce_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.hpp | 2 +-
 ...ce_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.hpp | 2 +-
 ...ce_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.hpp | 2 +-
 .../gpu/reduce/device_reduce_instance_threadwise.hpp            | 2 +-
 .../device_reduce_instance_threadwise_b16_f32_b16_add.hpp       | 2 +-
 .../device_reduce_instance_threadwise_b16_f32_b16_amax.hpp      | 2 +-
 .../device_reduce_instance_threadwise_b16_f32_b16_avg.hpp       | 2 +-
 .../device_reduce_instance_threadwise_b16_f32_b16_max.hpp       | 2 +-
 .../device_reduce_instance_threadwise_b16_f32_b16_min.hpp       | 2 +-
 .../device_reduce_instance_threadwise_b16_f32_b16_norm2.hpp     | 2 +-
 .../device_reduce_instance_threadwise_f16_f16_f16_amax.hpp      | 2 +-
 .../device_reduce_instance_threadwise_f16_f16_f16_max.hpp       | 2 +-
 .../device_reduce_instance_threadwise_f16_f16_f16_min.hpp       | 2 +-
 .../device_reduce_instance_threadwise_f16_f32_f16_add.hpp       | 2 +-
 .../device_reduce_instance_threadwise_f16_f32_f16_avg.hpp       | 2 +-
 .../device_reduce_instance_threadwise_f16_f32_f16_norm2.hpp     | 2 +-
 .../device_reduce_instance_threadwise_f32_f32_f32_add.hpp       | 2 +-
 .../device_reduce_instance_threadwise_f32_f32_f32_amax.hpp      | 2 +-
 .../device_reduce_instance_threadwise_f32_f32_f32_avg.hpp       | 2 +-
 .../device_reduce_instance_threadwise_f32_f32_f32_max.hpp       | 2 +-
 .../device_reduce_instance_threadwise_f32_f32_f32_min.hpp       | 2 +-
 .../device_reduce_instance_threadwise_f32_f32_f32_norm2.hpp     | 2 +-
 .../device_reduce_instance_threadwise_f32_f64_f32_add.hpp       | 2 +-
 .../device_reduce_instance_threadwise_f32_f64_f32_avg.hpp       | 2 +-
 .../device_reduce_instance_threadwise_f32_f64_f32_norm2.hpp     | 2 +-
 .../device_reduce_instance_threadwise_f64_f64_f64_add.hpp       | 2 +-
 .../device_reduce_instance_threadwise_f64_f64_f64_amax.hpp      | 2 +-
 .../device_reduce_instance_threadwise_f64_f64_f64_avg.hpp       | 2 +-
 .../device_reduce_instance_threadwise_f64_f64_f64_max.hpp       | 2 +-
 .../device_reduce_instance_threadwise_f64_f64_f64_min.hpp       | 2 +-
 .../device_reduce_instance_threadwise_f64_f64_f64_norm2.hpp     | 2 +-
 .../reduce/device_reduce_instance_threadwise_i8_i32_i8_add.hpp  | 2 +-
 .../reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.hpp  | 2 +-
 .../reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.hpp  | 2 +-
 .../reduce/device_reduce_instance_threadwise_i8_i8_i8_max.hpp   | 2 +-
 .../reduce/device_reduce_instance_threadwise_i8_i8_i8_min.hpp   | 2 +-
 .../ck/library/tensor_operation_instance/gpu/reduce/reduce.hpp  | 2 +-
 .../ck/library/tensor_operation_instance/gpu/softmax.hpp        | 2 +-
 .../gpu/softmax/device_softmax_f16_f16_instance.hpp             | 2 +-
 .../softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp   | 2 +-
 .../softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp   | 2 +-
 .../softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp   | 2 +-
 .../softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp   | 2 +-
 .../softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp   | 2 +-
 .../softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp   | 2 +-
 .../softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp   | 2 +-
 .../gpu/softmax/device_softmax_f16_f16_instance_type.hpp        | 2 +-
 .../gpu/softmax/device_softmax_f32_f32_instance.hpp             | 2 +-
 .../softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp   | 2 +-
 .../softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp   | 2 +-
 .../softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp   | 2 +-
 .../softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp   | 2 +-
 .../softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp   | 2 +-
 .../softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp   | 2 +-
 .../softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp   | 2 +-
 .../gpu/softmax/device_softmax_f32_f32_instance_type.hpp        | 2 +-
 .../gpu/softmax/device_softmax_i8_i8_instance.hpp               | 2 +-
 .../gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp | 2 +-
 .../gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp | 2 +-
 .../gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp | 2 +-
 .../gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp | 2 +-
 .../gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp | 2 +-
 .../gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp | 2 +-
 .../gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp | 2 +-
 .../gpu/softmax/device_softmax_i8_i8_instance_type.hpp          | 2 +-
 .../gpu/softmax/device_softmax_instance.hpp                     | 2 +-
 library/include/ck/library/utility/algorithm.hpp                | 2 +-
 library/include/ck/library/utility/check_err.hpp                | 2 +-
 library/include/ck/library/utility/conv_common.hpp              | 2 +-
 .../utility/convolution_host_tensor_descriptor_helper.hpp       | 2 +-
 library/include/ck/library/utility/convolution_parameter.hpp    | 2 +-
 library/include/ck/library/utility/device_memory.hpp            | 2 +-
 library/include/ck/library/utility/fill.hpp                     | 2 +-
 library/include/ck/library/utility/host_common_util.hpp         | 2 +-
 library/include/ck/library/utility/host_gemm.hpp                | 2 +-
 library/include/ck/library/utility/host_tensor.hpp              | 2 +-
 library/include/ck/library/utility/host_tensor_generator.hpp    | 2 +-
 library/include/ck/library/utility/iterator.hpp                 | 2 +-
 library/include/ck/library/utility/literals.hpp                 | 2 +-
 library/include/ck/library/utility/numeric.hpp                  | 2 +-
 library/include/ck/library/utility/ranges.hpp                   | 2 +-
 ...ice_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instance.cpp | 2 +-
 ...ice_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instance.cpp | 2 +-
 ...ice_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instance.cpp | 2 +-
 ...ice_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instance.cpp | 2 +-
 ...device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp | 2 +-
 ...device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp | 2 +-
 ...device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp | 2 +-
 ...device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp | 2 +-
 ...device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instance.cpp | 2 +-
 ...device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instance.cpp | 2 +-
 ...device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instance.cpp | 2 +-
 ...device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instance.cpp | 2 +-
 ...ice_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instance.cpp | 2 +-
 ...ice_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instance.cpp | 2 +-
 ...ice_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instance.cpp | 2 +-
 ...ice_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instance.cpp | 2 +-
 ...dd_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp | 2 +-
 ...dd_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp | 2 +-
 ..._permute_m2_n3_k1_xdl_c_shuffle_f16_f16_f16_f16_instance.cpp | 2 +-
 ...mm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp | 2 +-
 ...mm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp | 2 +-
 ...ce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instance.cpp | 2 +-
 ...ce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp | 2 +-
 ...ce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instance.cpp | 2 +-
 ...ce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instance.cpp | 2 +-
 ...mm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp | 2 +-
 ...dl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp | 2 +-
 ...te_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp | 2 +-
 ...dl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp | 2 +-
 ...te_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp | 2 +-
 .../gpu/batchnorm/device_batchnorm_backward_bf16_instance.cpp   | 2 +-
 .../gpu/batchnorm/device_batchnorm_backward_f16_instance.cpp    | 2 +-
 .../gpu/batchnorm/device_batchnorm_backward_f32_instance.cpp    | 2 +-
 .../gpu/batchnorm/device_batchnorm_backward_f64_instance.cpp    | 2 +-
 .../gpu/batchnorm/device_batchnorm_forward_bf16_instance.cpp    | 2 +-
 .../gpu/batchnorm/device_batchnorm_forward_f16_instance.cpp     | 2 +-
 .../gpu/batchnorm/device_batchnorm_forward_f32_instance.cpp     | 2 +-
 .../gpu/batchnorm/device_batchnorm_forward_f64_instance.cpp     | 2 +-
 .../gpu/batchnorm/device_batchnorm_infer_bf16_instance.cpp      | 2 +-
 .../gpu/batchnorm/device_batchnorm_infer_f16_instance.cpp       | 2 +-
 .../gpu/batchnorm/device_batchnorm_infer_f32_instance.cpp       | 2 +-
 .../gpu/batchnorm/device_batchnorm_infer_f64_instance.cpp       | 2 +-
 ...ear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp | 2 +-
 ...ear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp | 2 +-
 ...ear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp | 2 +-
 ...ear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp | 2 +-
 ...ear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp | 2 +-
 ...ear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp | 2 +-
 ...ear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp | 2 +-
 ...ear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp | 2 +-
 ...on_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp | 2 +-
 ...on_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp | 2 +-
 ...on_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp | 2 +-
 ...on_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp | 2 +-
 ...on_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp | 2 +-
 ...on_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp | 2 +-
 ...on_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp | 2 +-
 ...on_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp | 2 +-
 .../device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp    | 2 +-
 .../device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp     | 2 +-
 .../device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp     | 2 +-
 .../device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp    | 2 +-
 .../device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instance.cpp   | 2 +-
 .../device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instance.cpp   | 2 +-
 .../device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instance.cpp  | 2 +-
 .../device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp | 2 +-
 .../device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp  | 2 +-
 .../device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp  | 2 +-
 .../device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp | 2 +-
 ...ice_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp | 2 +-
 .../device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp      | 2 +-
 .../device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp       | 2 +-
 .../device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp       | 2 +-
 .../device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp      | 2 +-
 ..._fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp | 2 +-
 ..._xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp | 2 +-
 ...vice_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp | 2 +-
 ...evice_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp | 2 +-
 ...evice_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp | 2 +-
 ...vice_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp | 2 +-
 .../gpu/elementwise/device_normalize_instance.cpp               | 2 +-
 .../device_elementwise_normalization_f16_instance.cpp           | 2 +-
 .../gpu/gemm/device_gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp   | 2 +-
 .../gpu/gemm/device_gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp   | 2 +-
 .../gpu/gemm/device_gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp   | 2 +-
 .../gpu/gemm/device_gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp   | 2 +-
 .../gpu/gemm/device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp   | 2 +-
 .../gpu/gemm/device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp   | 2 +-
 .../gpu/gemm/device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp   | 2 +-
 .../gpu/gemm/device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp   | 2 +-
 .../gpu/gemm/device_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp      | 2 +-
 .../gpu/gemm/device_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp      | 2 +-
 .../gpu/gemm/device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp      | 2 +-
 .../gpu/gemm/device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp      | 2 +-
 ...gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp | 2 +-
 ...vice_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instance.cpp | 2 +-
 ...vice_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instance.cpp | 2 +-
 ...vice_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instance.cpp | 2 +-
 ...vice_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp | 2 +-
 .../device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp | 2 +-
 .../device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp | 2 +-
 .../device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp | 2 +-
 .../device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp | 2 +-
 .../device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instance.cpp | 2 +-
 .../device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instance.cpp | 2 +-
 .../device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instance.cpp | 2 +-
 .../device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instance.cpp | 2 +-
 .../device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp    | 2 +-
 .../device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp    | 2 +-
 .../device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp    | 2 +-
 .../device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp    | 2 +-
 .../gpu/gemm/device_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp  | 2 +-
 .../gpu/gemm/device_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp  | 2 +-
 .../gpu/gemm/device_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp  | 2 +-
 .../gpu/gemm/device_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp  | 2 +-
 .../gpu/gemm/device_gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp  | 2 +-
 .../gpu/gemm/device_gemm_xdl_f32_f32_f32_km_nk_mn_instance.cpp  | 2 +-
 .../gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp  | 2 +-
 .../gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp  | 2 +-
 .../gpu/gemm/device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp  | 2 +-
 .../gpu/gemm/device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp  | 2 +-
 .../gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp  | 2 +-
 .../gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp  | 2 +-
 ...dl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp | 2 +-
 ...dl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp | 2 +-
 ...dl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp | 2 +-
 ...dl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp | 2 +-
 ...tgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp | 2 +-
 ...tgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp | 2 +-
 ...tgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp | 2 +-
 ...tgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp | 2 +-
 ...dl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp | 2 +-
 ...dl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp | 2 +-
 ...dl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp | 2 +-
 ...dl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp | 2 +-
 ..._add_xdl_c_shuffle_layernorm_f16_km_kn_mn_mn_mn_instance.cpp | 2 +-
 ..._add_xdl_c_shuffle_layernorm_f16_km_nk_mn_mn_mn_instance.cpp | 2 +-
 ..._add_xdl_c_shuffle_layernorm_f16_mk_kn_mn_mn_mn_instance.cpp | 2 +-
 ..._add_xdl_c_shuffle_layernorm_f16_mk_nk_mn_mn_mn_instance.cpp | 2 +-
 ...emean_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp | 2 +-
 ...emean_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp | 2 +-
 ...emean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp | 2 +-
 ...emean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp | 2 +-
 ...inear_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp | 2 +-
 ...inear_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp | 2 +-
 ...inear_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp | 2 +-
 ...inear_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp | 2 +-
 ...emm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp | 2 +-
 ...emm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp | 2 +-
 ...emm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp | 2 +-
 ...emm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp | 2 +-
 ...educe_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp | 2 +-
 ...educe_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp | 2 +-
 ...educe_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp | 2 +-
 ...educe_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp | 2 +-
 .../device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instance.cpp    | 2 +-
 .../device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp    | 2 +-
 .../device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp    | 2 +-
 .../device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp    | 2 +-
 .../device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instance.cpp    | 2 +-
 .../device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instance.cpp    | 2 +-
 .../device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instance.cpp    | 2 +-
 .../device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instance.cpp    | 2 +-
 ...ouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp | 2 +-
 ...rouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_instance.cpp | 2 +-
 ...rouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_instance.cpp | 2 +-
 ...vice_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp | 2 +-
 ...evice_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f16_instance.cpp | 2 +-
 ...evice_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f32_instance.cpp | 2 +-
 ...vice_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_int8_instance.cpp | 2 +-
 ...ouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp | 2 +-
 ...ed_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp | 2 +-
 ...ped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp | 2 +-
 ...ped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp | 2 +-
 .../gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_common.hpp | 2 +-
 ...ice_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp | 2 +-
 ...ice_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instance.cpp | 2 +-
 .../device_grouped_conv2d_fwd_dl_instance.hpp                   | 2 +-
 ...e_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp | 2 +-
 ...ce_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp | 2 +-
 ...ce_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp | 2 +-
 .../device_grouped_conv2d_fwd_xdl_instance.hpp                  | 2 +-
 ...e_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp | 2 +-
 ...ce_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp | 2 +-
 ...ce_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp | 2 +-
 ...conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp | 2 +-
 ..._conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp | 2 +-
 ..._conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp | 2 +-
 ...rouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp | 2 +-
 ...grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp | 2 +-
 ...grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp | 2 +-
 ...rouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instance.cpp | 2 +-
 ...rouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_bf16_instance.cpp | 2 +-
 ...grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f16_instance.cpp | 2 +-
 ...grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f32_instance.cpp | 2 +-
 ...rouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_int8_instance.cpp | 2 +-
 .../device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp   | 2 +-
 .../device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp   | 2 +-
 .../device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp   | 2 +-
 .../device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp   | 2 +-
 ...ce_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp | 2 +-
 ..._gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp | 2 +-
 ...ce_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp | 2 +-
 ..._gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp | 2 +-
 ..._grouped_gemm_fastgelu_xdl_f16_f16_f16_km_kn_mn_instance.cpp | 2 +-
 ..._grouped_gemm_fastgelu_xdl_f16_f16_f16_km_nk_mn_instance.cpp | 2 +-
 ..._grouped_gemm_fastgelu_xdl_f16_f16_f16_mk_kn_mn_instance.cpp | 2 +-
 ..._grouped_gemm_fastgelu_xdl_f16_f16_f16_mk_nk_mn_instance.cpp | 2 +-
 .../gpu/normalization/device_groupnorm_f16_instance.cpp         | 2 +-
 .../gpu/normalization/device_groupnorm_f32_instance.cpp         | 2 +-
 .../device_groupnorm_swish_f16_f32_f32_f16_instance.cpp         | 2 +-
 .../gpu/normalization/device_groupnorm_swish_f16_instance.cpp   | 2 +-
 .../gpu/normalization/device_groupnorm_swish_f32_instance.cpp   | 2 +-
 .../gpu/normalization/device_layernorm2d_f16_instance.cpp       | 2 +-
 .../gpu/normalization/device_layernorm2d_f32_instance.cpp       | 2 +-
 .../gpu/normalization/device_layernorm4d_f16_instance.cpp       | 2 +-
 .../gpu/normalization/device_layernorm4d_f32_instance.cpp       | 2 +-
 .../gpu/normalization/normalization_instance_common.hpp         | 2 +-
 .../gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f16_instance.cpp    | 2 +-
 .../gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f32_instance.cpp    | 2 +-
 .../gpu/pool_fwd/device_avg_pool3d_fwd_ndhwc_f16_instance.cpp   | 2 +-
 .../gpu/pool_fwd/device_avg_pool3d_fwd_ndhwc_f32_instance.cpp   | 2 +-
 .../gpu/pool_fwd/device_max_pool2d_fwd_nhwc_f16_instance.cpp    | 2 +-
 .../gpu/pool_fwd/device_max_pool2d_fwd_nhwc_f32_instance.cpp    | 2 +-
 .../gpu/pool_fwd/device_max_pool3d_fwd_ndhwc_f16_instance.cpp   | 2 +-
 .../gpu/pool_fwd/device_max_pool3d_fwd_ndhwc_f32_instance.cpp   | 2 +-
 .../gpu/pool_fwd/pool_fwd_instance_common.hpp                   | 2 +-
 .../gpu/quantization/conv2d_fwd/conv2d_quantization_common.hpp  | 2 +-
 ...ice_conv2d_dl_bias_perchannel_quantization_int8_instance.cpp | 2 +-
 ...evice_conv2d_dl_bias_perlayer_quantization_int8_instance.cpp | 2 +-
 .../quantization/conv2d_fwd/device_conv2d_dl_int8_instance.hpp  | 2 +-
 .../device_conv2d_dl_perchannel_quantization_int8_instance.cpp  | 2 +-
 .../device_conv2d_dl_perlayer_quantization_int8_instance.cpp    | 2 +-
 ...ce_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp | 2 +-
 ...vice_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp | 2 +-
 .../quantization/conv2d_fwd/device_conv2d_xdl_int8_instance.hpp | 2 +-
 .../device_conv2d_xdl_perchannel_quantization_int8_instance.cpp | 2 +-
 .../device_conv2d_xdl_perlayer_quantization_int8_instance.cpp   | 2 +-
 .../device_gemm_quantization_dl_c_shuffle_i8_i8_i8_instance.hpp | 2 +-
 ...emm_quantization_dl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp | 2 +-
 ...emm_quantization_dl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp | 2 +-
 ...emm_quantization_dl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp | 2 +-
 ...emm_quantization_dl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp | 2 +-
 ...device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_instance.hpp | 2 +-
 ...mm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp | 2 +-
 ...mm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp | 2 +-
 ...mm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp | 2 +-
 ...mm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp | 2 +-
 .../gpu/quantization/gemm/gemm_quantization_common.hpp          | 2 +-
 .../reduce/device_reduce_instance_blockwise_b16_f32_b16_add.cpp | 2 +-
 .../device_reduce_instance_blockwise_b16_f32_b16_amax.cpp       | 2 +-
 .../reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.cpp | 2 +-
 .../reduce/device_reduce_instance_blockwise_b16_f32_b16_max.cpp | 2 +-
 .../reduce/device_reduce_instance_blockwise_b16_f32_b16_min.cpp | 2 +-
 .../device_reduce_instance_blockwise_b16_f32_b16_norm2.cpp      | 2 +-
 .../device_reduce_instance_blockwise_f16_f16_f16_amax.cpp       | 2 +-
 .../reduce/device_reduce_instance_blockwise_f16_f16_f16_max.cpp | 2 +-
 .../reduce/device_reduce_instance_blockwise_f16_f16_f16_min.cpp | 2 +-
 .../reduce/device_reduce_instance_blockwise_f16_f32_f16_add.cpp | 2 +-
 .../reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.cpp | 2 +-
 .../device_reduce_instance_blockwise_f16_f32_f16_norm2.cpp      | 2 +-
 .../reduce/device_reduce_instance_blockwise_f32_f32_f32_add.cpp | 2 +-
 .../device_reduce_instance_blockwise_f32_f32_f32_amax.cpp       | 2 +-
 .../reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.cpp | 2 +-
 .../reduce/device_reduce_instance_blockwise_f32_f32_f32_max.cpp | 2 +-
 .../reduce/device_reduce_instance_blockwise_f32_f32_f32_min.cpp | 2 +-
 .../device_reduce_instance_blockwise_f32_f32_f32_norm2.cpp      | 2 +-
 .../reduce/device_reduce_instance_blockwise_f32_f64_f32_add.cpp | 2 +-
 .../reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.cpp | 2 +-
 .../device_reduce_instance_blockwise_f32_f64_f32_norm2.cpp      | 2 +-
 .../reduce/device_reduce_instance_blockwise_f64_f64_f64_add.cpp | 2 +-
 .../device_reduce_instance_blockwise_f64_f64_f64_amax.cpp       | 2 +-
 .../reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.cpp | 2 +-
 .../reduce/device_reduce_instance_blockwise_f64_f64_f64_max.cpp | 2 +-
 .../reduce/device_reduce_instance_blockwise_f64_f64_f64_min.cpp | 2 +-
 .../device_reduce_instance_blockwise_f64_f64_f64_norm2.cpp      | 2 +-
 .../reduce/device_reduce_instance_blockwise_i8_i32_i8_add.cpp   | 2 +-
 .../reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.cpp   | 2 +-
 .../reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.cpp   | 2 +-
 .../reduce/device_reduce_instance_blockwise_i8_i8_i8_max.cpp    | 2 +-
 .../reduce/device_reduce_instance_blockwise_i8_i8_i8_min.cpp    | 2 +-
 ...ce_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.cpp | 2 +-
 ...ce_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.cpp | 2 +-
 ...ce_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.cpp | 2 +-
 ...ce_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.cpp | 2 +-
 ...ce_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.cpp | 2 +-
 ...ce_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.cpp | 2 +-
 ...ce_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.cpp | 2 +-
 ...ce_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.cpp | 2 +-
 ...ce_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.cpp | 2 +-
 ...ce_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.cpp | 2 +-
 .../device_reduce_instance_threadwise_b16_f32_b16_add.cpp       | 2 +-
 .../device_reduce_instance_threadwise_b16_f32_b16_amax.cpp      | 2 +-
 .../device_reduce_instance_threadwise_b16_f32_b16_avg.cpp       | 2 +-
 .../device_reduce_instance_threadwise_b16_f32_b16_max.cpp       | 2 +-
 .../device_reduce_instance_threadwise_b16_f32_b16_min.cpp       | 2 +-
 .../device_reduce_instance_threadwise_b16_f32_b16_norm2.cpp     | 2 +-
 .../device_reduce_instance_threadwise_f16_f16_f16_amax.cpp      | 2 +-
 .../device_reduce_instance_threadwise_f16_f16_f16_max.cpp       | 2 +-
 .../device_reduce_instance_threadwise_f16_f16_f16_min.cpp       | 2 +-
 .../device_reduce_instance_threadwise_f16_f32_f16_add.cpp       | 2 +-
 .../device_reduce_instance_threadwise_f16_f32_f16_avg.cpp       | 2 +-
 .../device_reduce_instance_threadwise_f16_f32_f16_norm2.cpp     | 2 +-
 .../device_reduce_instance_threadwise_f32_f32_f32_add.cpp       | 2 +-
 .../device_reduce_instance_threadwise_f32_f32_f32_amax.cpp      | 2 +-
 .../device_reduce_instance_threadwise_f32_f32_f32_avg.cpp       | 2 +-
 .../device_reduce_instance_threadwise_f32_f32_f32_max.cpp       | 2 +-
 .../device_reduce_instance_threadwise_f32_f32_f32_min.cpp       | 2 +-
 .../device_reduce_instance_threadwise_f32_f32_f32_norm2.cpp     | 2 +-
 .../device_reduce_instance_threadwise_f32_f64_f32_add.cpp       | 2 +-
 .../device_reduce_instance_threadwise_f32_f64_f32_avg.cpp       | 2 +-
 .../device_reduce_instance_threadwise_f32_f64_f32_norm2.cpp     | 2 +-
 .../device_reduce_instance_threadwise_f64_f64_f64_add.cpp       | 2 +-
 .../device_reduce_instance_threadwise_f64_f64_f64_amax.cpp      | 2 +-
 .../device_reduce_instance_threadwise_f64_f64_f64_avg.cpp       | 2 +-
 .../device_reduce_instance_threadwise_f64_f64_f64_max.cpp       | 2 +-
 .../device_reduce_instance_threadwise_f64_f64_f64_min.cpp       | 2 +-
 .../device_reduce_instance_threadwise_f64_f64_f64_norm2.cpp     | 2 +-
 .../reduce/device_reduce_instance_threadwise_i8_i32_i8_add.cpp  | 2 +-
 .../reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.cpp  | 2 +-
 .../reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.cpp  | 2 +-
 .../reduce/device_reduce_instance_threadwise_i8_i8_i8_max.cpp   | 2 +-
 .../reduce/device_reduce_instance_threadwise_i8_i8_i8_min.cpp   | 2 +-
 .../gpu/softmax/device_softmax_f16_f16_instance.cpp             | 2 +-
 .../softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp   | 2 +-
 .../softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp   | 2 +-
 .../softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp   | 2 +-
 .../softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp   | 2 +-
 .../softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp   | 2 +-
 .../softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp   | 2 +-
 .../softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp   | 2 +-
 .../gpu/softmax/device_softmax_f32_f32_instance.cpp             | 2 +-
 .../softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp   | 2 +-
 .../softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp   | 2 +-
 .../softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp   | 2 +-
 .../softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp   | 2 +-
 .../softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp   | 2 +-
 .../softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp   | 2 +-
 .../softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp   | 2 +-
 .../gpu/softmax/device_softmax_i8_i8_instance.cpp               | 2 +-
 .../gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp | 2 +-
 .../gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp | 2 +-
 .../gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp | 2 +-
 .../gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp | 2 +-
 .../gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp | 2 +-
 .../gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp | 2 +-
 .../gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp | 2 +-
 library/src/utility/convolution_parameter.cpp                   | 2 +-
 library/src/utility/device_memory.cpp                           | 2 +-
 library/src/utility/host_tensor.cpp                             | 2 +-
 profiler/include/profiler/data_type_enum.hpp                    | 2 +-
 .../profiler/profile_batched_gemm_add_relu_gemm_add_impl.hpp    | 2 +-
 .../profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp     | 2 +-
 profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp    | 2 +-
 profiler/include/profiler/profile_batched_gemm_impl.hpp         | 2 +-
 profiler/include/profiler/profile_batched_gemm_reduce_impl.hpp  | 2 +-
 .../include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp | 2 +-
 .../profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp | 2 +-
 profiler/include/profiler/profile_batchnorm_backward_impl.hpp   | 2 +-
 profiler/include/profiler/profile_batchnorm_forward_impl.hpp    | 2 +-
 profiler/include/profiler/profile_batchnorm_infer_impl.hpp      | 2 +-
 profiler/include/profiler/profile_conv_bwd_data_impl.hpp        | 2 +-
 .../include/profiler/profile_conv_fwd_bias_relu_add_impl.hpp    | 2 +-
 profiler/include/profiler/profile_conv_fwd_bias_relu_impl.hpp   | 2 +-
 profiler/include/profiler/profile_conv_fwd_impl.hpp             | 2 +-
 .../include/profiler/profile_elementwise_layernorm_impl.hpp     | 2 +-
 .../include/profiler/profile_gemm_add_add_fastgelu_impl.hpp     | 2 +-
 profiler/include/profiler/profile_gemm_add_fastgelu_impl.hpp    | 2 +-
 profiler/include/profiler/profile_gemm_add_multiply_impl.hpp    | 2 +-
 .../profiler/profile_gemm_add_relu_add_layernorm_impl.hpp       | 2 +-
 profiler/include/profiler/profile_gemm_bias_add_reduce_impl.hpp | 2 +-
 profiler/include/profiler/profile_gemm_bilinear_impl.hpp        | 2 +-
 profiler/include/profiler/profile_gemm_fastgelu_impl.hpp        | 2 +-
 profiler/include/profiler/profile_gemm_impl.hpp                 | 2 +-
 profiler/include/profiler/profile_gemm_reduce_impl.hpp          | 2 +-
 profiler/include/profiler/profile_gemm_splitk_impl.hpp          | 2 +-
 .../include/profiler/profile_grouped_conv_bwd_weight_impl.hpp   | 2 +-
 profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp     | 2 +-
 .../include/profiler/profile_grouped_gemm_fastgelu_impl.hpp     | 2 +-
 profiler/include/profiler/profile_grouped_gemm_impl.hpp         | 2 +-
 profiler/include/profiler/profile_groupnorm_impl.hpp            | 2 +-
 profiler/include/profiler/profile_layernorm_impl.hpp            | 2 +-
 profiler/include/profiler/profile_pool2d_fwd_impl.hpp           | 2 +-
 profiler/include/profiler/profile_pool3d_fwd_impl.hpp           | 2 +-
 profiler/include/profiler/profile_reduce_impl.hpp               | 2 +-
 profiler/include/profiler/profile_softmax_impl.hpp              | 2 +-
 profiler/src/profile_avg_pool2d_fwd.cpp                         | 2 +-
 profiler/src/profile_batched_gemm.cpp                           | 2 +-
 profiler/src/profile_batched_gemm_add_relu_gemm_add.cpp         | 2 +-
 profiler/src/profile_batched_gemm_gemm.cpp                      | 2 +-
 profiler/src/profile_batched_gemm_reduce.cpp                    | 2 +-
 profiler/src/profile_batchnorm_bwd.cpp                          | 2 +-
 profiler/src/profile_batchnorm_fwd.cpp                          | 2 +-
 profiler/src/profile_batchnorm_infer.cpp                        | 2 +-
 profiler/src/profile_conv_bwd_data.cpp                          | 2 +-
 profiler/src/profile_conv_fwd.cpp                               | 2 +-
 profiler/src/profile_conv_fwd_bias_relu.cpp                     | 2 +-
 profiler/src/profile_conv_fwd_bias_relu_add.cpp                 | 2 +-
 profiler/src/profile_gemm.cpp                                   | 2 +-
 profiler/src/profile_gemm_add_add_fastgelu.cpp                  | 2 +-
 profiler/src/profile_gemm_add_fastgelu.cpp                      | 2 +-
 profiler/src/profile_gemm_add_multiply.cpp                      | 2 +-
 profiler/src/profile_gemm_add_relu_add_layernorm.cpp            | 2 +-
 profiler/src/profile_gemm_bias_add_reduce.cpp                   | 2 +-
 profiler/src/profile_gemm_bilinear.cpp                          | 2 +-
 profiler/src/profile_gemm_fastgelu.cpp                          | 2 +-
 profiler/src/profile_gemm_reduce.cpp                            | 2 +-
 profiler/src/profile_gemm_splitk.cpp                            | 2 +-
 profiler/src/profile_grouped_conv_bwd_weight.cpp                | 2 +-
 profiler/src/profile_grouped_conv_fwd.cpp                       | 2 +-
 profiler/src/profile_grouped_gemm.cpp                           | 2 +-
 profiler/src/profile_grouped_gemm_fastgelu.cpp                  | 2 +-
 profiler/src/profile_groupnorm.cpp                              | 2 +-
 profiler/src/profile_layernorm.cpp                              | 2 +-
 profiler/src/profile_max_pool3d_fwd.cpp                         | 2 +-
 profiler/src/profile_reduce.cpp                                 | 2 +-
 profiler/src/profile_softmax.cpp                                | 2 +-
 profiler/src/profiler.cpp                                       | 2 +-
 profiler/src/profiler_operation_registry.hpp                    | 2 +-
 test/batched_gemm/batched_gemm_bf16.cpp                         | 2 +-
 test/batched_gemm/batched_gemm_fp16.cpp                         | 2 +-
 test/batched_gemm/batched_gemm_fp32.cpp                         | 2 +-
 test/batched_gemm/batched_gemm_int8.cpp                         | 2 +-
 test/batched_gemm_gemm/test_batched_gemm_gemm_fp16.cpp          | 2 +-
 test/batched_gemm_gemm/test_batched_gemm_gemm_util.hpp          | 2 +-
 test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp           | 2 +-
 .../test_batched_gemm_softmax_gemm_fp16.cpp                     | 2 +-
 .../test_batched_gemm_softmax_gemm_util.hpp                     | 2 +-
 .../test_batched_gemm_bias_softmax_gemm_permute_bf16.cpp        | 2 +-
 .../test_batched_gemm_bias_softmax_gemm_permute_fp16.cpp        | 2 +-
 .../test_batched_gemm_bias_softmax_gemm_permute_util.hpp        | 2 +-
 .../test_batched_gemm_softmax_gemm_permute_bf16.cpp             | 2 +-
 .../test_batched_gemm_softmax_gemm_permute_fp16.cpp             | 2 +-
 .../test_batched_gemm_softmax_gemm_permute_util.hpp             | 2 +-
 test/batchnorm/batchnorm_bwd_rank_4.cpp                         | 2 +-
 test/batchnorm/batchnorm_fwd_rank_4.cpp                         | 2 +-
 test/batchnorm/batchnorm_infer_rank_4.cpp                       | 2 +-
 test/block_to_ctile_map/test_block_to_ctile_map.cpp             | 2 +-
 test/conv_util/conv_util.cpp                                    | 2 +-
 test/convnd_bwd_data/convnd_bwd_data.cpp                        | 2 +-
 test/convnd_fwd/convnd_fwd.cpp                                  | 2 +-
 test/data_type/int4.cpp                                         | 2 +-
 .../test_elementwise_layernorm_fp16.cpp                         | 2 +-
 test/gemm/gemm_bf16.cpp                                         | 2 +-
 test/gemm/gemm_fp16.cpp                                         | 2 +-
 test/gemm/gemm_fp32.cpp                                         | 2 +-
 test/gemm/gemm_fp64.cpp                                         | 2 +-
 test/gemm/gemm_int8.cpp                                         | 2 +-
 test/gemm/gemm_standalone_xdl_fp16.cpp                          | 2 +-
 test/gemm/gemm_util.hpp                                         | 2 +-
 test/gemm/instance/gemm_f16_nn_instance.cpp                     | 2 +-
 test/gemm/instance/gemm_f16_nn_instance.hpp                     | 2 +-
 test/gemm/instance/gemm_f16_nt_instance.cpp                     | 2 +-
 test/gemm/instance/gemm_f16_nt_instance.hpp                     | 2 +-
 test/gemm/instance/gemm_f16_tn_instance.cpp                     | 2 +-
 test/gemm/instance/gemm_f16_tn_instance.hpp                     | 2 +-
 test/gemm/instance/gemm_f16_tt_instance.cpp                     | 2 +-
 test/gemm/instance/gemm_f16_tt_instance.hpp                     | 2 +-
 test/gemm/instance/gemm_wavelet_f16_tn_instance.cpp             | 2 +-
 test/gemm/instance/gemm_wavelet_f16_tn_instance.hpp             | 2 +-
 test/gemm/run_gemm_test.inc                                     | 2 +-
 test/gemm_layernorm/test_gemm_add_relu_add_layernorm_fp16.cpp   | 2 +-
 test/gemm_reduce/gemm_reduce_fp16.cpp                           | 2 +-
 test/grouped_convnd_bwd_weight/grouped_convnd_bwd_weight.cpp    | 2 +-
 test/grouped_convnd_fwd/grouped_convnd_fwd.cpp                  | 2 +-
 test/magic_number_division/magic_number_division.cpp            | 2 +-
 test/normalization/test_groupnorm_fp16.cpp                      | 2 +-
 test/normalization/test_groupnorm_fp32.cpp                      | 2 +-
 test/normalization/test_layernorm2d_fp16.cpp                    | 2 +-
 test/normalization/test_layernorm2d_fp32.cpp                    | 2 +-
 test/pool_fwd/test_avg_pool2d_fwd.cpp                           | 2 +-
 test/pool_fwd/test_avg_pool3d_fwd.cpp                           | 2 +-
 test/pool_fwd/test_max_pool2d_fwd.cpp                           | 2 +-
 test/pool_fwd/test_max_pool3d_fwd.cpp                           | 2 +-
 test/pool_fwd/test_pool_fwd_common.hpp                          | 2 +-
 test/reduce/reduce_no_index.cpp                                 | 2 +-
 test/reduce/reduce_with_index.cpp                               | 2 +-
 test/reference_conv_fwd/reference_conv_fwd.cpp                  | 2 +-
 test/softmax/test_softmax_interface.cpp                         | 2 +-
 test/softmax/test_softmax_rank3.cpp                             | 2 +-
 test/softmax/test_softmax_rank4.cpp                             | 2 +-
 test/softmax/test_softmax_util.hpp                              | 2 +-
 test/space_filling_curve/space_filling_curve.cpp                | 2 +-
 test/wmma_op/wmma_op.cpp                                        | 2 +-
 test/wmma_op/wmma_op_util.hpp                                   | 2 +-
 1161 files changed, 1161 insertions(+), 1161 deletions(-)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 8ccfe99c3..07d836881 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -4,7 +4,7 @@ This is the list of developers and contributors to Composable Kernel library
 
 
 ## Developers
-[Chao Liu](https://github.com/asroy), [Jing Zhang](https://github.com/zjing14), 2018-2022
+[Chao Liu](https://github.com/asroy), [Jing Zhang](https://github.com/zjing14), 2018-2023
 
 [Letao Qin](https://github.com/ltqin), [Qianfeng Zhang](https://github.com/qianfengz), [Liang Huang](https://github.com/carlushuang), [Shaojie Wang](https://github.com/shaojiewang), 2019-2022
 
diff --git a/LICENSE b/LICENSE
index 2fe9a8455..e03fddaf7 100644
--- a/LICENSE
+++ b/LICENSE
@@ -7,7 +7,7 @@ Copyright (c) 2020     , Advanced Micro Devices, Inc. (Xiaoyan Zhou)
 Copyright (c) 2021-2022, Advanced Micro Devices, Inc. (Jianfeng Yan)
 
 SPDX-License-Identifier: MIT
-Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/client_example/01_gemm/gemm.cpp b/client_example/01_gemm/gemm.cpp
index ba7118ba3..c37f208db 100644
--- a/client_example/01_gemm/gemm.cpp
+++ b/client_example/01_gemm/gemm.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <vector>
diff --git a/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp b/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp
index 08f297f58..756889562 100644
--- a/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp
+++ b/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <vector>
diff --git a/client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu.cpp b/client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu.cpp
index 658c1e9e8..5965e9d1d 100644
--- a/client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu.cpp
+++ b/client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <vector>
diff --git a/client_example/02_gemm_add_add_fastgelu/gemm_fastgelu.cpp b/client_example/02_gemm_add_add_fastgelu/gemm_fastgelu.cpp
index ea269545a..319fdb0b0 100644
--- a/client_example/02_gemm_add_add_fastgelu/gemm_fastgelu.cpp
+++ b/client_example/02_gemm_add_add_fastgelu/gemm_fastgelu.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <vector>
diff --git a/client_example/03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp b/client_example/03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp
index caa657378..1129dfa6b 100644
--- a/client_example/03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp
+++ b/client_example/03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <vector>
diff --git a/client_example/03_gemm_layernorm/gemm_add_relu_add_layernorm_welford.cpp b/client_example/03_gemm_layernorm/gemm_add_relu_add_layernorm_welford.cpp
index d4f0c2048..3d5fb6004 100644
--- a/client_example/03_gemm_layernorm/gemm_add_relu_add_layernorm_welford.cpp
+++ b/client_example/03_gemm_layernorm/gemm_add_relu_add_layernorm_welford.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <iostream>
diff --git a/client_example/04_contraction/contraction_bilinear_fp32.cpp b/client_example/04_contraction/contraction_bilinear_fp32.cpp
index 91dead41a..89f834b98 100644
--- a/client_example/04_contraction/contraction_bilinear_fp32.cpp
+++ b/client_example/04_contraction/contraction_bilinear_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <numeric>
diff --git a/client_example/04_contraction/contraction_bilinear_fp64.cpp b/client_example/04_contraction/contraction_bilinear_fp64.cpp
index 9238e4cd8..1aa3ba7de 100644
--- a/client_example/04_contraction/contraction_bilinear_fp64.cpp
+++ b/client_example/04_contraction/contraction_bilinear_fp64.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <numeric>
diff --git a/client_example/04_contraction/contraction_g1m2n3k1_add_xdl_fp16.cpp b/client_example/04_contraction/contraction_g1m2n3k1_add_xdl_fp16.cpp
index 62be3377a..f8ea2258c 100644
--- a/client_example/04_contraction/contraction_g1m2n3k1_add_xdl_fp16.cpp
+++ b/client_example/04_contraction/contraction_g1m2n3k1_add_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <numeric>
diff --git a/client_example/04_contraction/contraction_scale_fp32.cpp b/client_example/04_contraction/contraction_scale_fp32.cpp
index 4e08ee19c..ba7b0633c 100644
--- a/client_example/04_contraction/contraction_scale_fp32.cpp
+++ b/client_example/04_contraction/contraction_scale_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <numeric>
diff --git a/client_example/04_contraction/contraction_scale_fp64.cpp b/client_example/04_contraction/contraction_scale_fp64.cpp
index 3c36aa21e..24e52eb5a 100644
--- a/client_example/04_contraction/contraction_scale_fp64.cpp
+++ b/client_example/04_contraction/contraction_scale_fp64.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <numeric>
diff --git a/client_example/05_layernorm/layernorm2d.cpp b/client_example/05_layernorm/layernorm2d.cpp
index 856a4cc21..4af4d7abe 100644
--- a/client_example/05_layernorm/layernorm2d.cpp
+++ b/client_example/05_layernorm/layernorm2d.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <vector>
diff --git a/client_example/06_softmax/softmax4d.cpp b/client_example/06_softmax/softmax4d.cpp
index e939ce8df..987ac9569 100644
--- a/client_example/06_softmax/softmax4d.cpp
+++ b/client_example/06_softmax/softmax4d.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <functional>
 #include <numeric>
diff --git a/client_example/07_grouped_convnd_fwd/grouped_conv1d_fwd.cpp b/client_example/07_grouped_convnd_fwd/grouped_conv1d_fwd.cpp
index 9fbdb83b1..70be0101c 100644
--- a/client_example/07_grouped_convnd_fwd/grouped_conv1d_fwd.cpp
+++ b/client_example/07_grouped_convnd_fwd/grouped_conv1d_fwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iomanip>
diff --git a/client_example/07_grouped_convnd_fwd/grouped_conv2d_fwd.cpp b/client_example/07_grouped_convnd_fwd/grouped_conv2d_fwd.cpp
index 0a798be27..57a210fa1 100644
--- a/client_example/07_grouped_convnd_fwd/grouped_conv2d_fwd.cpp
+++ b/client_example/07_grouped_convnd_fwd/grouped_conv2d_fwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iomanip>
diff --git a/client_example/08_fused_attention/fused_attention.cpp b/client_example/08_fused_attention/fused_attention.cpp
index fe927da12..df6bc11a7 100644
--- a/client_example/08_fused_attention/fused_attention.cpp
+++ b/client_example/08_fused_attention/fused_attention.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <vector>
diff --git a/client_example/08_fused_attention/fused_attention_bias.cpp b/client_example/08_fused_attention/fused_attention_bias.cpp
index 3113b7856..6c9f3bc8f 100644
--- a/client_example/08_fused_attention/fused_attention_bias.cpp
+++ b/client_example/08_fused_attention/fused_attention_bias.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <vector>
diff --git a/client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp b/client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp
index 43a4779f5..cd504e942 100644
--- a/client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <iostream>
diff --git a/client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp b/client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp
index 2ff91fe96..f4aa3666b 100644
--- a/client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <iostream>
diff --git a/client_example/09_quantization/conv2d_fwd_bias_tanh_perchannel_quantization.cpp b/client_example/09_quantization/conv2d_fwd_bias_tanh_perchannel_quantization.cpp
index 6ea5dd223..ebdbbf52c 100644
--- a/client_example/09_quantization/conv2d_fwd_bias_tanh_perchannel_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_bias_tanh_perchannel_quantization.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <iostream>
diff --git a/client_example/09_quantization/conv2d_fwd_bias_tanh_perlayer_quantization.cpp b/client_example/09_quantization/conv2d_fwd_bias_tanh_perlayer_quantization.cpp
index 33407c9a1..9d60baee0 100644
--- a/client_example/09_quantization/conv2d_fwd_bias_tanh_perlayer_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_bias_tanh_perlayer_quantization.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <iostream>
diff --git a/client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp b/client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp
index 6f5112470..dd81d9ee6 100644
--- a/client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <iostream>
diff --git a/client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp b/client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp
index 6a11f9fc2..9c088a21d 100644
--- a/client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <iostream>
diff --git a/client_example/09_quantization/gemm_quantization.cpp b/client_example/09_quantization/gemm_quantization.cpp
index 242504b44..b14e68fa0 100644
--- a/client_example/09_quantization/gemm_quantization.cpp
+++ b/client_example/09_quantization/gemm_quantization.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <iostream>
diff --git a/client_example/10_grouped_conv2d_bwd_data/grouped_conv2d_bwd_data.cpp b/client_example/10_grouped_conv2d_bwd_data/grouped_conv2d_bwd_data.cpp
index 55c789804..1b2e8abc2 100644
--- a/client_example/10_grouped_conv2d_bwd_data/grouped_conv2d_bwd_data.cpp
+++ b/client_example/10_grouped_conv2d_bwd_data/grouped_conv2d_bwd_data.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iomanip>
diff --git a/client_example/11_grouped_conv_bwd_weight/common.hpp b/client_example/11_grouped_conv_bwd_weight/common.hpp
index a90626333..62eb7bcf5 100644
--- a/client_example/11_grouped_conv_bwd_weight/common.hpp
+++ b/client_example/11_grouped_conv_bwd_weight/common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iomanip>
diff --git a/client_example/12_elementwise_normalization/elementwise_layernorm2d.cpp b/client_example/12_elementwise_normalization/elementwise_layernorm2d.cpp
index de68f46d3..bc4a6fe0b 100644
--- a/client_example/12_elementwise_normalization/elementwise_layernorm2d.cpp
+++ b/client_example/12_elementwise_normalization/elementwise_layernorm2d.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <vector>
diff --git a/client_example/13_batchnorm/batchnorm_bwd_nhwc.cpp b/client_example/13_batchnorm/batchnorm_bwd_nhwc.cpp
index 8ef21986a..c0140f71c 100644
--- a/client_example/13_batchnorm/batchnorm_bwd_nhwc.cpp
+++ b/client_example/13_batchnorm/batchnorm_bwd_nhwc.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <functional>
 #include <numeric>
diff --git a/client_example/13_batchnorm/batchnorm_fwd_nhwc.cpp b/client_example/13_batchnorm/batchnorm_fwd_nhwc.cpp
index 322667a46..365373343 100644
--- a/client_example/13_batchnorm/batchnorm_fwd_nhwc.cpp
+++ b/client_example/13_batchnorm/batchnorm_fwd_nhwc.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <functional>
 #include <numeric>
diff --git a/client_example/13_batchnorm/batchnorm_infer_nhwc.cpp b/client_example/13_batchnorm/batchnorm_infer_nhwc.cpp
index 3117d162d..5e6627ce1 100644
--- a/client_example/13_batchnorm/batchnorm_infer_nhwc.cpp
+++ b/client_example/13_batchnorm/batchnorm_infer_nhwc.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <functional>
 #include <numeric>
diff --git a/client_example/14_instance_id/batchnorm_fwd_instance_id.cpp b/client_example/14_instance_id/batchnorm_fwd_instance_id.cpp
index 9cfeee1cf..d45782d8e 100644
--- a/client_example/14_instance_id/batchnorm_fwd_instance_id.cpp
+++ b/client_example/14_instance_id/batchnorm_fwd_instance_id.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <functional>
 #include <numeric>
diff --git a/client_example/15_gemm_add_multiply/gemm_add_multiply.cpp b/client_example/15_gemm_add_multiply/gemm_add_multiply.cpp
index 28524a9ee..c74d7c6bd 100644
--- a/client_example/15_gemm_add_multiply/gemm_add_multiply.cpp
+++ b/client_example/15_gemm_add_multiply/gemm_add_multiply.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <vector>
diff --git a/client_example/15_reduce/reduce_nhwc_c.cpp b/client_example/15_reduce/reduce_nhwc_c.cpp
index 2275158bc..b45b72f0d 100644
--- a/client_example/15_reduce/reduce_nhwc_c.cpp
+++ b/client_example/15_reduce/reduce_nhwc_c.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <functional>
 #include <numeric>
diff --git a/client_example/17_grouped_gemm_fastgelu/grouped_gemm_fastgelu.cpp b/client_example/17_grouped_gemm_fastgelu/grouped_gemm_fastgelu.cpp
index 223ed29be..7ba3224fc 100644
--- a/client_example/17_grouped_gemm_fastgelu/grouped_gemm_fastgelu.cpp
+++ b/client_example/17_grouped_gemm_fastgelu/grouped_gemm_fastgelu.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <iostream>
diff --git a/client_example/18_groupnorm/groupnorm_swish.cpp b/client_example/18_groupnorm/groupnorm_swish.cpp
index 84f62ceac..308061a32 100644
--- a/client_example/18_groupnorm/groupnorm_swish.cpp
+++ b/client_example/18_groupnorm/groupnorm_swish.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <vector>
diff --git a/client_example/19_pool_fwd/avg_pool3d_fwd.cpp b/client_example/19_pool_fwd/avg_pool3d_fwd.cpp
index 47bd7738f..2edaf474b 100644
--- a/client_example/19_pool_fwd/avg_pool3d_fwd.cpp
+++ b/client_example/19_pool_fwd/avg_pool3d_fwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <vector>
diff --git a/client_example/19_pool_fwd/max_pool2d_fwd.cpp b/client_example/19_pool_fwd/max_pool2d_fwd.cpp
index 12ee61920..c776dc12d 100644
--- a/client_example/19_pool_fwd/max_pool2d_fwd.cpp
+++ b/client_example/19_pool_fwd/max_pool2d_fwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <vector>
diff --git a/example/01_gemm/common.hpp b/example/01_gemm/common.hpp
index 495a81596..144c9aacc 100644
--- a/example/01_gemm/common.hpp
+++ b/example/01_gemm/common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/01_gemm/gemm_dl_fp16.cpp b/example/01_gemm/gemm_dl_fp16.cpp
index cf585a8c5..b5fecb975 100644
--- a/example/01_gemm/gemm_dl_fp16.cpp
+++ b/example/01_gemm/gemm_dl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/01_gemm/gemm_dl_fp32.cpp b/example/01_gemm/gemm_dl_fp32.cpp
index 93f085cde..212b72f2a 100644
--- a/example/01_gemm/gemm_dl_fp32.cpp
+++ b/example/01_gemm/gemm_dl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/01_gemm/gemm_dl_int4.cpp b/example/01_gemm/gemm_dl_int4.cpp
index e392c490f..e55ae1401 100644
--- a/example/01_gemm/gemm_dl_int4.cpp
+++ b/example/01_gemm/gemm_dl_int4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
 #error Should compile this file with ck::int4_t support
diff --git a/example/01_gemm/gemm_dl_int8.cpp b/example/01_gemm/gemm_dl_int8.cpp
index be9e38771..1840390aa 100644
--- a/example/01_gemm/gemm_dl_int8.cpp
+++ b/example/01_gemm/gemm_dl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/01_gemm/gemm_wmma_fp16.cpp b/example/01_gemm/gemm_wmma_fp16.cpp
index 58f965be8..b11fe76ab 100644
--- a/example/01_gemm/gemm_wmma_fp16.cpp
+++ b/example/01_gemm/gemm_wmma_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/01_gemm/gemm_xdl_bf16.cpp b/example/01_gemm/gemm_xdl_bf16.cpp
index 9aaae6ade..3cac55ef4 100644
--- a/example/01_gemm/gemm_xdl_bf16.cpp
+++ b/example/01_gemm/gemm_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/01_gemm/gemm_xdl_fp16.cpp b/example/01_gemm/gemm_xdl_fp16.cpp
index 50d35fd9a..54fbd9cdd 100644
--- a/example/01_gemm/gemm_xdl_fp16.cpp
+++ b/example/01_gemm/gemm_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/01_gemm/gemm_xdl_fp64.cpp b/example/01_gemm/gemm_xdl_fp64.cpp
index 99253b743..836157629 100644
--- a/example/01_gemm/gemm_xdl_fp64.cpp
+++ b/example/01_gemm/gemm_xdl_fp64.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/01_gemm/gemm_xdl_int4.cpp b/example/01_gemm/gemm_xdl_int4.cpp
index 7f1283a47..f6238c7aa 100644
--- a/example/01_gemm/gemm_xdl_int4.cpp
+++ b/example/01_gemm/gemm_xdl_int4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
 #error Should compile this file with ck::int4_t support
diff --git a/example/01_gemm/gemm_xdl_int8.cpp b/example/01_gemm/gemm_xdl_int8.cpp
index e67594c5b..cc03200b9 100644
--- a/example/01_gemm/gemm_xdl_int8.cpp
+++ b/example/01_gemm/gemm_xdl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp b/example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp
index 12a699259..3afd0ebdb 100644
--- a/example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp
+++ b/example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/01_gemm/gemm_xdl_wavelet_fp16.cpp b/example/01_gemm/gemm_xdl_wavelet_fp16.cpp
index 3a0ddd90b..d7176f75d 100644
--- a/example/01_gemm/gemm_xdl_wavelet_fp16.cpp
+++ b/example/01_gemm/gemm_xdl_wavelet_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/01_gemm/run_gemm_example.inc b/example/01_gemm/run_gemm_example.inc
index 4e2cedb52..38c72afc6 100644
--- a/example/01_gemm/run_gemm_example.inc
+++ b/example/01_gemm/run_gemm_example.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp b/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp
index ff99bf464..877792d74 100644
--- a/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp
+++ b/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp b/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp
index 917b6b1c3..c3e6ef7d5 100644
--- a/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp
+++ b/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp b/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp
index aee51d05d..dffeff233 100644
--- a/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp
+++ b/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/04_gemm_add_add_fastgelu/common.hpp b/example/04_gemm_add_add_fastgelu/common.hpp
index 839587c14..91d17df95 100644
--- a/example/04_gemm_add_add_fastgelu/common.hpp
+++ b/example/04_gemm_add_add_fastgelu/common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp
index ba0476b9b..e630f6783 100644
--- a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp
index b940bfd89..71f6677ba 100644
--- a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp32.cpp b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp32.cpp
index fa651a34e..4665c3932 100644
--- a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp32.cpp
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp32.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int4.cpp b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int4.cpp
index 9f9c423de..f206bbeb4 100644
--- a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int4.cpp
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
 #error Should compile this file with ck::int4_t support
diff --git a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int8.cpp b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int8.cpp
index fadc4ef5e..e46483ab3 100644
--- a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int8.cpp
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/09_convnd_fwd/convnd_fwd_common.hpp b/example/09_convnd_fwd/convnd_fwd_common.hpp
index 4c594ccdf..109b8f9ee 100644
--- a/example/09_convnd_fwd/convnd_fwd_common.hpp
+++ b/example/09_convnd_fwd/convnd_fwd_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
diff --git a/example/09_convnd_fwd/convnd_fwd_dl_common.hpp b/example/09_convnd_fwd/convnd_fwd_dl_common.hpp
index 855710b9d..aeddd4fc5 100644
--- a/example/09_convnd_fwd/convnd_fwd_dl_common.hpp
+++ b/example/09_convnd_fwd/convnd_fwd_dl_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
diff --git a/example/09_convnd_fwd/convnd_fwd_dl_fp16.cpp b/example/09_convnd_fwd/convnd_fwd_dl_fp16.cpp
index db5a7f0bc..7b6f18f46 100644
--- a/example/09_convnd_fwd/convnd_fwd_dl_fp16.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_dl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "convnd_fwd_dl_common.hpp"
 
diff --git a/example/09_convnd_fwd/convnd_fwd_dl_fp32.cpp b/example/09_convnd_fwd/convnd_fwd_dl_fp32.cpp
index 964d784c8..551655b17 100644
--- a/example/09_convnd_fwd/convnd_fwd_dl_fp32.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_dl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "convnd_fwd_dl_common.hpp"
 
diff --git a/example/09_convnd_fwd/convnd_fwd_dl_int8.cpp b/example/09_convnd_fwd/convnd_fwd_dl_int8.cpp
index b0cd88f21..27a3f2e2a 100644
--- a/example/09_convnd_fwd/convnd_fwd_dl_int8.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_dl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "convnd_fwd_dl_common.hpp"
 
diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp
index d55d31549..74cf91d16 100644
--- a/example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "convnd_fwd_common.hpp"
 
diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp
index d84afba64..f6d69bafd 100644
--- a/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "convnd_fwd_common.hpp"
 
diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp
index f5acc540c..6c3171f61 100644
--- a/example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "convnd_fwd_common.hpp"
 
diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp
index 8d697976a..9977a496d 100644
--- a/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "convnd_fwd_common.hpp"
 
diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp
index 99f7f2565..bf084b3cc 100644
--- a/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "convnd_fwd_common.hpp"
 
diff --git a/example/09_convnd_fwd/run_convnd_fwd_dl_example.inc b/example/09_convnd_fwd/run_convnd_fwd_dl_example.inc
index 697ada14b..6474df1c3 100644
--- a/example/09_convnd_fwd/run_convnd_fwd_dl_example.inc
+++ b/example/09_convnd_fwd/run_convnd_fwd_dl_example.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/09_convnd_fwd/run_convnd_fwd_example.inc b/example/09_convnd_fwd/run_convnd_fwd_example.inc
index 36a68056f..49852ff66 100644
--- a/example/09_convnd_fwd/run_convnd_fwd_example.inc
+++ b/example/09_convnd_fwd/run_convnd_fwd_example.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp b/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp
index 00e370f29..137b0d1ff 100644
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <algorithm>
 #include <cassert>
diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_bf16.cpp b/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_bf16.cpp
index 6ff29b4b0..4ccacb0bc 100644
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_bf16.cpp
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp16.cpp b/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp16.cpp
index 02c19c2b6..bf495725e 100644
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp16.cpp
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp32.cpp b/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp32.cpp
index 679bb5c0c..584878567 100644
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp32.cpp
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_int4.cpp b/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_int4.cpp
index abdbdaf74..bf7127502 100644
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_int4.cpp
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_int4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
 #error Should compile this file with ck::int4_t support
diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_int8.cpp b/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_int8.cpp
index cf86afa8e..3e1694cbe 100644
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_int8.cpp
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc b/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc
index b3a389178..cebfeb51d 100644
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/12_reduce/reduce_blockwise.cpp b/example/12_reduce/reduce_blockwise.cpp
index a7ee9990c..9a736d4cf 100644
--- a/example/12_reduce/reduce_blockwise.cpp
+++ b/example/12_reduce/reduce_blockwise.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <initializer_list>
diff --git a/example/12_reduce/reduce_blockwise_impl.hpp b/example/12_reduce/reduce_blockwise_impl.hpp
index e6e3cc8d5..7f8394a73 100644
--- a/example/12_reduce/reduce_blockwise_impl.hpp
+++ b/example/12_reduce/reduce_blockwise_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/12_reduce/reduce_blockwise_two_call.cpp b/example/12_reduce/reduce_blockwise_two_call.cpp
index dbb18a0d8..eb8b5c76d 100644
--- a/example/12_reduce/reduce_blockwise_two_call.cpp
+++ b/example/12_reduce/reduce_blockwise_two_call.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/12_reduce/reduce_example_common.hpp b/example/12_reduce/reduce_example_common.hpp
index 05f0a0edb..5f9a48804 100644
--- a/example/12_reduce/reduce_example_common.hpp
+++ b/example/12_reduce/reduce_example_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/12_reduce/reduce_multiblock_atomic_add.cpp b/example/12_reduce/reduce_multiblock_atomic_add.cpp
index c4d63a3ad..120e3f059 100644
--- a/example/12_reduce/reduce_multiblock_atomic_add.cpp
+++ b/example/12_reduce/reduce_multiblock_atomic_add.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <initializer_list>
diff --git a/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp b/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp
index 905242fb6..fed621864 100644
--- a/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp
+++ b/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/13_pool2d_fwd/pool2d_fwd_common.hpp b/example/13_pool2d_fwd/pool2d_fwd_common.hpp
index 9abc98671..1157ccd38 100644
--- a/example/13_pool2d_fwd/pool2d_fwd_common.hpp
+++ b/example/13_pool2d_fwd/pool2d_fwd_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp b/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp
index 20c3e4701..daf8540d4 100644
--- a/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp
+++ b/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 
diff --git a/example/13_pool2d_fwd/pool2d_fwd_fp32.cpp b/example/13_pool2d_fwd/pool2d_fwd_fp32.cpp
index 34ff6f435..323e3f61f 100644
--- a/example/13_pool2d_fwd/pool2d_fwd_fp32.cpp
+++ b/example/13_pool2d_fwd/pool2d_fwd_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 
diff --git a/example/14_gemm_quantization/gemm_dl_quantization_int8.cpp b/example/14_gemm_quantization/gemm_dl_quantization_int8.cpp
index 044f3c166..2585072df 100644
--- a/example/14_gemm_quantization/gemm_dl_quantization_int8.cpp
+++ b/example/14_gemm_quantization/gemm_dl_quantization_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/14_gemm_quantization/gemm_xdl_bias_relu_quantization_int8.cpp b/example/14_gemm_quantization/gemm_xdl_bias_relu_quantization_int8.cpp
index d5f4e6f62..aa3e01169 100644
--- a/example/14_gemm_quantization/gemm_xdl_bias_relu_quantization_int8.cpp
+++ b/example/14_gemm_quantization/gemm_xdl_bias_relu_quantization_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/14_gemm_quantization/gemm_xdl_quantization_int8.cpp b/example/14_gemm_quantization/gemm_xdl_quantization_int8.cpp
index 237173738..4b207df5c 100644
--- a/example/14_gemm_quantization/gemm_xdl_quantization_int8.cpp
+++ b/example/14_gemm_quantization/gemm_xdl_quantization_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/15_grouped_gemm/grouped_gemm_multiple_d_dl_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_multiple_d_dl_fp16.cpp
index a5c51ceb0..3e1f7f089 100644
--- a/example/15_grouped_gemm/grouped_gemm_multiple_d_dl_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_multiple_d_dl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <algorithm>
 #include <cstddef>
diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_bfp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_bfp16.cpp
index 05d572a1f..680cee1f8 100644
--- a/example/15_grouped_gemm/grouped_gemm_xdl_bfp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_bfp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
index 3f78dafa8..90a12bc1d 100644
--- a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp
index fd93bb5f8..28b0fcd0c 100644
--- a/example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_int4.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_int4.cpp
index faf41bbf0..60c4a71a3 100644
--- a/example/15_grouped_gemm/grouped_gemm_xdl_int4.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_int4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_int8.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_int8.cpp
index 7cb09778c..0c96ef56d 100644
--- a/example/15_grouped_gemm/grouped_gemm_xdl_int8.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_splitk_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_splitk_fp16.cpp
index a89937b2e..743ab96be 100644
--- a/example/15_grouped_gemm/grouped_gemm_xdl_splitk_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_splitk_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp
index eb3832a66..2f6533d44 100644
--- a/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp
index e1248002f..b28e7f85d 100644
--- a/example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gemm_reduce_xdl_common.hpp"
 
diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp
index c2feffeb8..b30ce2c48 100644
--- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gemm_reduce_xdl_common.hpp"
 
diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp
index 363390add..31e2efd6f 100644
--- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gemm_reduce_xdl_common.hpp"
 
diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp32.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp32.cpp
index de6b7eb48..d3c7c1d99 100644
--- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp32.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gemm_reduce_xdl_common.hpp"
 
diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int4.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int4.cpp
index 9666fc662..9a4a6bc6e 100644
--- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int4.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gemm_reduce_xdl_common.hpp"
 
diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int8.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int8.cpp
index 00e0b767a..1a8457a8b 100644
--- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int8.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gemm_reduce_xdl_common.hpp"
 
diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_bf16.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_bf16.cpp
index 652c0e6ea..5c2706c79 100644
--- a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_bf16.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gemm_reduce_xdl_common.hpp"
 
diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp16.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp16.cpp
index 7eee24fed..c119e2437 100644
--- a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp16.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gemm_reduce_xdl_common.hpp"
 
diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp32.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp32.cpp
index c250b9969..0f5e58838 100644
--- a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp32.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gemm_reduce_xdl_common.hpp"
 
diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_reduce_xdl_common.hpp b/example/16_gemm_multi_d_multi_reduces/gemm_reduce_xdl_common.hpp
index 62992de59..1bea1bcf3 100644
--- a/example/16_gemm_multi_d_multi_reduces/gemm_reduce_xdl_common.hpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_reduce_xdl_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <numeric>
 #include <initializer_list>
diff --git a/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp b/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp
index 26fa9e982..b4b544aab 100644
--- a/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp
+++ b/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/17_convnd_bwd_data/convnd_bwd_data_dl_fp16.cpp b/example/17_convnd_bwd_data/convnd_bwd_data_dl_fp16.cpp
index f0896e977..6b84eaba4 100644
--- a/example/17_convnd_bwd_data/convnd_bwd_data_dl_fp16.cpp
+++ b/example/17_convnd_bwd_data/convnd_bwd_data_dl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "convnd_bwd_data_common.hpp"
 
diff --git a/example/17_convnd_bwd_data/convnd_bwd_data_xdl_fp16.cpp b/example/17_convnd_bwd_data/convnd_bwd_data_xdl_fp16.cpp
index c4f2c1f02..c9989c60a 100644
--- a/example/17_convnd_bwd_data/convnd_bwd_data_xdl_fp16.cpp
+++ b/example/17_convnd_bwd_data/convnd_bwd_data_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "convnd_bwd_data_common.hpp"
 
diff --git a/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp b/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
index c2e3602a7..e363dc5c1 100644
--- a/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
+++ b/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp b/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp
index bee5dea54..24c8d82f6 100644
--- a/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp
+++ b/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <cstdlib>
diff --git a/example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp b/example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp
index 6fc63b899..3c04c5614 100644
--- a/example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp
+++ b/example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <cstdlib>
diff --git a/example/19_binary_elementwise/elementwise_add_1d.cpp b/example/19_binary_elementwise/elementwise_add_1d.cpp
index a5a6bc0a8..1ac09641a 100644
--- a/example/19_binary_elementwise/elementwise_add_1d.cpp
+++ b/example/19_binary_elementwise/elementwise_add_1d.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <cstdlib>
diff --git a/example/19_binary_elementwise/elementwise_add_4d.cpp b/example/19_binary_elementwise/elementwise_add_4d.cpp
index cc209b12e..e571aa846 100644
--- a/example/19_binary_elementwise/elementwise_add_4d.cpp
+++ b/example/19_binary_elementwise/elementwise_add_4d.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <cstdlib>
diff --git a/example/20_grouped_conv_bwd_weight/common.hpp b/example/20_grouped_conv_bwd_weight/common.hpp
index 3f4818d2e..15727495f 100644
--- a/example/20_grouped_conv_bwd_weight/common.hpp
+++ b/example/20_grouped_conv_bwd_weight/common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_bf16.cpp b/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_bf16.cpp
index aed6d22b0..3cd70d0f3 100644
--- a/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_bf16.cpp
+++ b/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_fp16.cpp b/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_fp16.cpp
index 4a2a6195d..966b58676 100644
--- a/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_fp16.cpp
+++ b/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc b/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc
index d46b37476..39b9100bf 100644
--- a/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc
+++ b/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 template <ck::index_t NDimSpatial>
 using HostConvBwdWeightInstance = ck::tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial,
diff --git a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp
index 192fe87b6..96d04dcb3 100644
--- a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp
index 3f01e6947..fc58ca19f 100644
--- a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp b/example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp
index 4da6da65f..bd1d6932a 100644
--- a/example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp b/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp
index e7d857c4a..90d80f9f0 100644
--- a/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/22_cgemm/cgemm_xdl_bf16.cpp b/example/22_cgemm/cgemm_xdl_bf16.cpp
index 92ed90ce4..fa4482a98 100644
--- a/example/22_cgemm/cgemm_xdl_bf16.cpp
+++ b/example/22_cgemm/cgemm_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 
diff --git a/example/22_cgemm/cgemm_xdl_common.hpp b/example/22_cgemm/cgemm_xdl_common.hpp
index 6aa06b7c3..26137a7c2 100644
--- a/example/22_cgemm/cgemm_xdl_common.hpp
+++ b/example/22_cgemm/cgemm_xdl_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <numeric>
 #include <initializer_list>
diff --git a/example/22_cgemm/cgemm_xdl_fp16.cpp b/example/22_cgemm/cgemm_xdl_fp16.cpp
index 11373736e..89a581e86 100644
--- a/example/22_cgemm/cgemm_xdl_fp16.cpp
+++ b/example/22_cgemm/cgemm_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 
diff --git a/example/22_cgemm/cgemm_xdl_fp32.cpp b/example/22_cgemm/cgemm_xdl_fp32.cpp
index 0f45c18c4..cf9659959 100644
--- a/example/22_cgemm/cgemm_xdl_fp32.cpp
+++ b/example/22_cgemm/cgemm_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 
diff --git a/example/22_cgemm/cgemm_xdl_int4.cpp b/example/22_cgemm/cgemm_xdl_int4.cpp
index c26a83baa..f69cc2b3c 100644
--- a/example/22_cgemm/cgemm_xdl_int4.cpp
+++ b/example/22_cgemm/cgemm_xdl_int4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 
diff --git a/example/22_cgemm/cgemm_xdl_int8.cpp b/example/22_cgemm/cgemm_xdl_int8.cpp
index 2f2418986..c4835b853 100644
--- a/example/22_cgemm/cgemm_xdl_int8.cpp
+++ b/example/22_cgemm/cgemm_xdl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 
diff --git a/example/23_softmax/softmax_blockwise.cpp b/example/23_softmax/softmax_blockwise.cpp
index 41afd72f5..d09e434bc 100644
--- a/example/23_softmax/softmax_blockwise.cpp
+++ b/example/23_softmax/softmax_blockwise.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp b/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp
index c934d3501..420a7cf74 100644
--- a/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp
+++ b/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp b/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp
index 98835f98f..9d606db20 100644
--- a/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp
+++ b/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/26_contraction/contraction_bilinear_xdl_fp32.cpp b/example/26_contraction/contraction_bilinear_xdl_fp32.cpp
index 6004db6e0..78522160c 100644
--- a/example/26_contraction/contraction_bilinear_xdl_fp32.cpp
+++ b/example/26_contraction/contraction_bilinear_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/26_contraction/contraction_bilinear_xdl_fp64.cpp b/example/26_contraction/contraction_bilinear_xdl_fp64.cpp
index 9576ce3f2..6cceed5bc 100644
--- a/example/26_contraction/contraction_bilinear_xdl_fp64.cpp
+++ b/example/26_contraction/contraction_bilinear_xdl_fp64.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/26_contraction/contraction_scale_xdl_fp32.cpp b/example/26_contraction/contraction_scale_xdl_fp32.cpp
index 3aa2a7ba9..1574f5d18 100644
--- a/example/26_contraction/contraction_scale_xdl_fp32.cpp
+++ b/example/26_contraction/contraction_scale_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/26_contraction/contraction_scale_xdl_fp64.cpp b/example/26_contraction/contraction_scale_xdl_fp64.cpp
index cccf6505c..3dacc7088 100644
--- a/example/26_contraction/contraction_scale_xdl_fp64.cpp
+++ b/example/26_contraction/contraction_scale_xdl_fp64.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/27_layernorm/common.hpp b/example/27_layernorm/common.hpp
index 8d833a3ae..62a71713d 100644
--- a/example/27_layernorm/common.hpp
+++ b/example/27_layernorm/common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/27_layernorm/layernorm_fp16.cpp b/example/27_layernorm/layernorm_fp16.cpp
index c15ffabf5..bb8b954f0 100644
--- a/example/27_layernorm/layernorm_fp16.cpp
+++ b/example/27_layernorm/layernorm_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/27_layernorm/layernorm_splitk_fp16.cpp b/example/27_layernorm/layernorm_splitk_fp16.cpp
index 01ee7161e..e0378d028 100644
--- a/example/27_layernorm/layernorm_splitk_fp16.cpp
+++ b/example/27_layernorm/layernorm_splitk_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/27_layernorm/run_layernorm_example.inc b/example/27_layernorm/run_layernorm_example.inc
index 678d8df28..95200b540 100644
--- a/example/27_layernorm/run_layernorm_example.inc
+++ b/example/27_layernorm/run_layernorm_example.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp b/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp
index f8e6501ea..24e9b1d9b 100644
--- a/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp
+++ b/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp
index 30ad38a56..62233e535 100644
--- a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp
+++ b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_xdl_fp16.cpp b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_xdl_fp16.cpp
index 25d815b9c..08158bfc2 100644
--- a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_xdl_fp16.cpp
+++ b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/30_grouped_conv_fwd_multiple_d/common.hpp b/example/30_grouped_conv_fwd_multiple_d/common.hpp
index e7c6ed9b9..e60ebee6e 100644
--- a/example/30_grouped_conv_fwd_multiple_d/common.hpp
+++ b/example/30_grouped_conv_fwd_multiple_d/common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp b/example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp
index eb6975a6d..ae769ff1d 100644
--- a/example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp
+++ b/example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp
index 9d1d257a2..039d25029 100644
--- a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp
+++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common_wmma.hpp"
 
diff --git a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp
index ee300d073..43c0d57dc 100644
--- a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp
+++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp
index 5a9df0b1e..40b4132b3 100644
--- a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp
+++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp
index c2906cc9d..e05d384f2 100644
--- a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp
+++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int4.cpp b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int4.cpp
index 3d5a243e6..5494563fd 100644
--- a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int4.cpp
+++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
 #error Should compile this file with ck::int4_t support
diff --git a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int8.cpp b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int8.cpp
index eaf680fa4..6bf2e8d96 100644
--- a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int8.cpp
+++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_example.inc b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_example.inc
index 4561156e0..eb242203e 100644
--- a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_example.inc
+++ b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_example.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 template <typename BiasLay, typename ResidualLay>
 struct LayoutSetting
diff --git a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc
index a6888649c..360b2c894 100644
--- a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc
+++ b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 template <typename BiasLay, typename ResidualLay>
 struct LayoutSetting
diff --git a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_example.inc b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_example.inc
index d087c31af..58ed69182 100644
--- a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_example.inc
+++ b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_example.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 template <ck::index_t NDimSpatial>
 using DeviceConvFwdInstance =
diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_bf16.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_bf16.cpp
index 74e0e07e6..7605d9c4f 100644
--- a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_bf16.cpp
+++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 /*
 Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o
diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp16.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp16.cpp
index d5fadb808..33ed04fb3 100644
--- a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp16.cpp
+++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 /*
 Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o
diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp32.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp32.cpp
index 0dd4e0914..e0eb193ad 100644
--- a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp32.cpp
+++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 /*
 Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o
diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int4.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int4.cpp
index 1fd93622a..d166214c3 100644
--- a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int4.cpp
+++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 /*
 Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o
diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int8.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int8.cpp
index 15d98abab..40f87d1f5 100644
--- a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int8.cpp
+++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 /*
 Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o
diff --git a/example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc b/example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc
index 7e5f1614b..f32914672 100644
--- a/example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc
+++ b/example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp
index 0eb156533..1d1566d57 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 /*
 Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g_k_n) * B1_g_n_o
diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_bf16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_bf16.cpp
index 8f1db577c..bae88d4b8 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_bf16.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 /*
 Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g_k_n) * B1_g_n_o
diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp
index 2ce91a8c6..a098ce667 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 /*
 Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g_k_n) * B1_g_n_o
diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_bf16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_bf16.cpp
index 1fd2bf693..ce8caf758 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_bf16.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 /*
 Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g_k_n) * B1_g_n_o
diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_fp16.cpp
index f4a858905..138db1496 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_fp16.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 /*
 Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g_k_n) * B1_g_n_o
diff --git a/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp
index e4a71b043..d0eb8fcc3 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 /*
 Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g_k_n) * B1_g_n_o
diff --git a/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp
index 38b5badc6..1d97474d2 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 /*
 Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g_k_n) * B1_g_n_o
diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm.inc b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm.inc
index 4e43dbdd8..27602e231 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm.inc
+++ b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 int run(int argc, char* argv[])
 {
diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc
index 0b876af95..fa76faea8 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc
+++ b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 int run(int argc, char* argv[])
 {
diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_grouped_gemm_scale_softmax_gemm_permute.inc b/example/32_batched_gemm_scale_softmax_gemm/run_grouped_gemm_scale_softmax_gemm_permute.inc
index ef2acf61f..ea1e2734a 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/run_grouped_gemm_scale_softmax_gemm_permute.inc
+++ b/example/32_batched_gemm_scale_softmax_gemm/run_grouped_gemm_scale_softmax_gemm_permute.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 int run(int argc, char* argv[])
 {
diff --git a/example/33_multiple_reduce/dual_reduce_common.hpp b/example/33_multiple_reduce/dual_reduce_common.hpp
index 326606752..cd21790be 100644
--- a/example/33_multiple_reduce/dual_reduce_common.hpp
+++ b/example/33_multiple_reduce/dual_reduce_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <cstdlib>
diff --git a/example/33_multiple_reduce/dual_reduce_multiblock.cpp b/example/33_multiple_reduce/dual_reduce_multiblock.cpp
index 9360599ed..198931749 100644
--- a/example/33_multiple_reduce/dual_reduce_multiblock.cpp
+++ b/example/33_multiple_reduce/dual_reduce_multiblock.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <cstdlib>
diff --git a/example/33_multiple_reduce/dual_reduce_threadwise.cpp b/example/33_multiple_reduce/dual_reduce_threadwise.cpp
index 56255839e..7609edad3 100644
--- a/example/33_multiple_reduce/dual_reduce_threadwise.cpp
+++ b/example/33_multiple_reduce/dual_reduce_threadwise.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <cstdlib>
diff --git a/example/34_batchnorm/batchnorm_backward_nhwc.cpp b/example/34_batchnorm/batchnorm_backward_nhwc.cpp
index a6ca9d150..3756310fd 100644
--- a/example/34_batchnorm/batchnorm_backward_nhwc.cpp
+++ b/example/34_batchnorm/batchnorm_backward_nhwc.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <limits>
 #include <iostream>
diff --git a/example/34_batchnorm/batchnorm_common.hpp b/example/34_batchnorm/batchnorm_common.hpp
index bdbc8ea8b..a1b8d253b 100644
--- a/example/34_batchnorm/batchnorm_common.hpp
+++ b/example/34_batchnorm/batchnorm_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/34_batchnorm/batchnorm_forward_inferring_nhwc.cpp b/example/34_batchnorm/batchnorm_forward_inferring_nhwc.cpp
index dc2984851..6a8002025 100644
--- a/example/34_batchnorm/batchnorm_forward_inferring_nhwc.cpp
+++ b/example/34_batchnorm/batchnorm_forward_inferring_nhwc.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <limits>
 #include <iostream>
diff --git a/example/34_batchnorm/batchnorm_forward_training_nhwc.cpp b/example/34_batchnorm/batchnorm_forward_training_nhwc.cpp
index da36d65a2..d68081815 100644
--- a/example/34_batchnorm/batchnorm_forward_training_nhwc.cpp
+++ b/example/34_batchnorm/batchnorm_forward_training_nhwc.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <limits>
 #include <iostream>
diff --git a/example/34_batchnorm/batchnorm_infer_impl.hpp b/example/34_batchnorm/batchnorm_infer_impl.hpp
index 15170586b..d0b545b2a 100644
--- a/example/34_batchnorm/batchnorm_infer_impl.hpp
+++ b/example/34_batchnorm/batchnorm_infer_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/35_splitK_gemm/splitK_gemm_xdl_bfp16.cpp b/example/35_splitK_gemm/splitK_gemm_xdl_bfp16.cpp
index 7191ecf50..1dc21a6c2 100644
--- a/example/35_splitK_gemm/splitK_gemm_xdl_bfp16.cpp
+++ b/example/35_splitK_gemm/splitK_gemm_xdl_bfp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/35_splitK_gemm/splitK_gemm_xdl_fp16.cpp b/example/35_splitK_gemm/splitK_gemm_xdl_fp16.cpp
index efdb315b4..74fb16e15 100644
--- a/example/35_splitK_gemm/splitK_gemm_xdl_fp16.cpp
+++ b/example/35_splitK_gemm/splitK_gemm_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/35_splitK_gemm/splitK_gemm_xdl_fp32.cpp b/example/35_splitK_gemm/splitK_gemm_xdl_fp32.cpp
index bc2e3d1d5..7506f6942 100644
--- a/example/35_splitK_gemm/splitK_gemm_xdl_fp32.cpp
+++ b/example/35_splitK_gemm/splitK_gemm_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/35_splitK_gemm/splitK_gemm_xdl_int4.cpp b/example/35_splitK_gemm/splitK_gemm_xdl_int4.cpp
index 4eb278246..7ebf91440 100644
--- a/example/35_splitK_gemm/splitK_gemm_xdl_int4.cpp
+++ b/example/35_splitK_gemm/splitK_gemm_xdl_int4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/35_splitK_gemm/splitK_gemm_xdl_int8.cpp b/example/35_splitK_gemm/splitK_gemm_xdl_int8.cpp
index eefdbca6b..0fc7a5cc2 100644
--- a/example/35_splitK_gemm/splitK_gemm_xdl_int8.cpp
+++ b/example/35_splitK_gemm/splitK_gemm_xdl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp b/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp
index f0a0cdf6f..d2337dcda 100644
--- a/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp
+++ b/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp b/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp
index 071e8a743..b3d0ab6bf 100644
--- a/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp
+++ b/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 /*
 Computes C_m_o = Relu(A0[m, k] * B0[n, k] + D00[m, n] + D01[mn]) * B1[n, o] + D1[m, o]
diff --git a/example/38_grouped_conv_bwd_data_multiple_d/common.hpp b/example/38_grouped_conv_bwd_data_multiple_d/common.hpp
index d07ee7bdc..ca824b107 100644
--- a/example/38_grouped_conv_bwd_data_multiple_d/common.hpp
+++ b/example/38_grouped_conv_bwd_data_multiple_d/common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_bias_relu_fp16.cpp b/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_bias_relu_fp16.cpp
index 55ea8c3a3..a3533bb4c 100644
--- a/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_bias_relu_fp16.cpp
+++ b/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_bias_relu_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_fp16.cpp b/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_fp16.cpp
index ddf82ec51..fb688b6f3 100644
--- a/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_fp16.cpp
+++ b/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_bias_relu_example.inc b/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_bias_relu_example.inc
index 0afd8bd70..0f0b120cb 100644
--- a/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_bias_relu_example.inc
+++ b/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_bias_relu_example.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 bool run_conv_bwd_data_bias_relu(const ExecutionConfig& config,
                                  const ck::utils::conv::ConvParam& conv_params,
diff --git a/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_example.inc b/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_example.inc
index e50c98bbe..25678491c 100644
--- a/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_example.inc
+++ b/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_example.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 bool run_conv_bwd_data(const ExecutionConfig& config,
                        const ck::utils::conv::ConvParam& conv_params,
diff --git a/example/39_permute/common.hpp b/example/39_permute/common.hpp
index ab612cea1..54f3a7880 100644
--- a/example/39_permute/common.hpp
+++ b/example/39_permute/common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/39_permute/permute_1xHxW_fp16.cpp b/example/39_permute/permute_1xHxW_fp16.cpp
index d7f9b8054..7336c3b63 100644
--- a/example/39_permute/permute_1xHxW_fp16.cpp
+++ b/example/39_permute/permute_1xHxW_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/39_permute/permute_HxWx4_fp16.cpp b/example/39_permute/permute_HxWx4_fp16.cpp
index 342aa134e..6c24919de 100644
--- a/example/39_permute/permute_HxWx4_fp16.cpp
+++ b/example/39_permute/permute_HxWx4_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/39_permute/permute_NxHxW_fp16.cpp b/example/39_permute/permute_NxHxW_fp16.cpp
index b53975eb2..3551d2a7c 100644
--- a/example/39_permute/permute_NxHxW_fp16.cpp
+++ b/example/39_permute/permute_NxHxW_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/39_permute/run_permute_bundle_example.inc b/example/39_permute/run_permute_bundle_example.inc
index 70406d63f..2c1987292 100644
--- a/example/39_permute/run_permute_bundle_example.inc
+++ b/example/39_permute/run_permute_bundle_example.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/39_permute/run_permute_element_example.inc b/example/39_permute/run_permute_element_example.inc
index bc6235303..358713445 100644
--- a/example/39_permute/run_permute_element_example.inc
+++ b/example/39_permute/run_permute_element_example.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/40_conv2d_fwd_quantization/common.hpp b/example/40_conv2d_fwd_quantization/common.hpp
index 6ee14d750..266b09145 100644
--- a/example/40_conv2d_fwd_quantization/common.hpp
+++ b/example/40_conv2d_fwd_quantization/common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp
index 5c445d9c5..40b33852b 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 #include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp
index 0ff85f008..fc081ddc5 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 #include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp
index f8f996d17..c390f016a 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 #include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp
index 3b25fec0c..10b131a52 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 #include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp
index a98a1e240..e59d0d075 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 #include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp
index 262594d58..aee5fe9e6 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 #include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp
index 6b2205505..06c839e4e 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp
index 1ac867974..7a9b42d39 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp
index f28abe5eb..349563629 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp
index f468e8adc..261133725 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
diff --git a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perchannel_quantization_example.inc b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perchannel_quantization_example.inc
index 5675db77f..e5b924ad5 100644
--- a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perchannel_quantization_example.inc
+++ b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perchannel_quantization_example.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 template <ck::index_t NDimSpatial,
diff --git a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perlayer_quantization_example.inc b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perlayer_quantization_example.inc
index 9fd19c1c4..9f3a769dc 100644
--- a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perlayer_quantization_example.inc
+++ b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perlayer_quantization_example.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc
index cacedfdad..9b08fc690 100644
--- a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc
+++ b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc
index 77332cb6d..267c737e0 100644
--- a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc
+++ b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_bf16.cpp b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_bf16.cpp
index 2aea08c40..e37d41369 100644
--- a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_bf16.cpp
+++ b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
diff --git a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp16.cpp b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp16.cpp
index b7f80e76d..496e676a4 100644
--- a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp16.cpp
+++ b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
diff --git a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp32.cpp b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp32.cpp
index 15e460948..35d50721d 100644
--- a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp32.cpp
+++ b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
diff --git a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int4.cpp b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int4.cpp
index 2cc4c07c0..80f6e9ae0 100644
--- a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int4.cpp
+++ b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
 #error Should compile this file with ck::int4_t support
diff --git a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int8.cpp b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int8.cpp
index 40ff0f69c..3ade6c811 100644
--- a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int8.cpp
+++ b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
diff --git a/example/41_grouped_conv_conv_fwd/run_grouped_conv_conv_fwd_example.inc b/example/41_grouped_conv_conv_fwd/run_grouped_conv_conv_fwd_example.inc
index a2c97f4d4..0722d497d 100644
--- a/example/41_grouped_conv_conv_fwd/run_grouped_conv_conv_fwd_example.inc
+++ b/example/41_grouped_conv_conv_fwd/run_grouped_conv_conv_fwd_example.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/42_groupnorm/common.hpp b/example/42_groupnorm/common.hpp
index 780154b26..c8f91eb53 100644
--- a/example/42_groupnorm/common.hpp
+++ b/example/42_groupnorm/common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp b/example/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp
index b07a26c4c..cc107b63d 100644
--- a/example/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp
+++ b/example/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/42_groupnorm/groupnorm_splitk_fp16.cpp b/example/42_groupnorm/groupnorm_splitk_fp16.cpp
index fd4bfe380..057b240a6 100644
--- a/example/42_groupnorm/groupnorm_splitk_fp16.cpp
+++ b/example/42_groupnorm/groupnorm_splitk_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/42_groupnorm/groupnorm_swish_fp16.cpp b/example/42_groupnorm/groupnorm_swish_fp16.cpp
index c52243bfb..363f22ed4 100644
--- a/example/42_groupnorm/groupnorm_swish_fp16.cpp
+++ b/example/42_groupnorm/groupnorm_swish_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/42_groupnorm/run_groupnorm_example.inc b/example/42_groupnorm/run_groupnorm_example.inc
index d1016a3b1..16065c8d4 100644
--- a/example/42_groupnorm/run_groupnorm_example.inc
+++ b/example/42_groupnorm/run_groupnorm_example.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp16.cpp b/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp16.cpp
index 7ac4b6827..b6d9b29a5 100644
--- a/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp16.cpp
+++ b/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp32.cpp b/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp32.cpp
index 764e55ef5..60a0e01fe 100644
--- a/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp32.cpp
+++ b/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp b/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp
index 7d6ff12ee..76361f87a 100644
--- a/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp
+++ b/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/46_gemm_add_multiply/common.hpp b/example/46_gemm_add_multiply/common.hpp
index 3ba78dfe4..2c656cf44 100644
--- a/example/46_gemm_add_multiply/common.hpp
+++ b/example/46_gemm_add_multiply/common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/46_gemm_add_multiply/gemm_add_multiply_dl_fp16.cpp b/example/46_gemm_add_multiply/gemm_add_multiply_dl_fp16.cpp
index 28c3939fa..58a399f22 100644
--- a/example/46_gemm_add_multiply/gemm_add_multiply_dl_fp16.cpp
+++ b/example/46_gemm_add_multiply/gemm_add_multiply_dl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp"
diff --git a/example/46_gemm_add_multiply/gemm_add_multiply_xdl_fp16.cpp b/example/46_gemm_add_multiply/gemm_add_multiply_xdl_fp16.cpp
index d5aa41f1b..56417b101 100644
--- a/example/46_gemm_add_multiply/gemm_add_multiply_xdl_fp16.cpp
+++ b/example/46_gemm_add_multiply/gemm_add_multiply_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp"
diff --git a/example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute.cpp b/example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute.cpp
index 30c98e534..cfb42c6e1 100644
--- a/example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute.cpp
+++ b/example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <vector>
diff --git a/example/48_pool3d_fwd/pool3d_fwd_common.hpp b/example/48_pool3d_fwd/pool3d_fwd_common.hpp
index 5706deb6d..565bb94e4 100644
--- a/example/48_pool3d_fwd/pool3d_fwd_common.hpp
+++ b/example/48_pool3d_fwd/pool3d_fwd_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 
 #include <iostream>
diff --git a/example/48_pool3d_fwd/pool3d_fwd_fp16.cpp b/example/48_pool3d_fwd/pool3d_fwd_fp16.cpp
index 4d3686bcb..9afb51201 100644
--- a/example/48_pool3d_fwd/pool3d_fwd_fp16.cpp
+++ b/example/48_pool3d_fwd/pool3d_fwd_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 
diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp
index 1626597ed..314e6a813 100644
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/host_utility/device_prop.hpp b/include/ck/host_utility/device_prop.hpp
index e2cbdb733..bd02d5d88 100644
--- a/include/ck/host_utility/device_prop.hpp
+++ b/include/ck/host_utility/device_prop.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/host_utility/hip_check_error.hpp b/include/ck/host_utility/hip_check_error.hpp
index d3dc8eaf1..af7bebd9d 100644
--- a/include/ck/host_utility/hip_check_error.hpp
+++ b/include/ck/host_utility/hip_check_error.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/host_utility/io.hpp b/include/ck/host_utility/io.hpp
index ac8719592..55734bab2 100644
--- a/include/ck/host_utility/io.hpp
+++ b/include/ck/host_utility/io.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/host_utility/kernel_launch.hpp b/include/ck/host_utility/kernel_launch.hpp
index 24f212167..58740b435 100644
--- a/include/ck/host_utility/kernel_launch.hpp
+++ b/include/ck/host_utility/kernel_launch.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/problem_transform/transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk.hpp b/include/ck/problem_transform/transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk.hpp
index 412675365..6b118e972 100644
--- a/include/ck/problem_transform/transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk.hpp
+++ b/include/ck/problem_transform/transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_TRANSFORM_FORWARD_CONVOLUTION3D_INTO_GEMM_V4R4R4_NHWC_KYXC_NHWK_HPP
 #define CK_TRANSFORM_FORWARD_CONVOLUTION3D_INTO_GEMM_V4R4R4_NHWC_KYXC_NHWK_HPP
diff --git a/include/ck/stream_config.hpp b/include/ck/stream_config.hpp
index 70ca34555..505a602b2 100644
--- a/include/ck/stream_config.hpp
+++ b/include/ck/stream_config.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor/static_tensor.hpp b/include/ck/tensor/static_tensor.hpp
index fee679f91..d719ef976 100644
--- a/include/ck/tensor/static_tensor.hpp
+++ b/include/ck/tensor/static_tensor.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_STATIC_TENSOR_HPP
 #define CK_STATIC_TENSOR_HPP
diff --git a/include/ck/tensor_description/cluster_descriptor.hpp b/include/ck/tensor_description/cluster_descriptor.hpp
index 0c9ea2ff2..2dfcad8e0 100644
--- a/include/ck/tensor_description/cluster_descriptor.hpp
+++ b/include/ck/tensor_description/cluster_descriptor.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_description/multi_index_transform.hpp b/include/ck/tensor_description/multi_index_transform.hpp
index 4e4d7593e..6854226dd 100644
--- a/include/ck/tensor_description/multi_index_transform.hpp
+++ b/include/ck/tensor_description/multi_index_transform.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_description/multi_index_transform_helper.hpp b/include/ck/tensor_description/multi_index_transform_helper.hpp
index 044a90370..af0a8a34d 100644
--- a/include/ck/tensor_description/multi_index_transform_helper.hpp
+++ b/include/ck/tensor_description/multi_index_transform_helper.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_description/tensor_adaptor.hpp b/include/ck/tensor_description/tensor_adaptor.hpp
index d42e0a6ff..3ffac3246 100644
--- a/include/ck/tensor_description/tensor_adaptor.hpp
+++ b/include/ck/tensor_description/tensor_adaptor.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_description/tensor_descriptor.hpp b/include/ck/tensor_description/tensor_descriptor.hpp
index f07d5b173..f1df2eedd 100644
--- a/include/ck/tensor_description/tensor_descriptor.hpp
+++ b/include/ck/tensor_description/tensor_descriptor.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_description/tensor_descriptor_helper.hpp b/include/ck/tensor_description/tensor_descriptor_helper.hpp
index 461aae72c..f3ac041bf 100644
--- a/include/ck/tensor_description/tensor_descriptor_helper.hpp
+++ b/include/ck/tensor_description/tensor_descriptor_helper.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_description/tensor_space_filling_curve.hpp b/include/ck/tensor_description/tensor_space_filling_curve.hpp
index 17c9100b9..9a326092d 100644
--- a/include/ck/tensor_description/tensor_space_filling_curve.hpp
+++ b/include/ck/tensor_description/tensor_space_filling_curve.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_dl_v2r3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_dl_v2r3.hpp
index 8b1b7be11..b3caa3214 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_dl_v2r3.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_dl_v2r3.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v2r2.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v2r2.hpp
index 33120bd86..b0143366c 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v2r2.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v2r2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_BLOCKWISE_GEMM_DLOPS_V2R2_HPP
 #define CK_BLOCKWISE_GEMM_DLOPS_V2R2_HPP
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v3.hpp
index f45655721..0d092da51 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v3.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v3.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_BLOCKWISE_GEMM_DLOPS_V3_HPP
 #define CK_BLOCKWISE_GEMM_DLOPS_V3_HPP
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp
index d75f37d7b..5ec964bd3 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
index 5328dfde9..d5a64d7aa 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops_skip_b_lds.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops_skip_b_lds.hpp
index aa814ab00..8ae1ba3f3 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops_skip_b_lds.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops_skip_b_lds.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_softmax.hpp b/include/ck/tensor_operation/gpu/block/blockwise_softmax.hpp
index 7e62a822a..82bcff694 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_softmax.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_softmax.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp b/include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp
index 03e4d42d3..d8da134a3 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_welford.hpp b/include/ck/tensor_operation/gpu/block/blockwise_welford.hpp
index 316508651..a3813ea24 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_welford.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_welford.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp b/include/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp
index 2163ad323..6c13513cf 100644
--- a/include/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp
+++ b/include/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp
index 04ad75bd7..c8690e5f6 100644
--- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp
+++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp
index 5c47a49b3..905a59f56 100644
--- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp
+++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r2.hpp b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r2.hpp
index aa33fc083..17110c835 100644
--- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r2.hpp
+++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r3.hpp b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r3.hpp
index eb5f589a4..9a5317dd1 100644
--- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r3.hpp
+++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r3.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7.hpp b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7.hpp
index 3bd780638..993d90e35 100644
--- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7.hpp
+++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp b/include/ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp
index a4a29f5d5..f3263c721 100644
--- a/include/ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp
+++ b/include/ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp b/include/ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp
index 20b2a152b..01bb80678 100644
--- a/include/ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp
+++ b/include/ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp b/include/ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp
index 953ff1e06..adfa1689c 100644
--- a/include/ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp
+++ b/include/ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_base.hpp b/include/ck/tensor_operation/gpu/device/device_base.hpp
index 5946daf21..198169011 100644
--- a/include/ck/tensor_operation/gpu/device/device_base.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_base.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_batched_contraction_multiple_d.hpp b/include/ck/tensor_operation/gpu/device/device_batched_contraction_multiple_d.hpp
index 9fcd893c7..ee7af0117 100644
--- a/include/ck/tensor_operation/gpu/device/device_batched_contraction_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_contraction_multiple_d.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm.hpp
index e75591328..6cc2c7bb2 100644
--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_gemm.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm_gemm.hpp
index af681127f..91b4b6b91 100644
--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp
index 116e62c00..f18dc3290 100644
--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_multiple_d_gemm_multiple_d.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm_multiple_d_gemm_multiple_d.hpp
index eacc5976d..8234e2948 100644
--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_multiple_d_gemm_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_multiple_d_gemm_multiple_d.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp
index c1f85e575..09259224e 100644
--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp
index bde71806d..be8105c96 100644
--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_batchnorm_backward.hpp b/include/ck/tensor_operation/gpu/device/device_batchnorm_backward.hpp
index d39f3b7cb..2c0da6925 100644
--- a/include/ck/tensor_operation/gpu/device/device_batchnorm_backward.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batchnorm_backward.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp b/include/ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp
index aa93dd9c1..e3962e177 100644
--- a/include/ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp b/include/ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp
index 8a00fd9db..69103b6f4 100644
--- a/include/ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_cgemm.hpp b/include/ck/tensor_operation/gpu/device/device_cgemm.hpp
index aedae5380..848421211 100644
--- a/include/ck/tensor_operation/gpu/device/device_cgemm.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_cgemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 #include "device_base.hpp"
diff --git a/include/ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp b/include/ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp
index dbc525c09..118ade897 100644
--- a/include/ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp b/include/ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp
index 82054a3c9..eb1b85ec8 100644
--- a/include/ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_conv_fwd.hpp b/include/ck/tensor_operation/gpu/device/device_conv_fwd.hpp
index 4b9881088..4dc11dbef 100644
--- a/include/ck/tensor_operation/gpu/device/device_conv_fwd.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv_fwd.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation.hpp b/include/ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation.hpp
index 5a627deeb..7d3845666 100644
--- a/include/ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation_add.hpp b/include/ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation_add.hpp
index cc139303c..3a49ac632 100644
--- a/include/ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation_add.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_elementwise.hpp b/include/ck/tensor_operation/gpu/device/device_elementwise.hpp
index f9f913a7c..db0e4bd83 100644
--- a/include/ck/tensor_operation/gpu/device/device_elementwise.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_elementwise.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp b/include/ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp
index 9491a9224..c56a947ec 100644
--- a/include/ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm.hpp b/include/ck/tensor_operation/gpu/device/device_gemm.hpp
index c0af6f80f..adf909821 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_bias_e_permute.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_bias_e_permute.hpp
index 4c2161eae..a7f42c3b3 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_bias_e_permute.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_bias_e_permute.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp
index 9113bb7b7..a44356dc2 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_layernorm.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_layernorm.hpp
index a67a09b87..0258858fe 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_layernorm.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_layernorm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r.hpp
index f4881e32f..539e83f7c 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_reduce.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_reduce.hpp
index fcc088ca4..eaa7671c6 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_reduce.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_reduce.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_splitk.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_splitk.hpp
index c701bff57..6407aa7e0 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_splitk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_splitk.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_waveletmodel_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_waveletmodel_cshuffle.hpp
index af38f1425..d00e19b44 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_waveletmodel_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_waveletmodel_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_contraction_multiple_d.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_contraction_multiple_d.hpp
index 173c613a3..ba8194844 100644
--- a/include/ck/tensor_operation/gpu/device/device_grouped_contraction_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_contraction_multiple_d.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp
index 3350aec8d..7e4bca2bd 100644
--- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight.hpp
index 1258aed71..de54f9be2 100644
--- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd.hpp
index 644c7ee9a..025c43e75 100644
--- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
index 618033335..f26974ccb 100644
--- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp
index 80c864c83..908093658 100644
--- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute.hpp
index b066a4458..fae650974 100644
--- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
index 0b1db2846..9c8b5f462 100644
--- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_multiple_reduce.hpp b/include/ck/tensor_operation/gpu/device/device_multiple_reduce.hpp
index ee4b53e2f..f68022ca0 100644
--- a/include/ck/tensor_operation/gpu/device/device_multiple_reduce.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_multiple_reduce.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_normalization.hpp b/include/ck/tensor_operation/gpu/device/device_normalization.hpp
index 03601ce83..1f178f9fc 100644
--- a/include/ck/tensor_operation/gpu/device/device_normalization.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_normalization.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_permute.hpp b/include/ck/tensor_operation/gpu/device/device_permute.hpp
index 9daa2be37..c994cf02c 100644
--- a/include/ck/tensor_operation/gpu/device/device_permute.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_permute.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_pool_fwd.hpp b/include/ck/tensor_operation/gpu/device/device_pool_fwd.hpp
index e801e98a2..8b227fdfb 100644
--- a/include/ck/tensor_operation/gpu/device/device_pool_fwd.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_pool_fwd.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_reduce.hpp b/include/ck/tensor_operation/gpu/device/device_reduce.hpp
index c9209f2d7..c2721b184 100644
--- a/include/ck/tensor_operation/gpu/device/device_reduce.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_reduce.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_softmax.hpp b/include/ck/tensor_operation/gpu/device/device_softmax.hpp
index 94f788e51..a96ba89e2 100644
--- a/include/ck/tensor_operation/gpu/device/device_softmax.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_softmax.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d.hpp b/include/ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d.hpp
index f59e6093e..eeccd977c 100644
--- a/include/ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d_xdl_cshuffle.hpp
index 70990e795..95517b107 100644
--- a/include/ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/gemm_specialization.hpp b/include/ck/tensor_operation/gpu/device/gemm_specialization.hpp
index fc913e9ba..0bb45b18c 100644
--- a/include/ck/tensor_operation/gpu/device/gemm_specialization.hpp
+++ b/include/ck/tensor_operation/gpu/device/gemm_specialization.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp
index 493822aeb..4d599e801 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp
index 9bf8f5ccd..bc86e78b6 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp
index 20e9920d9..09220813b 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp
index 0df346094..2d91c620c 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp
index 196dc86da..e39d8f069 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp
index ef9b90ba7..eff503f65 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
index 0c6c0ef7a..e54c013cf 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp
index 84edde63e..9310d0752 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp
index d35f19417..b10096706 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_backward_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_backward_impl.hpp
index ab16a757f..f46237e00 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_backward_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_backward_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl.hpp
index 5a16ff765..ce0320b2d 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp
index 0c9cce97f..9ee6c6f46 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp
index 1eaffe705..b6f38698c 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
index b65afce8d..2ab09ba5c 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp
index ea3020663..8cef0eaf9 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp
index e7e2bf335..bb8f53161 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp
index 6c4957b9b..1bd1e553c 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
index 027f1a195..de6bf27fe 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp
index 6278220c2..710ea9176 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp
index f69d8f18a..cd89f3232 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef DEVICE_CONV3D_FWD_NAIVE_HPP
 #define DEVICE_CONV3D_FWD_NAIVE_HPP
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp
index d52879cd9..fd8c88da9 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef DEVICE_CONV3D_FWD_XDL_HPP
 #define DEVICE_CONV3D_FWD_XDL_HPP
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp
index aff25aa7c..3178f73f4 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp
index 1fd4b76ce..822e1da4e 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_2d_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_elementwise_2d_impl.hpp
index 83ed6198b..c5f90e40f 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_2d_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_elementwise_2d_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp
index a11b5d039..5618fba51 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp
index 1fa69288a..c3416758d 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_bias_add_reduce_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_bias_add_reduce_xdl_cshuffle.hpp
index a9a58c8ac..63f7fa706 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_bias_add_reduce_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_bias_add_reduce_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp
index 36366a763..13e9f9691 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp
index c91328ff7..22f66931f 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_layernorm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_layernorm_xdl_cshuffle.hpp
index 580087e00..1ab836247 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_layernorm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_layernorm_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp
index 4c1c3ab7b..be174e599 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp
index 750df31a3..44b3518e2 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp
index 248810148..bd5be99f8 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_reduce_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_reduce_xdl_cshuffle.hpp
index f358bd176..e60fae370 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_reduce_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_reduce_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp
index 03ffcf8e5..f64450b75 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp
index a5051455b..528d9bf42 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp
index 13a30911a..a3dda82aa 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp
index 8ee138f82..14ac5420a 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_skip_b_lds.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_skip_b_lds.hpp
index 36b01f677..ef5e26781 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_skip_b_lds.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_skip_b_lds.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp
index 89bfc180a..380199341 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp
index 76dd5a366..a275ee102 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
index 92c20a308..3f6238d21 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp
index c921c9f1b..3bcc7bd64 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r.hpp
index 03185d5b1..face627e1 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp
index de40d7129..71e4e28bf 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp
index 9d4b68c0b..526108b87 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp
index e88bf8ed7..d1f1b7fcc 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp
@@ -1,6 +1,6 @@
 #pragma once
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp
index e3795060b..77a8e6ecb 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp
@@ -1,6 +1,6 @@
 #pragma once
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
index 54ad9eb06..cd39cc983 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_multiblock.hpp b/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_multiblock.hpp
index b49e10968..aec5a65cc 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_multiblock.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_multiblock.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_threadwise.hpp b/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_threadwise.hpp
index 17a96e9f6..6d1d5c8e2 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_threadwise.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_threadwise.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp
index 6a8037a32..ea0d80504 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_normalization_splitk_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_normalization_splitk_impl.hpp
index 0026a8759..8b2b3c41b 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_normalization_splitk_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_normalization_splitk_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_permute_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_permute_impl.hpp
index 7b96373c0..17dab0833 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_permute_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_permute_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp b/include/ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp
index 6933db68d..3f27c629d 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_pool3d_fwd_ndhwc_ndhwc.hpp b/include/ck/tensor_operation/gpu/device/impl/device_pool3d_fwd_ndhwc_ndhwc.hpp
index d330fda8c..0ab6c2475 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_pool3d_fwd_ndhwc_ndhwc.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_pool3d_fwd_ndhwc_ndhwc.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp b/include/ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp
index 5dc051be3..2481c5c76 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp b/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp
index c7868537f..bf3deeb57 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp b/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp
index aa255da64..6c5895b01 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp
index ed96b7340..4aa02dfd3 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_sparse_embeddings_forward_layernorm.hpp b/include/ck/tensor_operation/gpu/device/impl/device_sparse_embeddings_forward_layernorm.hpp
index 2f29224a7..7a62ec046 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_sparse_embeddings_forward_layernorm.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_sparse_embeddings_forward_layernorm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/masking_specialization.hpp b/include/ck/tensor_operation/gpu/device/masking_specialization.hpp
index ea0f5897a..d6d6f74ab 100644
--- a/include/ck/tensor_operation/gpu/device/masking_specialization.hpp
+++ b/include/ck/tensor_operation/gpu/device/masking_specialization.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/matrix_padder.hpp b/include/ck/tensor_operation/gpu/device/matrix_padder.hpp
index 70e61bc77..c66d2fc51 100644
--- a/include/ck/tensor_operation/gpu/device/matrix_padder.hpp
+++ b/include/ck/tensor_operation/gpu/device/matrix_padder.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp b/include/ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp
index d35318357..5351d4ef2 100644
--- a/include/ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp
+++ b/include/ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/tensor_layout.hpp b/include/ck/tensor_operation/gpu/device/tensor_layout.hpp
index b44427411..b2d141fd6 100644
--- a/include/ck/tensor_operation/gpu/device/tensor_layout.hpp
+++ b/include/ck/tensor_operation/gpu/device/tensor_layout.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/tensor_specialization.hpp b/include/ck/tensor_operation/gpu/device/tensor_specialization.hpp
index 0ec0df2c9..713fc93eb 100644
--- a/include/ck/tensor_operation/gpu/device/tensor_specialization.hpp
+++ b/include/ck/tensor_operation/gpu/device/tensor_specialization.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/welford_helper.hpp b/include/ck/tensor_operation/gpu/device/welford_helper.hpp
index 6c909b767..d7772d876 100644
--- a/include/ck/tensor_operation/gpu/device/welford_helper.hpp
+++ b/include/ck/tensor_operation/gpu/device/welford_helper.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
index 136017c6d..1dd96809d 100644
--- a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
index ceb2b665b..3fdb391a0 100644
--- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
index ef250b8bf..c3e7706ef 100644
--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp b/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp
index a72a4ee06..e73e7e681 100644
--- a/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp
+++ b/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_first_half.hpp b/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_first_half.hpp
index 08cb0dd19..fc263138f 100644
--- a/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_first_half.hpp
+++ b/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_first_half.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_batchnorm_forward_final.hpp b/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_batchnorm_forward_final.hpp
index 548d7fd40..1f8990e6d 100644
--- a/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_batchnorm_forward_final.hpp
+++ b/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_batchnorm_forward_final.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_multiblock_reduce_first_half.hpp b/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_multiblock_reduce_first_half.hpp
index 42b7e172b..6fe78edb3 100644
--- a/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_multiblock_reduce_first_half.hpp
+++ b/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_multiblock_reduce_first_half.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
index ad91c3c68..91ed1112f 100644
--- a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
+++ b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gemm_layernorm/gridwise_gemm_multiple_d_welford_first_half_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gemm_layernorm/gridwise_gemm_multiple_d_welford_first_half_xdl_cshuffle.hpp
index aa34cfbf8..523e7f7c5 100644
--- a/include/ck/tensor_operation/gpu/grid/gemm_layernorm/gridwise_gemm_multiple_d_welford_first_half_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gemm_layernorm/gridwise_gemm_multiple_d_welford_first_half_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gemm_layernorm/gridwise_welford_second_half_layernorm2d.hpp b/include/ck/tensor_operation/gpu/grid/gemm_layernorm/gridwise_welford_second_half_layernorm2d.hpp
index fbe89e7e5..69468c25b 100644
--- a/include/ck/tensor_operation/gpu/grid/gemm_layernorm/gridwise_welford_second_half_layernorm2d.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gemm_layernorm/gridwise_welford_second_half_layernorm2d.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_multiple_reduction_multiblock.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_multiple_reduction_multiblock.hpp
index bdebe3816..bd1e0585f 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_multiple_reduction_multiblock.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_multiple_reduction_multiblock.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_multiple_reduction_threadwise.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_multiple_reduction_threadwise.hpp
index 1313ec943..fc4f27e33 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_multiple_reduction_threadwise.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_multiple_reduction_threadwise.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp
index 6836a6604..203be3c42 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp
index 5986641c6..910c926c7 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp
index fccb127d0..a8a1f803f 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp
index b9f4a3080..59d6bad5d 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_softmax_gemm_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_softmax_gemm_xdl_cshuffle_v1.hpp
index 6a6f19d71..74171ea9d 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_softmax_gemm_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_softmax_gemm_xdl_cshuffle_v1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp
index d6d205111..9eb2bf8aa 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_backward_blockwise_welford.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_backward_blockwise_welford.hpp
index ede6a96dc..ed1ffdd85 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_backward_blockwise_welford.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_backward_blockwise_welford.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_forward_blockwise_welford.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_forward_blockwise_welford.hpp
index 33c45a0f0..b6c83af13 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_forward_blockwise_welford.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_forward_blockwise_welford.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp
index 8b82b6554..d686c14b3 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp
index 05257d162..bf0e8c186 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 //
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_layernorm_welford_variance.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_layernorm_welford_variance.hpp
index b09a73590..3ea72b853 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_layernorm_welford_variance.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_layernorm_welford_variance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp
index bebcdceb4..c5c09e909 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dl_multiple_d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dl_multiple_d.hpp
index 9c68b4f5c..27f48a84b 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dl_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dl_multiple_d.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp
index 578665ea8..7289a20da 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp
index d3f81566e..119c1ea59 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp
index 98331d854..9090c16dc 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp
index 3281b910d..f54345b04 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp
index a3f532471..bb3e6a80b 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle.hpp
index aa89bff9e..e7577bdcb 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_waveletmodel.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_waveletmodel.hpp
index 2d3a36fca..de5a42419 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_waveletmodel.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_waveletmodel.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp
index 397ae1c1b..740dde5c6 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
index e6303d76c..d5552656e 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp
index 2d4ebe707..d805c9fa2 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_waveletmodel_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_waveletmodel_cshuffle.hpp
index acece0fbb..31c59d14e 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_waveletmodel_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_waveletmodel_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_skip_b_lds_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_skip_b_lds_v1.hpp
index 8d86f3c1d..e9881d645 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_skip_b_lds_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_skip_b_lds_v1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp
index 55f465a03..33a4f2b2c 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
index d56d1986e..a4b320ddf 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp
index 8259927fe..df543c063 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp
index 5d5fdae17..ec98fc9c9 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp
index dc83f8e98..3a752dd74 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_permute.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_permute.hpp
index de1ae9159..61d0f9e0d 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_permute.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_permute.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp
index 901e7aee9..41352fabe 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_set_multiple_buffer_value.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_set_multiple_buffer_value.hpp
index 88c7b6acf..0ad36b418 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_set_multiple_buffer_value.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_set_multiple_buffer_value.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_softmax.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_softmax.hpp
index 0344e6830..5f56ac6fc 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_softmax.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_softmax.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_sparse_embeddings_forward_layernorm.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_sparse_embeddings_forward_layernorm.hpp
index ff2511fa6..ee68660a0 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_sparse_embeddings_forward_layernorm.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_sparse_embeddings_forward_layernorm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_naive_variance.hpp b/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_naive_variance.hpp
index 792ffabcb..c3f122106 100644
--- a/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_naive_variance.hpp
+++ b/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_naive_variance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_selector.hpp b/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_selector.hpp
index 632690e1e..e50fb9813 100644
--- a/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_selector.hpp
+++ b/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_selector.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_1st.hpp b/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_1st.hpp
index 129b4e116..fc42e9762 100644
--- a/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_1st.hpp
+++ b/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_1st.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_2nd.hpp b/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_2nd.hpp
index d796d1afc..136ac94e7 100644
--- a/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_2nd.hpp
+++ b/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_2nd.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_welford_variance.hpp b/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_welford_variance.hpp
index 3a7ae459e..ff9712276 100644
--- a/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_welford_variance.hpp
+++ b/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_welford_variance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp b/include/ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp
index 188c62d93..c6eecc067 100644
--- a/include/ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp
+++ b/include/ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_contraction_dl.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_contraction_dl.hpp
index 94cdfe010..44730d551 100644
--- a/include/ck/tensor_operation/gpu/thread/threadwise_contraction_dl.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_contraction_dl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_gemm_dlops_v3.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_gemm_dlops_v3.hpp
index e045e3b54..e97aa433a 100644
--- a/include/ck/tensor_operation/gpu/thread/threadwise_gemm_dlops_v3.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_gemm_dlops_v3.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_THREADWISE_GEMM_DLOPS_V3_HPP
 #define CK_THREADWISE_GEMM_DLOPS_V3_HPP
diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_set.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_set.hpp
index 0a1197a16..6774a35bc 100644
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_set.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_set.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
index 16484ddcc..605f2569c 100644
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v4r1.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v4r1.hpp
index 6e8a23930..6a6c1f2ac 100644
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v4r1.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v4r1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp
index f13da341f..bd01108b0 100644
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1.hpp
index 9c91cd9ca..6ec9abc41 100644
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r2.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r2.hpp
index 68bc2726f..cf2c7a2ae 100644
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r2.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r3.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r3.hpp
index 0f5fb88b0..b5847e51b 100644
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r3.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r3.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7.hpp
index 2eb1b0ee9..db7dee219 100644
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_welford.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_welford.hpp
index 12ba2c538..eb6715e8e 100644
--- a/include/ck/tensor_operation/gpu/thread/threadwise_welford.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_welford.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp
index 24efeb2de..979f3567e 100644
--- a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
index 319487bc0..faaa2c5a9 100644
--- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp b/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp
index 5fc11d915..ea27a40ce 100644
--- a/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp
+++ b/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp b/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp
index 13d0a28cf..505ed33d5 100644
--- a/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp
+++ b/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp b/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp
index 1b5e64b66..cee3d2825 100644
--- a/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp
+++ b/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp
@@ -1,6 +1,6 @@
 
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/amd_address_space.hpp b/include/ck/utility/amd_address_space.hpp
index 9f1525914..d54f70e75 100644
--- a/include/ck/utility/amd_address_space.hpp
+++ b/include/ck/utility/amd_address_space.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/amd_buffer_addressing.hpp b/include/ck/utility/amd_buffer_addressing.hpp
index bdfb4f275..ef3d2032c 100644
--- a/include/ck/utility/amd_buffer_addressing.hpp
+++ b/include/ck/utility/amd_buffer_addressing.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 #include "data_type.hpp"
diff --git a/include/ck/utility/amd_inline_asm.hpp b/include/ck/utility/amd_inline_asm.hpp
index 1f7df70bc..43baa817d 100644
--- a/include/ck/utility/amd_inline_asm.hpp
+++ b/include/ck/utility/amd_inline_asm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_AMD_INLINE_ASM_HPP
 #define CK_AMD_INLINE_ASM_HPP
diff --git a/include/ck/utility/amd_wmma.hpp b/include/ck/utility/amd_wmma.hpp
index bf0914254..dd7f0b770 100644
--- a/include/ck/utility/amd_wmma.hpp
+++ b/include/ck/utility/amd_wmma.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_AMD_WMMA_HPP
 #define CK_AMD_WMMA_HPP
diff --git a/include/ck/utility/amd_xdlops.hpp b/include/ck/utility/amd_xdlops.hpp
index a742496fc..f4a04e281 100644
--- a/include/ck/utility/amd_xdlops.hpp
+++ b/include/ck/utility/amd_xdlops.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_AMD_XDLOPS_HPP
 #define CK_AMD_XDLOPS_HPP
diff --git a/include/ck/utility/array.hpp b/include/ck/utility/array.hpp
index 370a457fe..f63ce5e5a 100644
--- a/include/ck/utility/array.hpp
+++ b/include/ck/utility/array.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_ARRAY_HPP
 #define CK_ARRAY_HPP
diff --git a/include/ck/utility/array_multi_index.hpp b/include/ck/utility/array_multi_index.hpp
index 9b8d5b95e..c0c1ea65f 100644
--- a/include/ck/utility/array_multi_index.hpp
+++ b/include/ck/utility/array_multi_index.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_ARRAY_MULTI_INDEX_HPP
 #define CK_ARRAY_MULTI_INDEX_HPP
diff --git a/include/ck/utility/c_style_pointer_cast.hpp b/include/ck/utility/c_style_pointer_cast.hpp
index 6e8b00815..610e393a7 100644
--- a/include/ck/utility/c_style_pointer_cast.hpp
+++ b/include/ck/utility/c_style_pointer_cast.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_C_STYLE_POINTER_CAST_HPP
 #define CK_C_STYLE_POINTER_CAST_HPP
diff --git a/include/ck/utility/common_header.hpp b/include/ck/utility/common_header.hpp
index 8da87c876..41a9d0b58 100644
--- a/include/ck/utility/common_header.hpp
+++ b/include/ck/utility/common_header.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/container_element_picker.hpp b/include/ck/utility/container_element_picker.hpp
index abc5185e0..838147e42 100644
--- a/include/ck/utility/container_element_picker.hpp
+++ b/include/ck/utility/container_element_picker.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_CONTAINER_ELEMENT_PICKER_HPP
 #define CK_CONTAINER_ELEMENT_PICKER_HPP
diff --git a/include/ck/utility/container_helper.hpp b/include/ck/utility/container_helper.hpp
index c8b02bc5a..9c7b95456 100644
--- a/include/ck/utility/container_helper.hpp
+++ b/include/ck/utility/container_helper.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_CONTAINER_HELPER_HPP
 #define CK_CONTAINER_HELPER_HPP
diff --git a/include/ck/utility/debug.hpp b/include/ck/utility/debug.hpp
index 593bbb711..80346f0d9 100644
--- a/include/ck/utility/debug.hpp
+++ b/include/ck/utility/debug.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef UTILITY_DEBUG_HPP
 #define UTILITY_DEBUG_HPP
diff --git a/include/ck/utility/dynamic_buffer.hpp b/include/ck/utility/dynamic_buffer.hpp
index c6f0d299e..9ea0d6c00 100644
--- a/include/ck/utility/dynamic_buffer.hpp
+++ b/include/ck/utility/dynamic_buffer.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/enable_if.hpp b/include/ck/utility/enable_if.hpp
index 297434b0d..c0a3c99f1 100644
--- a/include/ck/utility/enable_if.hpp
+++ b/include/ck/utility/enable_if.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/functional.hpp b/include/ck/utility/functional.hpp
index 08e730782..91797d240 100644
--- a/include/ck/utility/functional.hpp
+++ b/include/ck/utility/functional.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/functional2.hpp b/include/ck/utility/functional2.hpp
index 6f125ca4c..99c65f4eb 100644
--- a/include/ck/utility/functional2.hpp
+++ b/include/ck/utility/functional2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/functional3.hpp b/include/ck/utility/functional3.hpp
index 06b67ef7e..97605a7ad 100644
--- a/include/ck/utility/functional3.hpp
+++ b/include/ck/utility/functional3.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/functional4.hpp b/include/ck/utility/functional4.hpp
index 6eeaf15c9..b5f3df8d7 100644
--- a/include/ck/utility/functional4.hpp
+++ b/include/ck/utility/functional4.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_FUNCTIONAL4_HPP
 #define CK_FUNCTIONAL4_HPP
diff --git a/include/ck/utility/generic_memory_space_atomic.hpp b/include/ck/utility/generic_memory_space_atomic.hpp
index 6a1ca9665..98f40a436 100644
--- a/include/ck/utility/generic_memory_space_atomic.hpp
+++ b/include/ck/utility/generic_memory_space_atomic.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 #include "data_type.hpp"
diff --git a/include/ck/utility/get_id.hpp b/include/ck/utility/get_id.hpp
index 44ff43815..77564c613 100644
--- a/include/ck/utility/get_id.hpp
+++ b/include/ck/utility/get_id.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/ignore.hpp b/include/ck/utility/ignore.hpp
index ac33cbf9a..f70a182fd 100644
--- a/include/ck/utility/ignore.hpp
+++ b/include/ck/utility/ignore.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/inner_product.hpp b/include/ck/utility/inner_product.hpp
index b65640bff..7828d21d7 100644
--- a/include/ck/utility/inner_product.hpp
+++ b/include/ck/utility/inner_product.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 #include "data_type.hpp"
diff --git a/include/ck/utility/integral_constant.hpp b/include/ck/utility/integral_constant.hpp
index 9aab4e242..376070eb3 100644
--- a/include/ck/utility/integral_constant.hpp
+++ b/include/ck/utility/integral_constant.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/is_known_at_compile_time.hpp b/include/ck/utility/is_known_at_compile_time.hpp
index 819815442..2cafc3e6f 100644
--- a/include/ck/utility/is_known_at_compile_time.hpp
+++ b/include/ck/utility/is_known_at_compile_time.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/magic_division.hpp b/include/ck/utility/magic_division.hpp
index a5e8e9216..f19030d4e 100644
--- a/include/ck/utility/magic_division.hpp
+++ b/include/ck/utility/magic_division.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/math.hpp b/include/ck/utility/math.hpp
index 72071992f..326b0e61e 100644
--- a/include/ck/utility/math.hpp
+++ b/include/ck/utility/math.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/math_v2.hpp b/include/ck/utility/math_v2.hpp
index a3732b2fe..1cac2cc0c 100644
--- a/include/ck/utility/math_v2.hpp
+++ b/include/ck/utility/math_v2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/multi_index.hpp b/include/ck/utility/multi_index.hpp
index 1d544c090..9f7ba8bff 100644
--- a/include/ck/utility/multi_index.hpp
+++ b/include/ck/utility/multi_index.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/number.hpp b/include/ck/utility/number.hpp
index f3ca6b61d..d29afd31a 100644
--- a/include/ck/utility/number.hpp
+++ b/include/ck/utility/number.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_NUMBER_HPP
 #define CK_NUMBER_HPP
diff --git a/include/ck/utility/reduction_common.hpp b/include/ck/utility/reduction_common.hpp
index aceef7b29..3777d297c 100644
--- a/include/ck/utility/reduction_common.hpp
+++ b/include/ck/utility/reduction_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/reduction_enums.hpp b/include/ck/utility/reduction_enums.hpp
index 678563310..23b7149f8 100644
--- a/include/ck/utility/reduction_enums.hpp
+++ b/include/ck/utility/reduction_enums.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/reduction_functions_accumulate.hpp b/include/ck/utility/reduction_functions_accumulate.hpp
index 724e5599d..b9765ff0d 100644
--- a/include/ck/utility/reduction_functions_accumulate.hpp
+++ b/include/ck/utility/reduction_functions_accumulate.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/reduction_operator.hpp b/include/ck/utility/reduction_operator.hpp
index b4e770a64..0f5b73cb0 100644
--- a/include/ck/utility/reduction_operator.hpp
+++ b/include/ck/utility/reduction_operator.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/sequence.hpp b/include/ck/utility/sequence.hpp
index 97b597221..d6bfb2eba 100644
--- a/include/ck/utility/sequence.hpp
+++ b/include/ck/utility/sequence.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/sequence_helper.hpp b/include/ck/utility/sequence_helper.hpp
index db25c27e7..8c493a282 100644
--- a/include/ck/utility/sequence_helper.hpp
+++ b/include/ck/utility/sequence_helper.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/span.hpp b/include/ck/utility/span.hpp
index 1e5012145..5e7567a84 100644
--- a/include/ck/utility/span.hpp
+++ b/include/ck/utility/span.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/static_buffer.hpp b/include/ck/utility/static_buffer.hpp
index dd25c9620..835f56573 100644
--- a/include/ck/utility/static_buffer.hpp
+++ b/include/ck/utility/static_buffer.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/statically_indexed_array.hpp b/include/ck/utility/statically_indexed_array.hpp
index 3438776f4..a2d70045a 100644
--- a/include/ck/utility/statically_indexed_array.hpp
+++ b/include/ck/utility/statically_indexed_array.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_STATICALLY_INDEXED_ARRAY_HPP
 #define CK_STATICALLY_INDEXED_ARRAY_HPP
diff --git a/include/ck/utility/statically_indexed_array_multi_index.hpp b/include/ck/utility/statically_indexed_array_multi_index.hpp
index 21b2941b2..4a8b96ae8 100644
--- a/include/ck/utility/statically_indexed_array_multi_index.hpp
+++ b/include/ck/utility/statically_indexed_array_multi_index.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_STATICALLY_INDEXED_ARRAY_MULTI_INDEX_HPP
 #define CK_STATICALLY_INDEXED_ARRAY_MULTI_INDEX_HPP
diff --git a/include/ck/utility/synchronization.hpp b/include/ck/utility/synchronization.hpp
index 0e247ed0f..775e7ac3a 100644
--- a/include/ck/utility/synchronization.hpp
+++ b/include/ck/utility/synchronization.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/thread_group.hpp b/include/ck/utility/thread_group.hpp
index d469dec89..1cd6b2f3c 100644
--- a/include/ck/utility/thread_group.hpp
+++ b/include/ck/utility/thread_group.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/transpose_vectors.hpp b/include/ck/utility/transpose_vectors.hpp
index 2b0075d60..6faf5c133 100644
--- a/include/ck/utility/transpose_vectors.hpp
+++ b/include/ck/utility/transpose_vectors.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/tuple.hpp b/include/ck/utility/tuple.hpp
index d8664be55..b616b3123 100644
--- a/include/ck/utility/tuple.hpp
+++ b/include/ck/utility/tuple.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/tuple_helper.hpp b/include/ck/utility/tuple_helper.hpp
index 6f5b142a5..e39ae1c23 100644
--- a/include/ck/utility/tuple_helper.hpp
+++ b/include/ck/utility/tuple_helper.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/type.hpp b/include/ck/utility/type.hpp
index 90b9df295..9609afba4 100644
--- a/include/ck/utility/type.hpp
+++ b/include/ck/utility/type.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp
index 46a1fa559..a1b1e0d91 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_backward.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_backward.hpp
index 0b621e88a..a2eabdf5c 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_backward.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_backward.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward.hpp
index dd0db3168..20c1fcd73 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer.hpp
index 463c655ac..7d652fe4c 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp
index b0149d88f..24f754e59 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp
index 225f7b7e3..449734f43 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp
index 7d62158f0..ec5df238a 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp
index b8d47d218..8f4182a23 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp
index be22003fd..71c84a1f5 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp
index f949f27fd..0b90b4b50 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
index be69f297b..9b797be92 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_layernorm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_layernorm.hpp
index 28132aa1e..ce2a83da6 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_layernorm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_layernorm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp
index fedd4dce6..6a48528c5 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp
index 2bac5bc5c..9994a2f9f 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp
index 3fc35a83c..b4b7a5a03 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_reduce.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_reduce.hpp
index c04baca57..944f34007 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_reduce.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_reduce.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_softmax.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_softmax.hpp
index a4fd46c93..9916a03b9 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_softmax.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_softmax.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_sparse_embedding3_forward_layernorm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_sparse_embedding3_forward_layernorm.hpp
index b6a9b0fb5..f949260ca 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_sparse_embedding3_forward_layernorm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_sparse_embedding3_forward_layernorm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_fwd.hpp b/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_fwd.hpp
index df4fca656..0b7887efb 100644
--- a/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_fwd.hpp
+++ b/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_fwd.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef NAIVE_CONV_FWD_HPP
 #define NAIVE_CONV_FWD_HPP
diff --git a/library/include/ck/library/tensor_operation_instance/add_device_operation_instance.hpp b/library/include/ck/library/tensor_operation_instance/add_device_operation_instance.hpp
index 20df1b361..f57fed9c0 100644
--- a/library/include/ck/library/tensor_operation_instance/add_device_operation_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/add_device_operation_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
index 188643952..851d9e497 100644
--- a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
+++ b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm.hpp
index bb5f971c7..c3c8c0e5a 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add.hpp
index 0b025b33c..73f600425 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_bias_permute.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_bias_permute.hpp
index 59d50e1bd..70bff2789 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_bias_permute.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_bias_permute.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_bias_softmax_gemm_permute.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_bias_softmax_gemm_permute.hpp
index 593ef7cb9..33653a308 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_bias_softmax_gemm_permute.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_bias_softmax_gemm_permute.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_gemm.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_gemm.hpp
index e1a4391c4..28ccf61a3 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_gemm.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm.hpp
index 8a0b1b1fa..0a30c210d 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute.hpp
index 34c86dd44..2ff64675c 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_backward.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_backward.hpp
index c84ffcff8..0e1f6f04e 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_backward.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_backward.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp
index 8e40d60c1..8fd1c7665 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_infer.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_infer.hpp
index 342ade69c..f6f4df7e2 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_infer.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_infer.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp b/library/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp
index 6a551c726..2ed8255f6 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp b/library/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp
index fc9ec8d61..5d9567731 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/convolution_backward_data.hpp b/library/include/ck/library/tensor_operation_instance/gpu/convolution_backward_data.hpp
index 07d552476..1efe07366 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/convolution_backward_data.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/convolution_backward_data.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/convolution_forward.hpp b/library/include/ck/library/tensor_operation_instance/gpu/convolution_forward.hpp
index 2c529e06f..dd8c1987c 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/convolution_forward.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/convolution_forward.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp
index 8af400cb7..7e6267c87 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/device_gemm_mean_squaremean_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/device_gemm_mean_squaremean_instance.hpp
index 3d0c34062..b15139510 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/device_gemm_mean_squaremean_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/device_gemm_mean_squaremean_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp b/library/include/ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp
index c87ae159b..a1c006cf6 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp
index 732d98069..adac7d0dc 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp
index 2d578cca4..99b2ad131 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_fastgelu.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_fastgelu.hpp
index 554437f49..fd3550c2f 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_fastgelu.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_fastgelu.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_multiply.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_multiply.hpp
index c07ca3134..481915d00 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_multiply.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_multiply.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm.hpp
index 7beae83cd..de21f325a 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_bilinear.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_bilinear.hpp
index 8f4cd4d96..f80efd551 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_bilinear.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_bilinear.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_fastgelu.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_fastgelu.hpp
index fbc5df98a..09b1c2190 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_fastgelu.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_fastgelu.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp
index e88844694..534875151 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp
index 81b2b4fcf..fadfd1995 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp
index ef6920e52..377ce083c 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
index fc4beb0ae..a82ec543c 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp
index e97484a5a..b482e97ee 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/normalization.hpp b/library/include/ck/library/tensor_operation_instance/gpu/normalization.hpp
index 199ed73b4..778f625d8 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/normalization.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/normalization.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/normalization_swish.hpp b/library/include/ck/library/tensor_operation_instance/gpu/normalization_swish.hpp
index 367180dea..239177529 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/normalization_swish.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/normalization_swish.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/pool2d_fwd.hpp b/library/include/ck/library/tensor_operation_instance/gpu/pool2d_fwd.hpp
index 44d89cf36..ccb5cb5a9 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/pool2d_fwd.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/pool2d_fwd.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp b/library/include/ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp
index 88523c703..3a006b00a 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/quantization/gemm_quantization.hpp b/library/include/ck/library/tensor_operation_instance/gpu/quantization/gemm_quantization.hpp
index 2fd7ce22f..2ed4b0d5f 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/quantization/gemm_quantization.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/quantization/gemm_quantization.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp
index daec48050..8a96f6707 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp
index b7d81021e..e17bea5fd 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perchannel_quantization.hpp b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perchannel_quantization.hpp
index 2d54879ea..9236b5c79 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perchannel_quantization.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perchannel_quantization.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perlayer_quantization.hpp b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perlayer_quantization.hpp
index f278cfa22..7c1eb4e42 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perlayer_quantization.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perlayer_quantization.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp
index 550a7b034..9930b1a6f 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp
index 2cdbfbb0c..c9c1475f1 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.hpp
index 4e3fa81f7..4dd5569ce 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.hpp
index 7ca8bc258..d52310a3f 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.hpp
index 37398146b..025500764 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.hpp
index 5eacd358c..2314e9498 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.hpp
index 94ae02bf3..5c2bff16c 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.hpp
index e41e8de6a..a1279eecc 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.hpp
index 99762aa64..aff6a2542 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.hpp
index 1fc557a95..be8da2243 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.hpp
index ca3ba4eb0..652984ae2 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.hpp
index 28a85782d..be60d1b32 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.hpp
index ba7440079..27e3aa53e 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.hpp
index f5c813de7..f7f4870a9 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.hpp
index e25b6e849..790b5a92a 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.hpp
index a264d1126..ec3bc852e 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.hpp
index 8b1d8c95b..8c0c06567 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.hpp
index 49a60d88c..8631495df 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.hpp
index 04a7c2d23..59849c2d4 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.hpp
index d0feefb50..33cbea85e 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.hpp
index 35f35f202..386182957 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.hpp
index 63eb7221b..f7c05b20a 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.hpp
index 1bca3c1f4..2dfecc6b0 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.hpp
index 1791a186f..a68793896 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.hpp
index 3f56c057e..cf9c268a6 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.hpp
index a3b8bcf9a..aeb578f94 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.hpp
index 18e0e084d..73480262c 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.hpp
index 4a106463a..74293553e 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.hpp
index 23e1c49fe..8a91f76b3 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.hpp
index 62e2d24f0..0ff2c30f3 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.hpp
index 18a54d868..932299008 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.hpp
index 9f408906a..c902e80a2 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.hpp
index c40052562..e7a136927 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.hpp
index 532bfb417..2d1ab69fc 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp
index 8c08e5ef2..9f782c11b 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp
index 0d08377a2..bb45295ed 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.hpp
index 4cdd45e85..5814b84bb 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.hpp
index a36cafb27..f1a609a6f 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.hpp
index 13b078049..489bef1f5 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.hpp
index 75e1f1024..2507d8230 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.hpp
index 00ec17fad..df4b7b7b4 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.hpp
index 7b762bc93..2748b9b9d 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.hpp
index 2a2b284b2..26230d545 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.hpp
index 444d8ddc8..d5e1499ae 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.hpp
index f3c070176..cbc8befb3 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.hpp
index c57edd084..574cc3dd3 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp
index 325ed1e6d..04bab9d28 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.hpp
index 8960ba7c5..6c50d222e 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.hpp
index 95d9c0726..5e91f53b0 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.hpp
index dd6734061..171803156 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.hpp
index 85f75110d..03bff5912 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.hpp
index 7f62f4e01..676af1933 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.hpp
index eee771b13..c67c0a41f 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.hpp
index 64f1e9c22..c9058a9b1 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.hpp
index 078561e15..1fbd3c6e7 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.hpp
index 5a9144186..89496a4ed 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.hpp
index dc4740aa3..fb79ce8ad 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.hpp
index 9ecc96797..7915bd263 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.hpp
index ccce78e2f..d769fe815 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.hpp
index 6d3749d86..49938d443 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.hpp
index 7594dde74..810249994 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.hpp
index 3272e7f9a..6640ffec7 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.hpp
index 519ec8271..441bea275 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.hpp
index 77b2fb930..05912e1e9 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.hpp
index 5abb5c5ee..b0e16cfea 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.hpp
index 23bd988b8..0ee0f7d0f 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.hpp
index 7ce5577d7..feb9d99eb 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.hpp
index 7e4c5b77f..a9f0d77cb 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.hpp
index 5eca5fea7..bd1f72250 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.hpp
index b0e98411b..60189fd30 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.hpp
index 84609a995..b5aa9ce61 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.hpp
index 2f816bb11..0e20a17ff 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.hpp
index 9cecd4a5b..c8c1cd7e8 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.hpp
index 42e9b7fc7..3e71b4467 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.hpp
index 494f1c3d7..c8286e56d 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.hpp
index a80abb924..36fbc52f1 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.hpp
index 53fd28638..0900cffa3 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.hpp
index df5a4db48..726b56066 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.hpp
index ed78acd92..0ae983860 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/reduce.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/reduce.hpp
index 0038fc26d..a3a39b7cc 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/reduce.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/reduce.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp
index 36eb092f0..c5c2d2cdd 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp
index 83f52fc3e..7c6f189cb 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp
index 046ff5780..33d5cc683 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp
index 8e6a226f6..7668248c3 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp
index 518fa5f98..20eb7bbc9 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp
index 10016cdd7..e8356a929 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp
index cdd5a3cd7..b3f7d4890 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp
index a8be272e0..4190f50a3 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp
index ec8296ff2..b7f334490 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp
index b3877c4bb..53c142f61 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <tuple>
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp
index a6d9a359f..41c67af7a 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp
index 6621a2c86..2d791ff97 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp
index 3dfac98ed..eb9cc1ee2 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp
index 6d2a0c932..68af443a5 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp
index 97dd3dcb1..3bf8704b4 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp
index 58f8760ac..43e54aaca 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp
index df8d31f0d..32c4cd74b 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp
index 1bd773227..f8f5caddb 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp
index 16f129d2d..a034e41a0 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <tuple>
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp
index f80f712ff..3cd374209 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp
index 6f9952e7d..f7d4dd045 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp
index 2cbd13a1b..c49dd4d85 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp
index 7b12522a8..4074ee3b1 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp
index 54d477f80..479fcc92f 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp
index 4ffc44e3a..0dd644fab 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp
index 08cbb8127..50f39396a 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp
index 187d034b9..defa2dbda 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp
index 7fc9ed691..6ff07de23 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <tuple>
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp
index 03be6e2bc..206980cf1 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/utility/algorithm.hpp b/library/include/ck/library/utility/algorithm.hpp
index 86f04dd36..57136f8a2 100644
--- a/library/include/ck/library/utility/algorithm.hpp
+++ b/library/include/ck/library/utility/algorithm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/utility/check_err.hpp b/library/include/ck/library/utility/check_err.hpp
index a89d03d32..7f63a81a0 100644
--- a/library/include/ck/library/utility/check_err.hpp
+++ b/library/include/ck/library/utility/check_err.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/utility/conv_common.hpp b/library/include/ck/library/utility/conv_common.hpp
index 6fad9f7d7..085454f42 100644
--- a/library/include/ck/library/utility/conv_common.hpp
+++ b/library/include/ck/library/utility/conv_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/utility/convolution_host_tensor_descriptor_helper.hpp b/library/include/ck/library/utility/convolution_host_tensor_descriptor_helper.hpp
index 2b4f63b28..ff697fb71 100644
--- a/library/include/ck/library/utility/convolution_host_tensor_descriptor_helper.hpp
+++ b/library/include/ck/library/utility/convolution_host_tensor_descriptor_helper.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/utility/convolution_parameter.hpp b/library/include/ck/library/utility/convolution_parameter.hpp
index f4a2b56f7..df6efca10 100644
--- a/library/include/ck/library/utility/convolution_parameter.hpp
+++ b/library/include/ck/library/utility/convolution_parameter.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/utility/device_memory.hpp b/library/include/ck/library/utility/device_memory.hpp
index 87940e167..1c16ff591 100644
--- a/library/include/ck/library/utility/device_memory.hpp
+++ b/library/include/ck/library/utility/device_memory.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/utility/fill.hpp b/library/include/ck/library/utility/fill.hpp
index c0bc37276..c01e139ea 100644
--- a/library/include/ck/library/utility/fill.hpp
+++ b/library/include/ck/library/utility/fill.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/utility/host_common_util.hpp b/library/include/ck/library/utility/host_common_util.hpp
index 6f4466e8d..20a8f234d 100644
--- a/library/include/ck/library/utility/host_common_util.hpp
+++ b/library/include/ck/library/utility/host_common_util.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/utility/host_gemm.hpp b/library/include/ck/library/utility/host_gemm.hpp
index 44036d023..5eb7e3b8c 100644
--- a/library/include/ck/library/utility/host_gemm.hpp
+++ b/library/include/ck/library/utility/host_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/utility/host_tensor.hpp b/library/include/ck/library/utility/host_tensor.hpp
index 844c29ed1..91293d29f 100644
--- a/library/include/ck/library/utility/host_tensor.hpp
+++ b/library/include/ck/library/utility/host_tensor.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/utility/host_tensor_generator.hpp b/library/include/ck/library/utility/host_tensor_generator.hpp
index 4259862e6..31ff13aec 100644
--- a/library/include/ck/library/utility/host_tensor_generator.hpp
+++ b/library/include/ck/library/utility/host_tensor_generator.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/utility/iterator.hpp b/library/include/ck/library/utility/iterator.hpp
index 9fdc88ea7..b44e2d8e3 100644
--- a/library/include/ck/library/utility/iterator.hpp
+++ b/library/include/ck/library/utility/iterator.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/utility/literals.hpp b/library/include/ck/library/utility/literals.hpp
index a73a2ea05..a8bd6303f 100644
--- a/library/include/ck/library/utility/literals.hpp
+++ b/library/include/ck/library/utility/literals.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/utility/numeric.hpp b/library/include/ck/library/utility/numeric.hpp
index 70a7e87ab..9ee118d47 100644
--- a/library/include/ck/library/utility/numeric.hpp
+++ b/library/include/ck/library/utility/numeric.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/utility/ranges.hpp b/library/include/ck/library/utility/ranges.hpp
index 55c322f1a..f11e4204a 100644
--- a/library/include/ck/library/utility/ranges.hpp
+++ b/library/include/ck/library/utility/ranges.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instance.cpp
index cc8787458..e730e9f58 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instance.cpp
index 04200cfb5..f6696ffa9 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instance.cpp
index 7b86f3cc7..32d6f258b 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instance.cpp
index 2afb1afbc..ee246ba56 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp
index 68d768949..5a9483b30 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp
index 737e5bfca..0fa071923 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp
index e09d01736..2f42f62b0 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp
index 984d66e28..10b4cea7d 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instance.cpp
index 12cada9c4..c687eb20f 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instance.cpp
index 13f198862..b19374ca6 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instance.cpp
index 2ca1adc2f..bbd318ba9 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instance.cpp
index fe5de5279..187ccb5ff 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instance.cpp
index 5b55c8e15..ec2b2646f 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instance.cpp
index 9517e4577..d76cd350c 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instance.cpp
index 43b912440..ef65106c2 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instance.cpp
index 326500fcb..078b241f9 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
index e1bfa88f4..4db05589b 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp
index f59b74253..e25f903a8 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_bias_permute/device_batched_gemm_bias_permute_m2_n3_k1_xdl_c_shuffle_f16_f16_f16_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_bias_permute/device_batched_gemm_bias_permute_m2_n3_k1_xdl_c_shuffle_f16_f16_f16_f16_instance.cpp
index 04a748f45..a0afaabbc 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_bias_permute/device_batched_gemm_bias_permute_m2_n3_k1_xdl_c_shuffle_f16_f16_f16_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_bias_permute/device_batched_gemm_bias_permute_m2_n3_k1_xdl_c_shuffle_f16_f16_f16_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
index 9b96194c8..67dfc4cd3 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp
index 0713dfcd9..9001c901c 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instance.cpp
index 521c3d921..cb89d3cef 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp
index 231d612d7..91eefba0c 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instance.cpp
index 165bc3957..c20798f55 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instance.cpp
index 832fc3b06..3d9ad64b9 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm/device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm/device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
index 99e871247..cf23d01bf 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm/device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm/device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp
index f73e3dea8..a541a2d22 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
index fd0947195..5e481603c 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp
index 53ad7ba5f..501ea85f6 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
index 291a127a6..91ab541bf 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_bf16_instance.cpp
index b62c8b99c..bc95d2f1b 100644
--- a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f16_instance.cpp
index d05b8b592..fbc8d0bc6 100644
--- a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f32_instance.cpp
index e3ef95d12..bed38658a 100644
--- a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f64_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f64_instance.cpp
index 41be396c2..fc5ec77e4 100644
--- a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f64_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f64_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_bf16_instance.cpp
index cd1e05b11..4e38ee13b 100644
--- a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f16_instance.cpp
index 073dd583f..f087eb798 100644
--- a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f32_instance.cpp
index be63bd44c..d0f361401 100644
--- a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f64_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f64_instance.cpp
index fe87091e8..710d07b82 100644
--- a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f64_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f64_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_bf16_instance.cpp
index 2e695afa9..8801c309f 100644
--- a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
 #include "ck/utility/tuple.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f16_instance.cpp
index 9ec761e44..b674cfc42 100644
--- a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
 #include "ck/utility/tuple.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f32_instance.cpp
index f0d26c36b..05e365088 100644
--- a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
 #include "ck/utility/tuple.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f64_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f64_instance.cpp
index 9e4066bb0..15a02af02 100644
--- a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f64_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f64_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
 #include "ck/utility/tuple.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp
index ebbff8834..5587db77e 100644
--- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp
index 980383f3e..26262855e 100644
--- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp
index 2d4b6e348..befc0dcd1 100644
--- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp
index 7caa469f5..e45b47cf9 100644
--- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp
index 093b2f0e9..f437a227d 100644
--- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp
index 0f683e5c2..13fdbeb35 100644
--- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp
index e384993ae..95ef8c492 100644
--- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp
index 92e39c173..290f81d7c 100644
--- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp
index 5118d0d03..16fd1cb40 100644
--- a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp
index 655d4f006..ff37bf7cc 100644
--- a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp
index a9d20be18..8a1f6f933 100644
--- a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp
index a68f5c971..d333f5972 100644
--- a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp
index 0aa927155..4c87b51a9 100644
--- a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp
index b84ea274c..fd3f57c6b 100644
--- a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp
index 578469997..1e53f0b2f 100644
--- a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp
index 8e5a19313..d02d146a9 100644
--- a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
diff --git a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp
index 5a5c83842..e3e90c966 100644
--- a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp
index e0f3d6199..81e9122d9 100644
--- a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp
index 30537d937..dbc82168f 100644
--- a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp
index 190c39b87..3ac250f3e 100644
--- a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instance.cpp
index e14cd5586..fcb858728 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instance.cpp
index f001b83c1..2baa6ae06 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instance.cpp
index 83ba6a1c6..28867fe19 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp
index 1da9a81d9..40656e382 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp
index 7c33df5e7..422e37e92 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp
index a5f8629f2..5993f6bd7 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp
index 8076d6d35..bb9b69686 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp
index 33503b9f8..da96c79a6 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp
index c5e4bd199..78e9c893c 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp
index f43d13e30..4663a9ac3 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp
index 0ce6b04c4..0b1df52f8 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp
index 76ab3189d..9969a8bbc 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp
index f8c255088..e34ea06ff 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp
index fe7152471..3254fcfc2 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp
index 04ce7c076..94b2a47e5 100644
--- a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp
index 0251d9157..4244ab7b8 100644
--- a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp
index c2975727e..5c7db4ca3 100644
--- a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp
index fc86d7302..ebc56487a 100644
--- a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp b/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp
index 182037f15..a62c9e235 100644
--- a/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/elementwise_normalization/device_elementwise_normalization_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/elementwise_normalization/device_elementwise_normalization_f16_instance.cpp
index b160d4fe1..3e2386ee0 100644
--- a/library/src/tensor_operation_instance/gpu/elementwise_normalization/device_elementwise_normalization_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/elementwise_normalization/device_elementwise_normalization_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp
index 35df85b7a..ea99a5a30 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp
index 7d0863c95..b83acfa8c 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp
index c1b11b19d..d5800e033 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp
index 9a889e8d5..abe52ce1d 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp
index d8e779843..e696bfdcd 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp
index 0034ac59c..d3ad7c60e 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp
index 0b540b8b3..a56a36b0a 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp
index 4f6ff5111..63d55e81d 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp
index 9f5cebcab..3d9a265c2 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp
index 41afb519f..240384d19 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp
index d1173095f..e96905247 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp
index efda345a8..124b818b2 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp
index 03eebf4ec..2e884dfc8 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instance.cpp
index 5d8de04cd..2ca29b1e6 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instance.cpp
index 7b12b7cf1..706076098 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instance.cpp
index 730ffd463..5ac458a7b 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp
index 619473ff0..a64412544 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp
index 8e06f9d26..44b684823 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp
index f9458b748..23176269c 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp
index 77a03b746..31a9abe53 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp
index ef8d7d4e4..201fd9311 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instance.cpp
index cb65cc7b6..5d489b207 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instance.cpp
index 5b1014ed8..e09480d57 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instance.cpp
index e6f6add8b..34065c334 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instance.cpp
index 80b3d03da..95d7777a7 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
index 93b3df1e5..bf24bc76b 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
index f10365d89..023f98712 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
index a7a9eb62c..ffb199e58 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
index 9fb45b003..90e979d89 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp
index 18a78674e..8a81b7789 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp
index cef6070af..e1983add0 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp
index 1be70d6ca..47a180e12 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp
index 6b8455ffa..b8e994e91 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp
index b9e28e3d7..a590413ac 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_nk_mn_instance.cpp
index 2b1a5a57b..1d010d1b0 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp
index 301d3b55b..f108b7534 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp
index cd16f35ff..b0b4bc012 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp
index 391666984..df3bd94fc 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp
index 0a623034e..73b4e7766 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp
index 5ef8d08de..76137a1c3 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp
index c9557bae8..f0158d8f3 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp
index 463e0865c..7b65f8737 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp
index b71ff1b99..a9fef5c60 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp
index 9060c9b1b..c3b1dfcca 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp
index 81cf01d6a..8338b34a4 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp
index 4da85cc46..357dc91aa 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp
index ab83e4baa..65b94d087 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
index a4cd3fadb..00cd07bab 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
index 207e76ffe..9955a206b 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp
index 3f30937ff..d9ae78179 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp
index d91e6c63b..27c6cbe8a 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp
index 143321542..ff5f0e94a 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp
index 09acc7c0f..dba625e0a 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_kn_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_kn_mn_mn_mn_instance.cpp
index 47b8d2342..28a452c1a 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_kn_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_kn_mn_mn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_nk_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_nk_mn_mn_mn_instance.cpp
index efa030ec4..13366238d 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_nk_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_nk_mn_mn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_kn_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_kn_mn_mn_mn_instance.cpp
index f2735020e..8a4889ee8 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_kn_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_kn_mn_mn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_nk_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_nk_mn_mn_mn_instance.cpp
index 7d4aae928..fc3cbcf90 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_nk_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_nk_mn_mn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp
index e8747af48..bfb95bce8 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp
index ed54c3a9b..d0352339c 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp
index da7eae637..d5b298ab2 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp
index 34345095e..80c8f018f 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp
index 55461dfba..74ec9e1f8 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp
index 405e69975..eb98b3e7e 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
index 9af31b3a1..5f4a90125 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
index 841b7a1d4..38e3897d6 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp
index 9f7f643be..c5aa59f91 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp
index c8e9f35d2..e71b269b8 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp
index 5f804d45a..fdf63f811 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp
index 60cb138f5..4c98a9d5e 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp
index 59e2b2da8..f0e7b6ab4 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp
index bb09bf8b8..56815b9ac 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp
index 0a3b566de..e66d46a26 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp
index 2b17e47b1..fb1dfac69 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instance.cpp
index e178d3b0a..fed2cbbfb 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp
index 52be9fe70..44ac4c08c 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp
index c4680db83..30a2bf36d 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp
index 7fc35c419..4b2a2dbdc 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instance.cpp
index f27b2199e..9d15ccd36 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instance.cpp
index b9a109557..4e9ad5874 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instance.cpp
index 44e5f597d..330e5aff9 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instance.cpp
index f3a9063f7..0db3a15d2 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp
index 05ba44924..ccbfaeaf4 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_instance.cpp
index 7a610a747..e10de67f9 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_instance.cpp
index 90e074f05..8b47e82f6 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp
index 74aebf103..5aa50adb3 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f16_instance.cpp
index 361ea8f4e..333b40c71 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f32_instance.cpp
index 3145b7164..506a93ae9 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_int8_instance.cpp
index cde93f902..30084f16d 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_int8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
index 3d604d42c..11babea28 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp
index ede21f1f4..c2c0fc553 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
index 99e556618..5be7443ec 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
index 15871a28c..2828b432f 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_common.hpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_common.hpp
index b4de825fb..85a7a5be2 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_common.hpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
index f7e575df2..5eb881549 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "device_grouped_conv2d_fwd_dl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
index 85300b4e4..4157853c4 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "device_grouped_conv2d_fwd_dl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_instance.hpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_instance.hpp
index bcda22006..3d3f9b179 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_instance.hpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
 #include "device_grouped_conv2d_fwd_common.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp
index 40593a0ef..bc7d577b6 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "device_grouped_conv2d_fwd_xdl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
index 7088028bf..55bf6da9b 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "device_grouped_conv2d_fwd_xdl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
index 919274c50..202cdd6b4 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "device_grouped_conv2d_fwd_xdl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_instance.hpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_instance.hpp
index 2858671ee..07bea1c03 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_instance.hpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
 #include "device_grouped_conv2d_fwd_common.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
index 25caf61df..82edf896d 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "device_grouped_conv2d_fwd_xdl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
index b997cfb67..4bd3236ca 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "device_grouped_conv2d_fwd_xdl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
index 3256a2a82..4f5bdb202 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "device_grouped_conv2d_fwd_xdl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp
index e48db4a53..7ae87eed5 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
index 1655850ec..ab07341b5 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
index aba46b7eb..15045bedd 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp
index b4ae8b6ce..c3b100ea6 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
index 061674bd8..ca488e9dc 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
index ed7e54767..6087b1b18 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instance.cpp
index bf5fa3060..fd8c47deb 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_bf16_instance.cpp
index 8c3849373..5e5dbc53c 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f16_instance.cpp
index 487cd2272..a88fe4af6 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f32_instance.cpp
index d497cd57e..f6e7e5b28 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_int8_instance.cpp
index 2e53fbbda..3d303a3fa 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_int8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp
index b550bb287..aa161e51c 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp
index a3f9c7a9e..c454deac1 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp
index 5f5d6c9b5..c829e8863 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp
index 2ace1b243..fb30e7a97 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp
index 764ec0619..8642562fa 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp
index a3d73440e..83b31b07c 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp
index cffb0fce1..aa6365cd9 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp
index dddfa2aa4..f4460b360 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_km_kn_mn_instance.cpp
index c2f5f00c7..f4086b6ea 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_km_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_km_nk_mn_instance.cpp
index 476d4ce1f..d68eb7614 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_km_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_mk_kn_mn_instance.cpp
index 1023fa481..2dfb8caac 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_mk_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_mk_nk_mn_instance.cpp
index 6b065c0f8..598a0b0e2 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp
index e9c2112e1..be860f58e 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "normalization_instance_common.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp
index 79dde38fc..9a64e555d 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "normalization_instance_common.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp
index 9f6bf128f..fe72a2733 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "normalization_instance_common.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp
index 6241e0338..cac8641e1 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "normalization_instance_common.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp
index b64328d5d..0a9ac8462 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "normalization_instance_common.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp
index d6a2f6f2c..ad92818ec 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "normalization_instance_common.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp
index 73097828e..70e3bbc1c 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "normalization_instance_common.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp
index 507a683ee..7c5d2c4a9 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "normalization_instance_common.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp
index ca1aa0c25..f5626d4a9 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "normalization_instance_common.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp b/library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp
index 9dea41e89..d9029ac25 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f16_instance.cpp
index 38338ff99..508ad3873 100644
--- a/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "pool_fwd_instance_common.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f32_instance.cpp
index 0f4a35dee..ada96a93a 100644
--- a/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "pool_fwd_instance_common.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool3d_fwd_ndhwc_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool3d_fwd_ndhwc_f16_instance.cpp
index 6fcb519a9..62bcad992 100644
--- a/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool3d_fwd_ndhwc_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool3d_fwd_ndhwc_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "pool_fwd_instance_common.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool3d_fwd_ndhwc_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool3d_fwd_ndhwc_f32_instance.cpp
index 67ffd4708..47896be91 100644
--- a/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool3d_fwd_ndhwc_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool3d_fwd_ndhwc_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "pool_fwd_instance_common.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool2d_fwd_nhwc_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool2d_fwd_nhwc_f16_instance.cpp
index a41cd0094..35c8522d9 100644
--- a/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool2d_fwd_nhwc_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool2d_fwd_nhwc_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "pool_fwd_instance_common.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool2d_fwd_nhwc_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool2d_fwd_nhwc_f32_instance.cpp
index fa70569ba..75b7629f2 100644
--- a/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool2d_fwd_nhwc_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool2d_fwd_nhwc_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "pool_fwd_instance_common.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool3d_fwd_ndhwc_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool3d_fwd_ndhwc_f16_instance.cpp
index f3367b946..dbfc4acfd 100644
--- a/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool3d_fwd_ndhwc_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool3d_fwd_ndhwc_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "pool_fwd_instance_common.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool3d_fwd_ndhwc_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool3d_fwd_ndhwc_f32_instance.cpp
index 8477a884d..63b3e8df8 100644
--- a/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool3d_fwd_ndhwc_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool3d_fwd_ndhwc_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "pool_fwd_instance_common.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/pool_fwd/pool_fwd_instance_common.hpp b/library/src/tensor_operation_instance/gpu/pool_fwd/pool_fwd_instance_common.hpp
index cd508b55b..8aa707885 100644
--- a/library/src/tensor_operation_instance/gpu/pool_fwd/pool_fwd_instance_common.hpp
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/pool_fwd_instance_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/conv2d_quantization_common.hpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/conv2d_quantization_common.hpp
index 672cdba65..711314985 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/conv2d_quantization_common.hpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/conv2d_quantization_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perchannel_quantization_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perchannel_quantization_int8_instance.cpp
index d4b5484d8..39c4f82fe 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perchannel_quantization_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perchannel_quantization_int8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_conv2d_dl_int8_instance.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perlayer_quantization_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perlayer_quantization_int8_instance.cpp
index 7db4b8d86..92e73eb2e 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perlayer_quantization_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perlayer_quantization_int8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_conv2d_dl_int8_instance.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_int8_instance.hpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_int8_instance.hpp
index 7eefbe038..bb7a570cd 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_int8_instance.hpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_int8_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_perchannel_quantization_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_perchannel_quantization_int8_instance.cpp
index c8f5f7042..1d8b58fd1 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_perchannel_quantization_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_perchannel_quantization_int8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_conv2d_dl_int8_instance.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_perlayer_quantization_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_perlayer_quantization_int8_instance.cpp
index d7f7384ff..62826d0d2 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_perlayer_quantization_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_perlayer_quantization_int8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_conv2d_dl_int8_instance.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp
index 658aa8370..99877c32d 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_conv2d_xdl_int8_instance.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp
index 7102e9b25..50ccc69f4 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_conv2d_xdl_int8_instance.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_int8_instance.hpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_int8_instance.hpp
index 90f8791aa..caced6c95 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_int8_instance.hpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_int8_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_perchannel_quantization_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_perchannel_quantization_int8_instance.cpp
index 9d6937708..526fe7346 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_perchannel_quantization_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_perchannel_quantization_int8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_conv2d_xdl_int8_instance.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_perlayer_quantization_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_perlayer_quantization_int8_instance.cpp
index d6f87335b..d1d5a66bb 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_perlayer_quantization_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_perlayer_quantization_int8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_conv2d_xdl_int8_instance.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_instance.hpp b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_instance.hpp
index 9cad8d4c8..0ec498a44 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_instance.hpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gemm_quantization_common.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
index ffe1efb80..4c02f8179 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_quantization_dl_c_shuffle_i8_i8_i8_instance.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
index 7f24e5677..2c635e96d 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_quantization_dl_c_shuffle_i8_i8_i8_instance.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
index 06e66cfe0..c5a52f2cf 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_quantization_dl_c_shuffle_i8_i8_i8_instance.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
index 16635d1e9..fe4a0b4d6 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_quantization_dl_c_shuffle_i8_i8_i8_instance.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_instance.hpp b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_instance.hpp
index dfb8dc29b..d1b18ac49 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_instance.hpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gemm_quantization_common.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
index c153cdf9e..e3163c32e 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_instance.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
index f6cd32026..f6a0163ab 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_instance.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
index 45fbacc33..0cfffbed7 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_instance.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
index 257633fe1..289898257 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_instance.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/quantization/gemm/gemm_quantization_common.hpp b/library/src/tensor_operation_instance/gpu/quantization/gemm/gemm_quantization_common.hpp
index 213f42b91..e7c2500fe 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/gemm/gemm_quantization_common.hpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/gemm/gemm_quantization_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.cpp
index cf46059a0..4771b4aa5 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.cpp
index 0043b1984..f684dabc1 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.cpp
index 6f702ddf1..86e2ca7c4 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.cpp
index d1f70dc99..97d09ff83 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.cpp
index a957981a5..87187f493 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.cpp
index 550a9cd76..d3f5cd865 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.cpp
index 58cb6ee34..fb4a50d5e 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.cpp
index 1ac5e79bc..f74d22f7a 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.cpp
index b1e1a0680..0982455ad 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.cpp
index 1a15b32d2..c6f1c3a63 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.cpp
index 119f384b4..db9f52a13 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.cpp
index 3f1bd86b8..1410f51be 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.cpp
index b507f0d1f..707a65570 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.cpp
index 04d0ea2e8..17f45c332 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.cpp
index 3de561f2b..d3753d01c 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.cpp
index 3f45b0313..83fad7826 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.cpp
index 76851d9b7..c627f6863 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.cpp
index 9cef01932..09686d98f 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.cpp
index ce73ec47e..44519032f 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.cpp
index ed6091f92..9e14d80e2 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.cpp
index 4c8375de1..ab217f116 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.cpp
index 0fa93ab68..3f611da7e 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.cpp
index 821eec175..8695db400 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.cpp
index 0305b4945..8d15fda78 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.cpp
index 1bda0bcc7..c161858b1 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.cpp
index 7f8018a04..e6b87fdea 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.cpp
index 887a89cc2..c2e4acc8d 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.cpp
index 0cc810363..e5a5e07e1 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.cpp
index 4c825a9f1..22c620334 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.cpp
index bf26913fd..1890fb78c 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.cpp
index 629299c7b..dc5caf297 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.cpp
index 9a0863449..02bb0c3b1 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.cpp
index 6dc925bd6..0b0a40303 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.cpp
index 470d68d37..fcf95aab7 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.cpp
index 39303ab58..5151d32a5 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.cpp
index a5481784e..5463cc2b4 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.cpp
index aa6e6d3cc..a15e99fe0 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.cpp
index b1ea551ea..ee7a4dd4c 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.cpp
index 2ba83132d..58f7295a8 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.cpp
index b9018e7c6..43f2370a2 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.cpp
index 1abc6de55..eaf84a6de 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.cpp
index f3a017aeb..76d63c9a9 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.cpp
index 329617bb4..eb1d389c3 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.cpp
index 1e4d43deb..61225df05 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.cpp
index f9f79675f..aef231e17 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.cpp
index d3e7268c5..6d2939e3e 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.cpp
index a41a12386..ff0ddd773 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.cpp
index 6da1acc4e..136702206 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.cpp
index f14b8a403..f0f930191 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.cpp
index 5a9f08167..db9f15660 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.cpp
index d3aff0675..e5644faf2 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.cpp
index 55f7537d8..350259f5f 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.cpp
index 70f91168d..89cae5527 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.cpp
index 47f5e67fe..b0f82ab45 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.cpp
index eae489ff0..fedd0f890 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.cpp
index 9fb267a20..64ec1ce15 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.cpp
index fecb2691f..156f182f5 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.cpp
index 232d2b858..22b278790 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.cpp
index 07d45c4ca..f3070d297 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.cpp
index 596a062f3..e17476ba7 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.cpp
index 7270cefe8..2387725b9 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.cpp
index d0f4ef3df..8be94118c 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.cpp
index 9c6bce92f..9ff01615c 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.cpp
index 5faf8d828..8a99c7c95 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.cpp
index 8f3c72451..78909c8db 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.cpp
index 8d7794f42..7f094c28b 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.cpp
index 4a32543a1..4dfb477c1 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.cpp
index 26d571c84..e49283360 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.cpp
index ae56a2a91..de8dce357 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.cpp
index aae3233c9..0225e75cb 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.cpp
index 94d5d3fa2..6d04d2128 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.cpp
index dad190a63..046643abe 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.cpp
index b7ca6998f..a61302c00 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.cpp
index 22c40187e..7619e12a8 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp
index 14d276452..a86da7cc7 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp
index fa334b997..938fb033a 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp
index 1c9d37d84..3d5659381 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp
index 5fbdab505..d701b4174 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp
index 7dd8640b1..2085aafc5 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp
index b32fe6838..ebe4329f9 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp
index c05048ec5..b8fd5a1e5 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp
index 6a235708b..112f1940d 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp
index e5bec5e26..ab8a69eec 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp
index 57d3f184a..5382fec90 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp
index fae3a4dd6..a1a143afa 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp
index b6fb70e8e..992e0c1ec 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp
index 33c7b6f35..2be1f45bb 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp
index c22aa574b..a1da73aa8 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp
index 55f3d2bd2..b5c3b576a 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp
index fb0bcf5ee..22a0404c0 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp
index 608cfcf83..81a2ff80c 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp
index 15552dbae..3e2cf8d06 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp
index 676740288..c8b038d50 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
index 4b33da93c..08995d99e 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
index fe3b823e8..652601ee7 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
index 8ecdf87d9..86caac1b6 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
index 356313520..c46ae1a4e 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp
index aa21a0bf8..394814ff5 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/utility/convolution_parameter.cpp b/library/src/utility/convolution_parameter.cpp
index c8712d209..57cedd601 100644
--- a/library/src/utility/convolution_parameter.cpp
+++ b/library/src/utility/convolution_parameter.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/host_utility/io.hpp"
 
diff --git a/library/src/utility/device_memory.cpp b/library/src/utility/device_memory.cpp
index 90f943313..11166783e 100644
--- a/library/src/utility/device_memory.cpp
+++ b/library/src/utility/device_memory.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/host_utility/hip_check_error.hpp"
 
diff --git a/library/src/utility/host_tensor.cpp b/library/src/utility/host_tensor.cpp
index e34fbc8f3..721155264 100644
--- a/library/src/utility/host_tensor.cpp
+++ b/library/src/utility/host_tensor.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cassert>
 
diff --git a/profiler/include/profiler/data_type_enum.hpp b/profiler/include/profiler/data_type_enum.hpp
index afcd6fea2..c046c7fab 100644
--- a/profiler/include/profiler/data_type_enum.hpp
+++ b/profiler/include/profiler/data_type_enum.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_batched_gemm_add_relu_gemm_add_impl.hpp b/profiler/include/profiler/profile_batched_gemm_add_relu_gemm_add_impl.hpp
index b16254279..22dab3110 100644
--- a/profiler/include/profiler/profile_batched_gemm_add_relu_gemm_add_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_add_relu_gemm_add_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp b/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp
index 799dccc0f..5bee67c1c 100644
--- a/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp b/profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp
index 1583c6db2..f3d2c5561 100644
--- a/profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_batched_gemm_impl.hpp b/profiler/include/profiler/profile_batched_gemm_impl.hpp
index c07d7c055..cdc94aa9a 100644
--- a/profiler/include/profiler/profile_batched_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_batched_gemm_reduce_impl.hpp b/profiler/include/profiler/profile_batched_gemm_reduce_impl.hpp
index 45b7b7738..901fa338d 100644
--- a/profiler/include/profiler/profile_batched_gemm_reduce_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_reduce_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp
index f5ec23514..15a21206c 100644
--- a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp
index 91c28f25f..f2fcb0b13 100644
--- a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_batchnorm_backward_impl.hpp b/profiler/include/profiler/profile_batchnorm_backward_impl.hpp
index 79d886208..3343b5e66 100644
--- a/profiler/include/profiler/profile_batchnorm_backward_impl.hpp
+++ b/profiler/include/profiler/profile_batchnorm_backward_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_batchnorm_forward_impl.hpp b/profiler/include/profiler/profile_batchnorm_forward_impl.hpp
index 82fe75bf0..2f9538b16 100644
--- a/profiler/include/profiler/profile_batchnorm_forward_impl.hpp
+++ b/profiler/include/profiler/profile_batchnorm_forward_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_batchnorm_infer_impl.hpp b/profiler/include/profiler/profile_batchnorm_infer_impl.hpp
index ca6533934..1b31a2aab 100644
--- a/profiler/include/profiler/profile_batchnorm_infer_impl.hpp
+++ b/profiler/include/profiler/profile_batchnorm_infer_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_conv_bwd_data_impl.hpp b/profiler/include/profiler/profile_conv_bwd_data_impl.hpp
index 86d394daf..52152a90f 100644
--- a/profiler/include/profiler/profile_conv_bwd_data_impl.hpp
+++ b/profiler/include/profiler/profile_conv_bwd_data_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_conv_fwd_bias_relu_add_impl.hpp b/profiler/include/profiler/profile_conv_fwd_bias_relu_add_impl.hpp
index 1aebef8bb..436fbdbd7 100644
--- a/profiler/include/profiler/profile_conv_fwd_bias_relu_add_impl.hpp
+++ b/profiler/include/profiler/profile_conv_fwd_bias_relu_add_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_conv_fwd_bias_relu_impl.hpp b/profiler/include/profiler/profile_conv_fwd_bias_relu_impl.hpp
index 2bac14433..808c1a1c9 100644
--- a/profiler/include/profiler/profile_conv_fwd_bias_relu_impl.hpp
+++ b/profiler/include/profiler/profile_conv_fwd_bias_relu_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_conv_fwd_impl.hpp b/profiler/include/profiler/profile_conv_fwd_impl.hpp
index 1f3ba8f00..bc2eb2579 100644
--- a/profiler/include/profiler/profile_conv_fwd_impl.hpp
+++ b/profiler/include/profiler/profile_conv_fwd_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_elementwise_layernorm_impl.hpp b/profiler/include/profiler/profile_elementwise_layernorm_impl.hpp
index 7707e16b0..1fd9c8110 100644
--- a/profiler/include/profiler/profile_elementwise_layernorm_impl.hpp
+++ b/profiler/include/profiler/profile_elementwise_layernorm_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_gemm_add_add_fastgelu_impl.hpp b/profiler/include/profiler/profile_gemm_add_add_fastgelu_impl.hpp
index 3cc2ea3b9..81b8d8ddb 100644
--- a/profiler/include/profiler/profile_gemm_add_add_fastgelu_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_add_add_fastgelu_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_gemm_add_fastgelu_impl.hpp b/profiler/include/profiler/profile_gemm_add_fastgelu_impl.hpp
index d53a6589e..6f6d881c1 100644
--- a/profiler/include/profiler/profile_gemm_add_fastgelu_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_add_fastgelu_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_gemm_add_multiply_impl.hpp b/profiler/include/profiler/profile_gemm_add_multiply_impl.hpp
index 40093e774..25871dfb2 100644
--- a/profiler/include/profiler/profile_gemm_add_multiply_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_add_multiply_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_gemm_add_relu_add_layernorm_impl.hpp b/profiler/include/profiler/profile_gemm_add_relu_add_layernorm_impl.hpp
index e1c90f0f5..4c3d0a045 100644
--- a/profiler/include/profiler/profile_gemm_add_relu_add_layernorm_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_add_relu_add_layernorm_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_gemm_bias_add_reduce_impl.hpp b/profiler/include/profiler/profile_gemm_bias_add_reduce_impl.hpp
index b4ec78cdf..c0ffea8a3 100644
--- a/profiler/include/profiler/profile_gemm_bias_add_reduce_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_bias_add_reduce_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_gemm_bilinear_impl.hpp b/profiler/include/profiler/profile_gemm_bilinear_impl.hpp
index 31bae281c..b540e938b 100644
--- a/profiler/include/profiler/profile_gemm_bilinear_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_bilinear_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_gemm_fastgelu_impl.hpp b/profiler/include/profiler/profile_gemm_fastgelu_impl.hpp
index f9a544c04..3893f8cdc 100644
--- a/profiler/include/profiler/profile_gemm_fastgelu_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_fastgelu_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_gemm_impl.hpp b/profiler/include/profiler/profile_gemm_impl.hpp
index 9b164104b..eaab5dbcc 100644
--- a/profiler/include/profiler/profile_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_gemm_reduce_impl.hpp b/profiler/include/profiler/profile_gemm_reduce_impl.hpp
index 370121a3c..ff801e8af 100644
--- a/profiler/include/profiler/profile_gemm_reduce_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_reduce_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_gemm_splitk_impl.hpp b/profiler/include/profiler/profile_gemm_splitk_impl.hpp
index ab1bce258..6ffa31678 100644
--- a/profiler/include/profiler/profile_gemm_splitk_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_splitk_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp
index 4f9aa9837..dc6739773 100644
--- a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp
index b201a2ed3..9fadfe969 100644
--- a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_grouped_gemm_fastgelu_impl.hpp b/profiler/include/profiler/profile_grouped_gemm_fastgelu_impl.hpp
index 87e6ae44c..f05b13b74 100644
--- a/profiler/include/profiler/profile_grouped_gemm_fastgelu_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_gemm_fastgelu_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_grouped_gemm_impl.hpp b/profiler/include/profiler/profile_grouped_gemm_impl.hpp
index 9abb5e7a5..09a651d77 100644
--- a/profiler/include/profiler/profile_grouped_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_gemm_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_groupnorm_impl.hpp b/profiler/include/profiler/profile_groupnorm_impl.hpp
index 73343f6be..ebefe3dad 100644
--- a/profiler/include/profiler/profile_groupnorm_impl.hpp
+++ b/profiler/include/profiler/profile_groupnorm_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_layernorm_impl.hpp b/profiler/include/profiler/profile_layernorm_impl.hpp
index 7dd90d079..2d87c8c8f 100644
--- a/profiler/include/profiler/profile_layernorm_impl.hpp
+++ b/profiler/include/profiler/profile_layernorm_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_pool2d_fwd_impl.hpp b/profiler/include/profiler/profile_pool2d_fwd_impl.hpp
index c313a00be..0c888db1f 100644
--- a/profiler/include/profiler/profile_pool2d_fwd_impl.hpp
+++ b/profiler/include/profiler/profile_pool2d_fwd_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_pool3d_fwd_impl.hpp b/profiler/include/profiler/profile_pool3d_fwd_impl.hpp
index c9e4c193f..41b57fd85 100644
--- a/profiler/include/profiler/profile_pool3d_fwd_impl.hpp
+++ b/profiler/include/profiler/profile_pool3d_fwd_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_reduce_impl.hpp b/profiler/include/profiler/profile_reduce_impl.hpp
index e61820029..b54aa65ae 100644
--- a/profiler/include/profiler/profile_reduce_impl.hpp
+++ b/profiler/include/profiler/profile_reduce_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_softmax_impl.hpp b/profiler/include/profiler/profile_softmax_impl.hpp
index 96816f53b..65b4be2a6 100644
--- a/profiler/include/profiler/profile_softmax_impl.hpp
+++ b/profiler/include/profiler/profile_softmax_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/src/profile_avg_pool2d_fwd.cpp b/profiler/src/profile_avg_pool2d_fwd.cpp
index b92288096..c67897c04 100644
--- a/profiler/src/profile_avg_pool2d_fwd.cpp
+++ b/profiler/src/profile_avg_pool2d_fwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <vector>
diff --git a/profiler/src/profile_batched_gemm.cpp b/profiler/src/profile_batched_gemm.cpp
index 907a37379..dc83e25b4 100644
--- a/profiler/src/profile_batched_gemm.cpp
+++ b/profiler/src/profile_batched_gemm.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdint>
 #include <iostream>
diff --git a/profiler/src/profile_batched_gemm_add_relu_gemm_add.cpp b/profiler/src/profile_batched_gemm_add_relu_gemm_add.cpp
index f440a3094..3d29c4b84 100644
--- a/profiler/src/profile_batched_gemm_add_relu_gemm_add.cpp
+++ b/profiler/src/profile_batched_gemm_add_relu_gemm_add.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/profiler/src/profile_batched_gemm_gemm.cpp b/profiler/src/profile_batched_gemm_gemm.cpp
index 6015c93be..9a99874d1 100644
--- a/profiler/src/profile_batched_gemm_gemm.cpp
+++ b/profiler/src/profile_batched_gemm_gemm.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/profiler/src/profile_batched_gemm_reduce.cpp b/profiler/src/profile_batched_gemm_reduce.cpp
index 6b1dfc014..9620d63ca 100644
--- a/profiler/src/profile_batched_gemm_reduce.cpp
+++ b/profiler/src/profile_batched_gemm_reduce.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/profiler/src/profile_batchnorm_bwd.cpp b/profiler/src/profile_batchnorm_bwd.cpp
index 44ce7350f..1738d53db 100644
--- a/profiler/src/profile_batchnorm_bwd.cpp
+++ b/profiler/src/profile_batchnorm_bwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <vector>
diff --git a/profiler/src/profile_batchnorm_fwd.cpp b/profiler/src/profile_batchnorm_fwd.cpp
index 902a1fc42..2b3e4eea4 100644
--- a/profiler/src/profile_batchnorm_fwd.cpp
+++ b/profiler/src/profile_batchnorm_fwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <vector>
diff --git a/profiler/src/profile_batchnorm_infer.cpp b/profiler/src/profile_batchnorm_infer.cpp
index 92c16859c..f1c19bc36 100644
--- a/profiler/src/profile_batchnorm_infer.cpp
+++ b/profiler/src/profile_batchnorm_infer.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <vector>
diff --git a/profiler/src/profile_conv_bwd_data.cpp b/profiler/src/profile_conv_bwd_data.cpp
index 9241ead73..465abacc4 100644
--- a/profiler/src/profile_conv_bwd_data.cpp
+++ b/profiler/src/profile_conv_bwd_data.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/profiler/src/profile_conv_fwd.cpp b/profiler/src/profile_conv_fwd.cpp
index b57ee7fd9..701999d8a 100644
--- a/profiler/src/profile_conv_fwd.cpp
+++ b/profiler/src/profile_conv_fwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/profiler/src/profile_conv_fwd_bias_relu.cpp b/profiler/src/profile_conv_fwd_bias_relu.cpp
index b44007cde..31055ec1d 100644
--- a/profiler/src/profile_conv_fwd_bias_relu.cpp
+++ b/profiler/src/profile_conv_fwd_bias_relu.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/profiler/src/profile_conv_fwd_bias_relu_add.cpp b/profiler/src/profile_conv_fwd_bias_relu_add.cpp
index 408dd02f7..8c2439a0c 100644
--- a/profiler/src/profile_conv_fwd_bias_relu_add.cpp
+++ b/profiler/src/profile_conv_fwd_bias_relu_add.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/profiler/src/profile_gemm.cpp b/profiler/src/profile_gemm.cpp
index 61bae6ae7..b3587ea98 100644
--- a/profiler/src/profile_gemm.cpp
+++ b/profiler/src/profile_gemm.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/profiler/src/profile_gemm_add_add_fastgelu.cpp b/profiler/src/profile_gemm_add_add_fastgelu.cpp
index c3c0fb7b6..8af3768a4 100644
--- a/profiler/src/profile_gemm_add_add_fastgelu.cpp
+++ b/profiler/src/profile_gemm_add_add_fastgelu.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/profiler/src/profile_gemm_add_fastgelu.cpp b/profiler/src/profile_gemm_add_fastgelu.cpp
index 380b25a61..a09bb8340 100644
--- a/profiler/src/profile_gemm_add_fastgelu.cpp
+++ b/profiler/src/profile_gemm_add_fastgelu.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/profiler/src/profile_gemm_add_multiply.cpp b/profiler/src/profile_gemm_add_multiply.cpp
index 7d6fead40..560467c26 100644
--- a/profiler/src/profile_gemm_add_multiply.cpp
+++ b/profiler/src/profile_gemm_add_multiply.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/profiler/src/profile_gemm_add_relu_add_layernorm.cpp b/profiler/src/profile_gemm_add_relu_add_layernorm.cpp
index 5cbc3d21f..558d255ce 100644
--- a/profiler/src/profile_gemm_add_relu_add_layernorm.cpp
+++ b/profiler/src/profile_gemm_add_relu_add_layernorm.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/profiler/src/profile_gemm_bias_add_reduce.cpp b/profiler/src/profile_gemm_bias_add_reduce.cpp
index 6d86db082..76daffbc6 100644
--- a/profiler/src/profile_gemm_bias_add_reduce.cpp
+++ b/profiler/src/profile_gemm_bias_add_reduce.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/profiler/src/profile_gemm_bilinear.cpp b/profiler/src/profile_gemm_bilinear.cpp
index 3480014ba..a1a48616b 100644
--- a/profiler/src/profile_gemm_bilinear.cpp
+++ b/profiler/src/profile_gemm_bilinear.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/profiler/src/profile_gemm_fastgelu.cpp b/profiler/src/profile_gemm_fastgelu.cpp
index 2a137224c..93573002e 100644
--- a/profiler/src/profile_gemm_fastgelu.cpp
+++ b/profiler/src/profile_gemm_fastgelu.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/profiler/src/profile_gemm_reduce.cpp b/profiler/src/profile_gemm_reduce.cpp
index 395bf0627..48f6f5eb4 100644
--- a/profiler/src/profile_gemm_reduce.cpp
+++ b/profiler/src/profile_gemm_reduce.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/profiler/src/profile_gemm_splitk.cpp b/profiler/src/profile_gemm_splitk.cpp
index f636ce718..cc2da73cb 100644
--- a/profiler/src/profile_gemm_splitk.cpp
+++ b/profiler/src/profile_gemm_splitk.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/profiler/src/profile_grouped_conv_bwd_weight.cpp b/profiler/src/profile_grouped_conv_bwd_weight.cpp
index dfd8a099f..7a062ed51 100644
--- a/profiler/src/profile_grouped_conv_bwd_weight.cpp
+++ b/profiler/src/profile_grouped_conv_bwd_weight.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <initializer_list>
diff --git a/profiler/src/profile_grouped_conv_fwd.cpp b/profiler/src/profile_grouped_conv_fwd.cpp
index 9ff3c15af..d0b424cde 100644
--- a/profiler/src/profile_grouped_conv_fwd.cpp
+++ b/profiler/src/profile_grouped_conv_fwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/profiler/src/profile_grouped_gemm.cpp b/profiler/src/profile_grouped_gemm.cpp
index 34647adab..d023db54d 100644
--- a/profiler/src/profile_grouped_gemm.cpp
+++ b/profiler/src/profile_grouped_gemm.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/profiler/src/profile_grouped_gemm_fastgelu.cpp b/profiler/src/profile_grouped_gemm_fastgelu.cpp
index 9b6142f01..50ecf25ca 100644
--- a/profiler/src/profile_grouped_gemm_fastgelu.cpp
+++ b/profiler/src/profile_grouped_gemm_fastgelu.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/profiler/src/profile_groupnorm.cpp b/profiler/src/profile_groupnorm.cpp
index d55529a0f..d55784ff0 100644
--- a/profiler/src/profile_groupnorm.cpp
+++ b/profiler/src/profile_groupnorm.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <vector>
diff --git a/profiler/src/profile_layernorm.cpp b/profiler/src/profile_layernorm.cpp
index e93fc2dbd..7bf210e67 100644
--- a/profiler/src/profile_layernorm.cpp
+++ b/profiler/src/profile_layernorm.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <vector>
diff --git a/profiler/src/profile_max_pool3d_fwd.cpp b/profiler/src/profile_max_pool3d_fwd.cpp
index 90c6e4e2b..cf6db2cfc 100644
--- a/profiler/src/profile_max_pool3d_fwd.cpp
+++ b/profiler/src/profile_max_pool3d_fwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <vector>
diff --git a/profiler/src/profile_reduce.cpp b/profiler/src/profile_reduce.cpp
index 692537185..e4af5680a 100644
--- a/profiler/src/profile_reduce.cpp
+++ b/profiler/src/profile_reduce.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <fstream>
diff --git a/profiler/src/profile_softmax.cpp b/profiler/src/profile_softmax.cpp
index 78b64dda7..77007ad13 100644
--- a/profiler/src/profile_softmax.cpp
+++ b/profiler/src/profile_softmax.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <vector>
diff --git a/profiler/src/profiler.cpp b/profiler/src/profiler.cpp
index 080117e39..0f528c008 100644
--- a/profiler/src/profiler.cpp
+++ b/profiler/src/profiler.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
diff --git a/profiler/src/profiler_operation_registry.hpp b/profiler/src/profiler_operation_registry.hpp
index 91ff29123..276b7b38d 100644
--- a/profiler/src/profiler_operation_registry.hpp
+++ b/profiler/src/profiler_operation_registry.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <functional>
 #include <iostream>
diff --git a/test/batched_gemm/batched_gemm_bf16.cpp b/test/batched_gemm/batched_gemm_bf16.cpp
index 78be54062..fa1652e99 100644
--- a/test/batched_gemm/batched_gemm_bf16.cpp
+++ b/test/batched_gemm/batched_gemm_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 
diff --git a/test/batched_gemm/batched_gemm_fp16.cpp b/test/batched_gemm/batched_gemm_fp16.cpp
index 6cbbedf67..3df4912aa 100644
--- a/test/batched_gemm/batched_gemm_fp16.cpp
+++ b/test/batched_gemm/batched_gemm_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 
diff --git a/test/batched_gemm/batched_gemm_fp32.cpp b/test/batched_gemm/batched_gemm_fp32.cpp
index c9e565e26..2d808441b 100644
--- a/test/batched_gemm/batched_gemm_fp32.cpp
+++ b/test/batched_gemm/batched_gemm_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 
diff --git a/test/batched_gemm/batched_gemm_int8.cpp b/test/batched_gemm/batched_gemm_int8.cpp
index 4da941a57..ed233a5ad 100644
--- a/test/batched_gemm/batched_gemm_int8.cpp
+++ b/test/batched_gemm/batched_gemm_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 
diff --git a/test/batched_gemm_gemm/test_batched_gemm_gemm_fp16.cpp b/test/batched_gemm_gemm/test_batched_gemm_gemm_fp16.cpp
index aa113de21..1a8d5c2e5 100644
--- a/test/batched_gemm_gemm/test_batched_gemm_gemm_fp16.cpp
+++ b/test/batched_gemm_gemm/test_batched_gemm_gemm_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
 #include "test_batched_gemm_gemm_util.hpp"
diff --git a/test/batched_gemm_gemm/test_batched_gemm_gemm_util.hpp b/test/batched_gemm_gemm/test_batched_gemm_gemm_util.hpp
index 53c4d37c4..b0fffc466 100644
--- a/test/batched_gemm_gemm/test_batched_gemm_gemm_util.hpp
+++ b/test/batched_gemm_gemm/test_batched_gemm_gemm_util.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 
diff --git a/test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp b/test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp
index b150ce50d..dd2638ce8 100644
--- a/test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp
+++ b/test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 
diff --git a/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_fp16.cpp b/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_fp16.cpp
index 5df7769d5..cb46a995c 100644
--- a/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_fp16.cpp
+++ b/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
 #include "test_batched_gemm_softmax_gemm_util.hpp"
diff --git a/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp b/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp
index 98debe19c..d8ee744c6 100644
--- a/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp
+++ b/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 
diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_bf16.cpp b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_bf16.cpp
index fe65a6fb9..ef88ce6d8 100644
--- a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_bf16.cpp
+++ b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
 #include "test_batched_gemm_bias_softmax_gemm_permute_util.hpp"
diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_fp16.cpp b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_fp16.cpp
index 7235cd1b0..b38b10d19 100644
--- a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_fp16.cpp
+++ b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
 #include "test_batched_gemm_softmax_gemm_permute_util.hpp"
diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_util.hpp b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_util.hpp
index af5f0efec..d7c39367c 100644
--- a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_util.hpp
+++ b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_util.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 
diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_bf16.cpp b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_bf16.cpp
index defe36124..8e0baede1 100644
--- a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_bf16.cpp
+++ b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
 #include "test_batched_gemm_softmax_gemm_permute_util.hpp"
diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_fp16.cpp b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_fp16.cpp
index 293acd601..81d404109 100644
--- a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_fp16.cpp
+++ b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
 #include "test_batched_gemm_softmax_gemm_permute_util.hpp"
diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_util.hpp b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_util.hpp
index 912bbc91e..9df03ffd2 100644
--- a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_util.hpp
+++ b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_util.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 
diff --git a/test/batchnorm/batchnorm_bwd_rank_4.cpp b/test/batchnorm/batchnorm_bwd_rank_4.cpp
index caa7331ea..a4696cf2a 100644
--- a/test/batchnorm/batchnorm_bwd_rank_4.cpp
+++ b/test/batchnorm/batchnorm_bwd_rank_4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
diff --git a/test/batchnorm/batchnorm_fwd_rank_4.cpp b/test/batchnorm/batchnorm_fwd_rank_4.cpp
index 13aef7d6b..9b6fbd0f6 100644
--- a/test/batchnorm/batchnorm_fwd_rank_4.cpp
+++ b/test/batchnorm/batchnorm_fwd_rank_4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
diff --git a/test/batchnorm/batchnorm_infer_rank_4.cpp b/test/batchnorm/batchnorm_infer_rank_4.cpp
index 77fc1daae..ecb4043b3 100644
--- a/test/batchnorm/batchnorm_infer_rank_4.cpp
+++ b/test/batchnorm/batchnorm_infer_rank_4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
diff --git a/test/block_to_ctile_map/test_block_to_ctile_map.cpp b/test/block_to_ctile_map/test_block_to_ctile_map.cpp
index 55d9b59f4..b8e349eda 100644
--- a/test/block_to_ctile_map/test_block_to_ctile_map.cpp
+++ b/test/block_to_ctile_map/test_block_to_ctile_map.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <vector>
diff --git a/test/conv_util/conv_util.cpp b/test/conv_util/conv_util.cpp
index 73797a716..6922bbbcc 100644
--- a/test/conv_util/conv_util.cpp
+++ b/test/conv_util/conv_util.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <string>
diff --git a/test/convnd_bwd_data/convnd_bwd_data.cpp b/test/convnd_bwd_data/convnd_bwd_data.cpp
index 70231d42a..9d2b6cf57 100644
--- a/test/convnd_bwd_data/convnd_bwd_data.cpp
+++ b/test/convnd_bwd_data/convnd_bwd_data.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
diff --git a/test/convnd_fwd/convnd_fwd.cpp b/test/convnd_fwd/convnd_fwd.cpp
index a1921a9bf..fe8798ceb 100644
--- a/test/convnd_fwd/convnd_fwd.cpp
+++ b/test/convnd_fwd/convnd_fwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
diff --git a/test/data_type/int4.cpp b/test/data_type/int4.cpp
index 252a450bf..07549c1c4 100644
--- a/test/data_type/int4.cpp
+++ b/test/data_type/int4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <bitset>
 #include <cinttypes>
diff --git a/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp b/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp
index e80995c4f..d5ce77dc2 100644
--- a/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp
+++ b/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
 #include "profiler/profile_elementwise_layernorm_impl.hpp"
diff --git a/test/gemm/gemm_bf16.cpp b/test/gemm/gemm_bf16.cpp
index 5290d4663..cde5c45ae 100644
--- a/test/gemm/gemm_bf16.cpp
+++ b/test/gemm/gemm_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <algorithm>
 #include <cstdlib>
diff --git a/test/gemm/gemm_fp16.cpp b/test/gemm/gemm_fp16.cpp
index 92e225def..cad250c6f 100644
--- a/test/gemm/gemm_fp16.cpp
+++ b/test/gemm/gemm_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <algorithm>
 #include <cstdlib>
diff --git a/test/gemm/gemm_fp32.cpp b/test/gemm/gemm_fp32.cpp
index 5d8c4881b..c35aa77ea 100644
--- a/test/gemm/gemm_fp32.cpp
+++ b/test/gemm/gemm_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <algorithm>
 #include <cstdlib>
diff --git a/test/gemm/gemm_fp64.cpp b/test/gemm/gemm_fp64.cpp
index 85d7f95bf..e67c8ba4f 100644
--- a/test/gemm/gemm_fp64.cpp
+++ b/test/gemm/gemm_fp64.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <algorithm>
 #include <cstdlib>
diff --git a/test/gemm/gemm_int8.cpp b/test/gemm/gemm_int8.cpp
index e73b22ce9..6ece05e30 100644
--- a/test/gemm/gemm_int8.cpp
+++ b/test/gemm/gemm_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <algorithm>
 #include <cstdlib>
diff --git a/test/gemm/gemm_standalone_xdl_fp16.cpp b/test/gemm/gemm_standalone_xdl_fp16.cpp
index 32a243e0f..201a49dcd 100644
--- a/test/gemm/gemm_standalone_xdl_fp16.cpp
+++ b/test/gemm/gemm_standalone_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gemm_util.hpp"
 
diff --git a/test/gemm/gemm_util.hpp b/test/gemm/gemm_util.hpp
index 9057c0af8..6c46f4ee8 100644
--- a/test/gemm/gemm_util.hpp
+++ b/test/gemm/gemm_util.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/test/gemm/instance/gemm_f16_nn_instance.cpp b/test/gemm/instance/gemm_f16_nn_instance.cpp
index 4d65c5876..9016257f1 100644
--- a/test/gemm/instance/gemm_f16_nn_instance.cpp
+++ b/test/gemm/instance/gemm_f16_nn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/test/gemm/instance/gemm_f16_nn_instance.hpp b/test/gemm/instance/gemm_f16_nn_instance.hpp
index 5ae3928dc..e174b99a1 100644
--- a/test/gemm/instance/gemm_f16_nn_instance.hpp
+++ b/test/gemm/instance/gemm_f16_nn_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/test/gemm/instance/gemm_f16_nt_instance.cpp b/test/gemm/instance/gemm_f16_nt_instance.cpp
index 431ff1e62..27103b88d 100644
--- a/test/gemm/instance/gemm_f16_nt_instance.cpp
+++ b/test/gemm/instance/gemm_f16_nt_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/test/gemm/instance/gemm_f16_nt_instance.hpp b/test/gemm/instance/gemm_f16_nt_instance.hpp
index 99f9ffba4..c624425e6 100644
--- a/test/gemm/instance/gemm_f16_nt_instance.hpp
+++ b/test/gemm/instance/gemm_f16_nt_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/test/gemm/instance/gemm_f16_tn_instance.cpp b/test/gemm/instance/gemm_f16_tn_instance.cpp
index 6f5dbc311..5b11f4dad 100644
--- a/test/gemm/instance/gemm_f16_tn_instance.cpp
+++ b/test/gemm/instance/gemm_f16_tn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/test/gemm/instance/gemm_f16_tn_instance.hpp b/test/gemm/instance/gemm_f16_tn_instance.hpp
index 62388aeb3..563e10600 100644
--- a/test/gemm/instance/gemm_f16_tn_instance.hpp
+++ b/test/gemm/instance/gemm_f16_tn_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/test/gemm/instance/gemm_f16_tt_instance.cpp b/test/gemm/instance/gemm_f16_tt_instance.cpp
index b6ef5b1cd..9032150f0 100644
--- a/test/gemm/instance/gemm_f16_tt_instance.cpp
+++ b/test/gemm/instance/gemm_f16_tt_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/test/gemm/instance/gemm_f16_tt_instance.hpp b/test/gemm/instance/gemm_f16_tt_instance.hpp
index 9d75b4e48..62914d7ac 100644
--- a/test/gemm/instance/gemm_f16_tt_instance.hpp
+++ b/test/gemm/instance/gemm_f16_tt_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/test/gemm/instance/gemm_wavelet_f16_tn_instance.cpp b/test/gemm/instance/gemm_wavelet_f16_tn_instance.cpp
index 51c014a91..983af7ecd 100644
--- a/test/gemm/instance/gemm_wavelet_f16_tn_instance.cpp
+++ b/test/gemm/instance/gemm_wavelet_f16_tn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/test/gemm/instance/gemm_wavelet_f16_tn_instance.hpp b/test/gemm/instance/gemm_wavelet_f16_tn_instance.hpp
index 110fc5f7d..ef269d78e 100644
--- a/test/gemm/instance/gemm_wavelet_f16_tn_instance.hpp
+++ b/test/gemm/instance/gemm_wavelet_f16_tn_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <memory>
 #include <vector>
diff --git a/test/gemm/run_gemm_test.inc b/test/gemm/run_gemm_test.inc
index ec27729b3..d208bb5a7 100644
--- a/test/gemm/run_gemm_test.inc
+++ b/test/gemm/run_gemm_test.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 int run_gemm_test()
 {
diff --git a/test/gemm_layernorm/test_gemm_add_relu_add_layernorm_fp16.cpp b/test/gemm_layernorm/test_gemm_add_relu_add_layernorm_fp16.cpp
index 740c63aa7..3f0599687 100644
--- a/test/gemm_layernorm/test_gemm_add_relu_add_layernorm_fp16.cpp
+++ b/test/gemm_layernorm/test_gemm_add_relu_add_layernorm_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
 #include "profiler/profile_gemm_add_relu_add_layernorm_impl.hpp"
diff --git a/test/gemm_reduce/gemm_reduce_fp16.cpp b/test/gemm_reduce/gemm_reduce_fp16.cpp
index 029165ece..35a149f52 100644
--- a/test/gemm_reduce/gemm_reduce_fp16.cpp
+++ b/test/gemm_reduce/gemm_reduce_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 
diff --git a/test/grouped_convnd_bwd_weight/grouped_convnd_bwd_weight.cpp b/test/grouped_convnd_bwd_weight/grouped_convnd_bwd_weight.cpp
index 75f934cc0..207cdab7c 100644
--- a/test/grouped_convnd_bwd_weight/grouped_convnd_bwd_weight.cpp
+++ b/test/grouped_convnd_bwd_weight/grouped_convnd_bwd_weight.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
diff --git a/test/grouped_convnd_fwd/grouped_convnd_fwd.cpp b/test/grouped_convnd_fwd/grouped_convnd_fwd.cpp
index 6df7f9969..4a804ef7f 100644
--- a/test/grouped_convnd_fwd/grouped_convnd_fwd.cpp
+++ b/test/grouped_convnd_fwd/grouped_convnd_fwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
diff --git a/test/magic_number_division/magic_number_division.cpp b/test/magic_number_division/magic_number_division.cpp
index 680fddf19..253f21e91 100644
--- a/test/magic_number_division/magic_number_division.cpp
+++ b/test/magic_number_division/magic_number_division.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/test/normalization/test_groupnorm_fp16.cpp b/test/normalization/test_groupnorm_fp16.cpp
index 60d3b1395..325ea75fe 100644
--- a/test/normalization/test_groupnorm_fp16.cpp
+++ b/test/normalization/test_groupnorm_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
 #include "profiler/profile_groupnorm_impl.hpp"
diff --git a/test/normalization/test_groupnorm_fp32.cpp b/test/normalization/test_groupnorm_fp32.cpp
index 3542f73a6..ec88442fc 100644
--- a/test/normalization/test_groupnorm_fp32.cpp
+++ b/test/normalization/test_groupnorm_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
 #include "profiler/profile_groupnorm_impl.hpp"
diff --git a/test/normalization/test_layernorm2d_fp16.cpp b/test/normalization/test_layernorm2d_fp16.cpp
index d627cbe7f..2222740fc 100644
--- a/test/normalization/test_layernorm2d_fp16.cpp
+++ b/test/normalization/test_layernorm2d_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
 #include "profiler/profile_layernorm_impl.hpp"
diff --git a/test/normalization/test_layernorm2d_fp32.cpp b/test/normalization/test_layernorm2d_fp32.cpp
index de4133aa8..30fbe06c6 100644
--- a/test/normalization/test_layernorm2d_fp32.cpp
+++ b/test/normalization/test_layernorm2d_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
 #include "profiler/profile_layernorm_impl.hpp"
diff --git a/test/pool_fwd/test_avg_pool2d_fwd.cpp b/test/pool_fwd/test_avg_pool2d_fwd.cpp
index 4e5f1e0e9..72749fd6e 100644
--- a/test/pool_fwd/test_avg_pool2d_fwd.cpp
+++ b/test/pool_fwd/test_avg_pool2d_fwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
 #include "profiler/profile_pool2d_fwd_impl.hpp"
diff --git a/test/pool_fwd/test_avg_pool3d_fwd.cpp b/test/pool_fwd/test_avg_pool3d_fwd.cpp
index 0d6b105b1..00cc3740f 100644
--- a/test/pool_fwd/test_avg_pool3d_fwd.cpp
+++ b/test/pool_fwd/test_avg_pool3d_fwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
 #include "profiler/profile_pool3d_fwd_impl.hpp"
diff --git a/test/pool_fwd/test_max_pool2d_fwd.cpp b/test/pool_fwd/test_max_pool2d_fwd.cpp
index d16ac7fab..1cf1314f4 100644
--- a/test/pool_fwd/test_max_pool2d_fwd.cpp
+++ b/test/pool_fwd/test_max_pool2d_fwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
 #include "profiler/profile_pool2d_fwd_impl.hpp"
diff --git a/test/pool_fwd/test_max_pool3d_fwd.cpp b/test/pool_fwd/test_max_pool3d_fwd.cpp
index f084dd9cb..0b0de4d90 100644
--- a/test/pool_fwd/test_max_pool3d_fwd.cpp
+++ b/test/pool_fwd/test_max_pool3d_fwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
 #include "profiler/profile_pool3d_fwd_impl.hpp"
diff --git a/test/pool_fwd/test_pool_fwd_common.hpp b/test/pool_fwd/test_pool_fwd_common.hpp
index a78785171..f01863517 100644
--- a/test/pool_fwd/test_pool_fwd_common.hpp
+++ b/test/pool_fwd/test_pool_fwd_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
 #include "ck/ck.hpp"
diff --git a/test/reduce/reduce_no_index.cpp b/test/reduce/reduce_no_index.cpp
index 3f4d0676b..1ab452442 100644
--- a/test/reduce/reduce_no_index.cpp
+++ b/test/reduce/reduce_no_index.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <getopt.h>
 
diff --git a/test/reduce/reduce_with_index.cpp b/test/reduce/reduce_with_index.cpp
index c616a68e7..0301669c5 100644
--- a/test/reduce/reduce_with_index.cpp
+++ b/test/reduce/reduce_with_index.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <getopt.h>
 
diff --git a/test/reference_conv_fwd/reference_conv_fwd.cpp b/test/reference_conv_fwd/reference_conv_fwd.cpp
index 1f9ba0064..b3328e4b3 100644
--- a/test/reference_conv_fwd/reference_conv_fwd.cpp
+++ b/test/reference_conv_fwd/reference_conv_fwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cmath>
 #include <cstdlib>
diff --git a/test/softmax/test_softmax_interface.cpp b/test/softmax/test_softmax_interface.cpp
index 8cac0ba0f..25f666f0e 100644
--- a/test/softmax/test_softmax_interface.cpp
+++ b/test/softmax/test_softmax_interface.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <stdexcept>
 #include <vector>
diff --git a/test/softmax/test_softmax_rank3.cpp b/test/softmax/test_softmax_rank3.cpp
index 5691ee3f6..24ad912d8 100644
--- a/test/softmax/test_softmax_rank3.cpp
+++ b/test/softmax/test_softmax_rank3.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <algorithm>
 #include <stdexcept>
diff --git a/test/softmax/test_softmax_rank4.cpp b/test/softmax/test_softmax_rank4.cpp
index f0b22df25..b58301fb1 100644
--- a/test/softmax/test_softmax_rank4.cpp
+++ b/test/softmax/test_softmax_rank4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <algorithm>
 #include <stdexcept>
diff --git a/test/softmax/test_softmax_util.hpp b/test/softmax/test_softmax_util.hpp
index 40b300cf9..e36231de8 100644
--- a/test/softmax/test_softmax_util.hpp
+++ b/test/softmax/test_softmax_util.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/test/space_filling_curve/space_filling_curve.cpp b/test/space_filling_curve/space_filling_curve.cpp
index c7f6759e8..a192ecb28 100644
--- a/test/space_filling_curve/space_filling_curve.cpp
+++ b/test/space_filling_curve/space_filling_curve.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 #include <iostream>
diff --git a/test/wmma_op/wmma_op.cpp b/test/wmma_op/wmma_op.cpp
index 761c15f1d..47d8c7ed6 100644
--- a/test/wmma_op/wmma_op.cpp
+++ b/test/wmma_op/wmma_op.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <algorithm>
 #include <cstdlib>
diff --git a/test/wmma_op/wmma_op_util.hpp b/test/wmma_op/wmma_op_util.hpp
index c70e6a407..49782bce6 100644
--- a/test/wmma_op/wmma_op_util.hpp
+++ b/test/wmma_op/wmma_op_util.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
-- 
GitLab


From 9eae73df9b63d04bdfb7117c27dbe06da054c5d6 Mon Sep 17 00:00:00 2001
From: Po Yen Chen <PoYen.Chen@amd.com>
Date: Fri, 2 Jun 2023 05:23:02 +0800
Subject: [PATCH 49/71] Simplify kernel argument of device operator
 Device(Batched)GemmXdl<> (#723)

* Remove M/N/KPad local variables

* Use M/N/KPad to name padded lengths

* Replace duplicated local variable by parameters

* Rename variables M/N/KRaw to M/N/K

* Move AK0/BK0 compute logic into GridwiseGemm

* Use macro to shorten code

* Move CalculateGridSize() logic into GridwiseGemm

* Add comment to credit the implementation source

* Reuse the existing implementation

* Remove no-longer used data members

* Remove elementwise-op objects from interfaces

* Reserve kernel arg as whole object in interfaces

* Remove redundant data member

* Make 3rd type parameter optional

* Remove unnecessary type parameters

* Remove no-longer used descriptor-creation methods

* Move kernel arg type definition into GridwiseGemm

* Add macro to switch between code sections

* Move argument field computing logic into device op side

* Make utility method 'static'

* Declare special methods

* Unify MakeArgument() usage

* Adapt the new GridwiseGemm interface

* Push-down class 'GridwiseGemm::Argument' fields

* Remove no-longer used methods

* Add unused parameters

* Force copying parameters in 'Embed' ctor

* Remove no-longer used descriptors

* Fallback change on BaseArgument

* Remove macro 'INTEGER_DIVIDE_CEIL'

* Make variable naming more consistent

* Make sure methods are only invoked on right place

* Remove trailing underscore in public attribute name

* Remove unnecessary methods

* Hide computing logic of derived attributes

* Make new 'Embed' ctor only available for device code

* Make sure 'Embed' type args are not references

* Move check for karg.K into CheckValidity()

* Remove more integer division logic from device code

* Undo changes on Embed

* Separate 'Problem' concept out from 'Argument'

* Add overloaded version of __builtin_amdgcn_readfirstlane()

* Remove 'static' specifiers

* Remove more 'static' specifier

* Replace unsigned char by std::byte

* Add 'const' specifier to never changing variable

* Add 'inline' specifier to function definition

* Share same name for kernel interfaces

* Fix wrong boundary calculation logic

* Leave the third template arg for compatibility

* Remove unnecessary parameters

* Fix wrong error message (for type name)

* Create descriptor on device side

* Fix wrong debug message

* Remove no-longer used data members

* Rename type trait

* Remove std:: qualifier from standard types

* Replace 'size_t' by 'unsigned'

* Use type alias to hint usage

* Replace static_for<> by ordinary 'for' loop

* Reject unsupported argument

* Rename readfirstlane() to amd_wave_read_first_lane()

* Rename file readfirstlane.hpp as amd_wave_read_first_lane.hpp

* Update function calls

* Reorder statements

* Re-format files

---------

Co-authored-by: zjing14 <zhangjing14@gmail.com>
---
 .../convnd_bwd_data_common.hpp                |  43 +-
 .../device/impl/device_batched_gemm_xdl.hpp   | 467 ++++-----------
 ...ice_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp | 184 ++----
 .../device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp  | 132 +----
 ...device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp | 217 ++-----
 .../gpu/device/impl/device_gemm_xdl.hpp       | 329 +----------
 .../gpu/grid/block_to_ctile_map.hpp           |  44 +-
 .../gpu/grid/gridwise_gemm_xdlops_v2r3.hpp    | 551 ++++++++++++++++--
 8 files changed, 820 insertions(+), 1147 deletions(-)

diff --git a/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp b/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp
index b4b544aab..4a9d16c5c 100644
--- a/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp
+++ b/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp
@@ -81,32 +81,33 @@ int run_conv_bwd_data(bool do_verification,
     in_device_buf.SetZero();
 
     // do GEMM
-    auto conv     = DeviceConvNdBwdDataInstance{};
-    auto invoker  = conv.MakeInvoker();
-    auto argument = conv.MakeArgument(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
-                                      static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
-                                      static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-                                      conv_param.N_,
-                                      conv_param.K_,
-                                      conv_param.C_,
-                                      conv_param.input_spatial_lengths_,
-                                      conv_param.filter_spatial_lengths_,
-                                      conv_param.GetOutputSpatialLengths(),
-                                      conv_param.conv_filter_strides_,
-                                      conv_param.conv_filter_dilations_,
-                                      conv_param.input_left_pads_,
-                                      conv_param.input_right_pads_,
-                                      in_element_op,
-                                      wei_element_op,
-                                      out_element_op);
-
-    if(!conv.IsSupportedArgument(argument))
+    auto conv    = DeviceConvNdBwdDataInstance{};
+    auto invoker = conv.MakeInvoker();
+    auto argument =
+        conv.MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
+                                 static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
+                                 static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+                                 conv_param.N_,
+                                 conv_param.K_,
+                                 conv_param.C_,
+                                 conv_param.input_spatial_lengths_,
+                                 conv_param.filter_spatial_lengths_,
+                                 conv_param.GetOutputSpatialLengths(),
+                                 conv_param.conv_filter_strides_,
+                                 conv_param.conv_filter_dilations_,
+                                 conv_param.input_left_pads_,
+                                 conv_param.input_right_pads_,
+                                 in_element_op,
+                                 wei_element_op,
+                                 out_element_op);
+
+    if(!conv.IsSupportedArgument(argument.get()))
     {
         std::cout << "Not support,please check parameters or device";
         return 0;
     }
 
-    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+    float ave_time = invoker.Run(argument.get(), StreamConfig{nullptr, time_kernel});
 
     std::size_t flop      = conv_param.GetFlops();
     std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp
index b10096706..87c58f371 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp
@@ -45,75 +45,46 @@ namespace device {
  * realize BatchedGemm and GroupedGemm (and the corresponding GEMM fusion).
  *
  */
-template <typename GridwiseGemm,
-          typename FloatAB,
-          typename FloatC,
-          typename AGridDesc_K0_M_K1,
-          typename BGridDesc_K0_N_K1,
-          typename CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2,
-          typename AElementwiseOperation,
-          typename BElementwiseOperation,
-          typename CElementwiseOperation,
-          typename ComputePtrOffsetOfBatch,
-          typename Block2CTileMap,
-          bool HasMainKBlockLoop>
+template <typename DeviceOp, typename GridwiseGemm, bool HasMainKBlockLoop>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
     __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
-        kernel_batched_gemm_xdlops_v2r3(
-            const FloatAB* __restrict__ p_a_grid,
-            const FloatAB* __restrict__ p_b_grid,
-            FloatC* __restrict__ p_c_grid,
-            const index_t batch_count,
-            const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1,
-            const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1,
-            const CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2,
-            const AElementwiseOperation a_element_op,
-            const BElementwiseOperation b_element_op,
-            const CElementwiseOperation c_element_op,
-            const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
-            const Block2CTileMap block_2_ctile_map)
+        kernel_batched_gemm_xdlops_v2r3(const typename DeviceOp::Argument karg)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
     defined(__gfx940__))
     const index_t num_blocks_per_batch =
-        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
+        __builtin_amdgcn_readfirstlane(get_grid_size() / karg.Batch);
     const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
 
     const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
+        static_cast<long_index_t>(karg.compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
     const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
+        static_cast<long_index_t>(karg.compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
     const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetCPtrOffset(g_idx)));
+        static_cast<long_index_t>(karg.compute_ptr_offset_of_batch.GetCPtrOffset(g_idx)));
 
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset,
-                                                  p_b_grid + b_batch_offset,
-                                                  p_c_grid + c_batch_offset,
+    const auto a_grid_desc_k0_m_k1 =
+        amd_wave_read_first_lane(GridwiseGemm::MakeAGridDescriptor_K0_M_K1(
+            karg.M, karg.MPadded, karg.K, karg.K0, karg.StrideA));
+    const auto b_grid_desc_k0_n_k1 =
+        amd_wave_read_first_lane(GridwiseGemm::MakeBGridDescriptor_K0_N_K1(
+            karg.K, karg.N, karg.NPadded, karg.K0, karg.StrideB));
+    const auto c_grid_desc_m_n = amd_wave_read_first_lane(GridwiseGemm::MakeCGridDescriptor_M_N(
+        karg.M, karg.MPadded, karg.N, karg.NPadded, karg.StrideC));
+
+    GridwiseGemm::template Run<HasMainKBlockLoop>(karg.p_a_grid + a_batch_offset,
+                                                  karg.p_b_grid + b_batch_offset,
+                                                  karg.p_c_grid + c_batch_offset,
                                                   p_shared,
                                                   a_grid_desc_k0_m_k1,
                                                   b_grid_desc_k0_n_k1,
-                                                  c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2,
-                                                  a_element_op,
-                                                  b_element_op,
-                                                  c_element_op,
-                                                  block_2_ctile_map);
+                                                  c_grid_desc_m_n);
 #else
-    ignore = p_a_grid;
-    ignore = p_b_grid;
-    ignore = p_c_grid;
-    ignore = batch_count;
-    ignore = a_grid_desc_k0_m_k1;
-    ignore = b_grid_desc_k0_n_k1;
-    ignore = c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2;
-    ignore = a_element_op;
-    ignore = b_element_op;
-    ignore = c_element_op;
-    ignore = compute_ptr_offset_of_batch;
-    ignore = block_2_ctile_map;
+    ignore = karg;
 #endif
 }
 
@@ -171,93 +142,6 @@ struct DeviceBatchedGemmXdl : public DeviceBatchedGemm<ALayout,
 
     static constexpr auto K1Number = Number<K1>{};
 
-    static auto MakeAGridDescriptor_K0_M_K1(index_t M, index_t K, index_t StrideA)
-    {
-        assert(K % K1 == 0);
-
-        const index_t K0 = K / K1;
-
-        const auto a_grid_desc_m_k = [&]() {
-            if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1));
-            }
-            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, ALayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA));
-            }
-        }();
-
-        const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock;
-
-        const auto a_grid_desc_k0_mp_k1 =
-            transform_tensor_descriptor(a_grid_desc_m_k,
-                                        make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
-                                                   make_right_pad_transform(M, PadM)),
-                                        make_tuple(Sequence<1>{}, Sequence<0>{}),
-                                        make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-        return a_grid_desc_k0_mp_k1;
-    }
-
-    static auto MakeBGridDescriptor_K0_N_K1(index_t K, index_t N, index_t StrideB)
-    {
-        assert(K % K1 == 0);
-
-        const index_t K0 = K / K1;
-
-        const auto b_grid_desc_k_n = [&]() {
-            if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(StrideB, I1));
-            }
-            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(I1, StrideB));
-            }
-        }();
-
-        const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock;
-
-        const auto b_grid_desc_k0_np_k1 =
-            transform_tensor_descriptor(b_grid_desc_k_n,
-                                        make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
-                                                   make_right_pad_transform(N, PadN)),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                        make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-        return b_grid_desc_k0_np_k1;
-    }
-
-    static auto MakeCGridDescriptor_M_N(index_t M, index_t N, index_t StrideC)
-    {
-        const auto c_grid_desc_m_n = [&]() {
-            if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1));
-            }
-            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, CLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideC));
-            }
-        }();
-
-        const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock;
-        const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock;
-
-        const auto c_grid_desc_mp_np = transform_tensor_descriptor(
-            c_grid_desc_m_n,
-            make_tuple(make_right_pad_transform(M, PadM), make_right_pad_transform(N, PadN)),
-            make_tuple(Sequence<0>{}, Sequence<1>{}),
-            make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-        return c_grid_desc_mp_np;
-    }
-
-    using AGridDesc_K0_M_K1 = decltype(MakeAGridDescriptor_K0_M_K1(1, 1, 1));
-    using BGridDesc_K0_N_K1 = decltype(MakeBGridDescriptor_K0_N_K1(1, 1, 1));
-    using CGridDesc_M_N     = decltype(MakeCGridDescriptor_M_N(1, 1, 1));
-
     struct ComputePtrOffsetOfStridedBatch
     {
         ComputePtrOffsetOfStridedBatch(index_t BatchStrideA,
@@ -289,121 +173,82 @@ struct DeviceBatchedGemmXdl : public DeviceBatchedGemm<ALayout,
     };
 
     // GridwiseGemm
-    using GridwiseGemm =
-        GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<BlockSize,
-                                                ADataType, // TODO: distinguish A/B datatype
-                                                AccDataType,
-                                                CDataType,
-                                                InMemoryDataOperationEnum::Set,
-                                                AGridDesc_K0_M_K1,
-                                                BGridDesc_K0_N_K1,
-                                                CGridDesc_M_N,
-                                                AElementwiseOperation,
-                                                BElementwiseOperation,
-                                                CElementwiseOperation,
-                                                MPerBlock,
-                                                NPerBlock,
-                                                K0PerBlock,
-                                                MPerXDL,
-                                                NPerXDL,
-                                                K1,
-                                                MXdlPerWave,
-                                                NXdlPerWave,
-                                                ABlockTransferThreadClusterLengths_K0_M_K1,
-                                                ABlockTransferThreadClusterArrangeOrder,
-                                                ABlockTransferSrcAccessOrder,
-                                                ABlockTransferSrcVectorDim,
-                                                ABlockTransferSrcScalarPerVector,
-                                                ABlockTransferDstScalarPerVector_K1,
-                                                false, // AThreadTransferSrcResetCoordinateAfterRun,
-                                                ABlockLdsAddExtraM,
-                                                BBlockTransferThreadClusterLengths_K0_N_K1,
-                                                BBlockTransferThreadClusterArrangeOrder,
-                                                BBlockTransferSrcAccessOrder,
-                                                BBlockTransferSrcVectorDim,
-                                                BBlockTransferSrcScalarPerVector,
-                                                BBlockTransferDstScalarPerVector_K1,
-                                                false, // BThreadTransferSrcResetCoordinateAfterRun,
-                                                BBlockLdsAddExtraN,
-                                                Sequence<2, 3, 0, 1, 7, 5, 4, 6>,
-                                                CThreadTransferSrcDstVectorDim,
-                                                CThreadTransferDstScalarPerVector,
-                                                NumGemmKPrefetchStage,
-                                                LoopSched,
-                                                PipelineVer>;
-
-    using CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 =
-        decltype(GridwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(CGridDesc_M_N{}));
-    using Block2CTileMap = typename GridwiseGemm::DefaultBlock2CTileMap;
+    using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3_ext<
+        BlockSize,
+        ADataType, // TODO: distinguish A/B datatype
+        AccDataType,
+        CDataType,
+        InMemoryDataOperationEnum::Set,
+        ALayout,
+        BLayout,
+        CLayout,
+        AElementwiseOperation,
+        BElementwiseOperation,
+        CElementwiseOperation,
+        GemmSpecialization::MNPadding,
+        MPerBlock,
+        NPerBlock,
+        K0PerBlock,
+        MPerXDL,
+        NPerXDL,
+        K1,
+        MXdlPerWave,
+        NXdlPerWave,
+        ABlockTransferThreadClusterLengths_K0_M_K1,
+        ABlockTransferThreadClusterArrangeOrder,
+        ABlockTransferSrcAccessOrder,
+        ABlockTransferSrcVectorDim,
+        ABlockTransferSrcScalarPerVector,
+        ABlockTransferDstScalarPerVector_K1,
+        false, // AThreadTransferSrcResetCoordinateAfterRun,
+        ABlockLdsAddExtraM,
+        BBlockTransferThreadClusterLengths_K0_N_K1,
+        BBlockTransferThreadClusterArrangeOrder,
+        BBlockTransferSrcAccessOrder,
+        BBlockTransferSrcVectorDim,
+        BBlockTransferSrcScalarPerVector,
+        BBlockTransferDstScalarPerVector_K1,
+        false, // BThreadTransferSrcResetCoordinateAfterRun,
+        BBlockLdsAddExtraN,
+        Sequence<2, 3, 0, 1, 7, 5, 4, 6>,
+        CThreadTransferSrcDstVectorDim,
+        CThreadTransferDstScalarPerVector,
+        NumGemmKPrefetchStage,
+        LoopSched,
+        PipelineVer>;
+
+    using Problem = typename GridwiseGemm::Problem;
 
     // Argument
-    struct Argument : public BaseArgument
+    struct Argument : public Problem, public BaseArgument
     {
-        Argument(const ADataType* p_a_grid,
-                 const BDataType* p_b_grid,
-                 CDataType* p_c_grid,
-                 index_t M,
-                 index_t N,
-                 index_t K,
-                 index_t StrideA,
-                 index_t StrideB,
-                 index_t StrideC,
+        Argument(const ADataType* p_a_grid_,
+                 const BDataType* p_b_grid_,
+                 CDataType* p_c_grid_,
+                 index_t M_,
+                 index_t N_,
+                 index_t K_,
+                 index_t StrideA_,
+                 index_t StrideB_,
+                 index_t StrideC_,
                  index_t BatchStrideA,
                  index_t BatchStrideB,
                  index_t BatchStrideC,
-                 index_t Batch,
-                 index_t M01,
-                 index_t N01,
-                 AElementwiseOperation a_element_op,
-                 BElementwiseOperation b_element_op,
-                 CElementwiseOperation c_element_op)
-            : p_a_grid_{p_a_grid},
-              p_b_grid_{p_b_grid},
-              p_c_grid_{p_c_grid},
-              Batch_(Batch),
-              a_grid_desc_k0_m_k1_{
-                  DeviceBatchedGemmXdl::MakeAGridDescriptor_K0_M_K1(M, K, StrideA)},
-              b_grid_desc_k0_n_k1_{
-                  DeviceBatchedGemmXdl::MakeBGridDescriptor_K0_N_K1(K, N, StrideB)},
-              c_grid_desc_m_n_{DeviceBatchedGemmXdl::MakeCGridDescriptor_M_N(M, N, StrideC)},
-              c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_{},
-              compute_ptr_offset_of_batch_{BatchStrideA, BatchStrideB, BatchStrideC},
-              block_2_ctile_map_{
-                  GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_, M01, N01)},
-              M01_{M01},
-              N01_{N01},
-              a_element_op_{a_element_op},
-              b_element_op_{b_element_op},
-              c_element_op_{c_element_op},
-              kraw_{K}
+                 index_t Batch_)
+            : Problem{M_, N_, K_, StrideA_, StrideB_, StrideC_},
+              p_a_grid{p_a_grid_},
+              p_b_grid{p_b_grid_},
+              p_c_grid{p_c_grid_},
+              Batch(Batch_),
+              compute_ptr_offset_of_batch{BatchStrideA, BatchStrideB, BatchStrideC}
         {
-            if(GridwiseGemm::CheckValidity(a_grid_desc_k0_m_k1_,
-                                           b_grid_desc_k0_n_k1_,
-                                           c_grid_desc_m_n_,
-                                           block_2_ctile_map_))
-            {
-                c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_ =
-                    GridwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_grid_desc_m_n_);
-            }
         }
 
-        //  private:
-        const ADataType* p_a_grid_;
-        const BDataType* p_b_grid_;
-        CDataType* p_c_grid_;
-        index_t Batch_;
-        AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1_;
-        BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1_;
-        CGridDesc_M_N c_grid_desc_m_n_;
-        CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_;
-        ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch_;
-        Block2CTileMap block_2_ctile_map_;
-        index_t M01_;
-        index_t N01_;
-        AElementwiseOperation a_element_op_;
-        BElementwiseOperation b_element_op_;
-        CElementwiseOperation c_element_op_;
-        index_t kraw_;
+        const ADataType* p_a_grid;
+        const BDataType* p_b_grid;
+        CDataType* p_c_grid;
+        index_t Batch;
+        ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch;
     };
 
     // Invoker
@@ -411,107 +256,39 @@ struct DeviceBatchedGemmXdl : public DeviceBatchedGemm<ALayout,
     {
         using Argument = DeviceBatchedGemmXdl::Argument;
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        float Run(const Argument& karg, const StreamConfig& stream_config = StreamConfig{})
         {
-#if DEBUG_LOG
+            if(stream_config.log_level_ > 0)
             {
-                std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0)
-                          << ", " << arg.a_grid_desc_k0_m_k1_.GetLength(I1) << ", "
-                          << arg.a_grid_desc_k0_m_k1_.GetLength(I2) << "}" << std::endl;
-
-                std::cout << "arg.b_grid_desc_k0_n_k1_{" << arg.b_grid_desc_k0_n_k1_.GetLength(I0)
-                          << ", " << arg.b_grid_desc_k0_n_k1_.GetLength(I1) << ", "
-                          << arg.b_grid_desc_k0_n_k1_.GetLength(I2) << "}" << std::endl;
-
-                std::cout << "arg.c_grid_desc_m_n_{" << arg.c_grid_desc_m_n_.GetLength(I0) << ", "
-                          << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
+                karg.Print();
             }
-#endif
 
-            if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_,
-                                            arg.b_grid_desc_k0_n_k1_,
-                                            arg.c_grid_desc_m_n_,
-                                            arg.block_2_ctile_map_))
+            if(!GridwiseGemm::CheckValidity(karg))
             {
                 throw std::runtime_error(
-                    "wrong! GridwiseBatchedGemm_km_kn_m0m1n0n1_xdlops_v2r3 has invalid setting");
+                    "wrong! GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3_ext has invalid setting");
             }
 
-            const index_t grid_size =
-                arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_) * arg.Batch_;
-
-            const auto K =
-                arg.a_grid_desc_k0_m_k1_.GetLength(I0) * arg.a_grid_desc_k0_m_k1_.GetLength(I2);
+            auto [gdx, gdy, gdz] = GridwiseGemm::CalculateGridSize(karg.M, karg.N);
+            gdx *= karg.Batch;
 
             float ave_time = 0;
 
-            if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
+            if(GridwiseGemm::CalculateHasMainKBlockLoop(karg.K))
             {
-                const auto kernel = kernel_batched_gemm_xdlops_v2r3<
-                    GridwiseGemm,
-                    ADataType, // TODO: distiguish A/B datatype
-                    CDataType,
-                    remove_reference_t<DeviceBatchedGemmXdl::AGridDesc_K0_M_K1>,
-                    remove_reference_t<DeviceBatchedGemmXdl::BGridDesc_K0_N_K1>,
-                    remove_reference_t<typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
-                    AElementwiseOperation,
-                    BElementwiseOperation,
-                    CElementwiseOperation,
-                    ComputePtrOffsetOfStridedBatch,
-                    remove_reference_t<Block2CTileMap>,
-                    true>;
-
-                ave_time = launch_and_time_kernel(stream_config,
-                                                  kernel,
-                                                  dim3(grid_size),
-                                                  dim3(BlockSize),
-                                                  0,
-                                                  arg.p_a_grid_,
-                                                  arg.p_b_grid_,
-                                                  arg.p_c_grid_,
-                                                  arg.Batch_,
-                                                  arg.a_grid_desc_k0_m_k1_,
-                                                  arg.b_grid_desc_k0_n_k1_,
-                                                  arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_,
-                                                  arg.a_element_op_,
-                                                  arg.b_element_op_,
-                                                  arg.c_element_op_,
-                                                  arg.compute_ptr_offset_of_batch_,
-                                                  arg.block_2_ctile_map_);
+                const auto kernel =
+                    kernel_batched_gemm_xdlops_v2r3<DeviceBatchedGemmXdl, GridwiseGemm, true>;
+
+                ave_time = launch_and_time_kernel(
+                    stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, karg);
             }
             else
             {
-                const auto kernel = kernel_batched_gemm_xdlops_v2r3<
-                    GridwiseGemm,
-                    ADataType, // TODO: distiguish A/B datatype
-                    CDataType,
-                    remove_reference_t<DeviceBatchedGemmXdl::AGridDesc_K0_M_K1>,
-                    remove_reference_t<DeviceBatchedGemmXdl::BGridDesc_K0_N_K1>,
-                    remove_reference_t<typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
-                    AElementwiseOperation,
-                    BElementwiseOperation,
-                    CElementwiseOperation,
-                    ComputePtrOffsetOfStridedBatch,
-                    remove_reference_t<Block2CTileMap>,
-                    false>;
-
-                ave_time = launch_and_time_kernel(stream_config,
-                                                  kernel,
-                                                  dim3(grid_size),
-                                                  dim3(BlockSize),
-                                                  0,
-                                                  arg.p_a_grid_,
-                                                  arg.p_b_grid_,
-                                                  arg.p_c_grid_,
-                                                  arg.Batch_,
-                                                  arg.a_grid_desc_k0_m_k1_,
-                                                  arg.b_grid_desc_k0_n_k1_,
-                                                  arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_,
-                                                  arg.a_element_op_,
-                                                  arg.b_element_op_,
-                                                  arg.c_element_op_,
-                                                  arg.compute_ptr_offset_of_batch_,
-                                                  arg.block_2_ctile_map_);
+                const auto kernel =
+                    kernel_batched_gemm_xdlops_v2r3<DeviceBatchedGemmXdl, GridwiseGemm, false>;
+
+                ave_time = launch_and_time_kernel(
+                    stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, karg);
             }
 
             return ave_time;
@@ -531,17 +308,14 @@ struct DeviceBatchedGemmXdl : public DeviceBatchedGemm<ALayout,
         return true;
     }
 
-    static bool IsSupportedArgument(const Argument& arg)
+    static bool IsSupportedArgument(const Problem& problem)
     {
-        if(arg.kraw_ % K1 != 0)
+        if(problem.K % K1 != 0)
         {
             return false;
         }
 
-        return GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_,
-                                           arg.b_grid_desc_k0_n_k1_,
-                                           arg.c_grid_desc_m_n_,
-                                           arg.block_2_ctile_map_);
+        return GridwiseGemm::CheckValidity(problem);
     }
 
     // polymorphic
@@ -562,10 +336,7 @@ struct DeviceBatchedGemmXdl : public DeviceBatchedGemm<ALayout,
                              index_t BatchStrideA,
                              index_t BatchStrideB,
                              index_t BatchStrideC,
-                             index_t Batch,
-                             AElementwiseOperation a_element_op,
-                             BElementwiseOperation b_element_op,
-                             CElementwiseOperation c_element_op)
+                             index_t Batch)
     {
         return Argument{p_a,
                         p_b,
@@ -579,12 +350,7 @@ struct DeviceBatchedGemmXdl : public DeviceBatchedGemm<ALayout,
                         BatchStrideA,
                         BatchStrideB,
                         BatchStrideC,
-                        Batch,
-                        1,
-                        1,
-                        a_element_op,
-                        b_element_op,
-                        c_element_op};
+                        Batch};
     }
 
     static auto MakeInvoker() { return Invoker{}; }
@@ -603,9 +369,9 @@ struct DeviceBatchedGemmXdl : public DeviceBatchedGemm<ALayout,
                                                       index_t BatchStrideB,
                                                       index_t BatchStrideC,
                                                       index_t Batch,
-                                                      AElementwiseOperation a_element_op,
-                                                      BElementwiseOperation b_element_op,
-                                                      CElementwiseOperation c_element_op) override
+                                                      AElementwiseOperation,
+                                                      BElementwiseOperation,
+                                                      CElementwiseOperation) override
     {
         return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
                                           static_cast<const BDataType*>(p_b),
@@ -619,12 +385,7 @@ struct DeviceBatchedGemmXdl : public DeviceBatchedGemm<ALayout,
                                           BatchStrideA,
                                           BatchStrideB,
                                           BatchStrideC,
-                                          Batch,
-                                          1,
-                                          1,
-                                          a_element_op,
-                                          b_element_op,
-                                          c_element_op);
+                                          Batch);
     }
 
     // polymorphic
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp
index 8cef0eaf9..eb4db6f8c 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp
@@ -379,9 +379,6 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
         AccDataType,
         CDataType,
         InMemoryDataOperationEnum::Set,
-        AGridDesc_K0_M_K1,
-        BGridDesc_K0_N_K1,
-        CGridDesc_M_N,
         InElementwiseOperation,
         WeiElementwiseOperation,
         OutElementwiseOperation,
@@ -428,20 +425,10 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                  std::vector<ck::index_t> conv_filter_strides,
                  std::vector<ck::index_t> conv_filter_dilations,
                  std::vector<ck::index_t> input_left_pads,
-                 std::vector<ck::index_t> input_right_pads,
-                 ck::index_t M01,
-                 ck::index_t N01,
-                 InElementwiseOperation in_element_op,
-                 WeiElementwiseOperation wei_element_op,
-                 OutElementwiseOperation out_element_op)
+                 std::vector<ck::index_t> input_right_pads)
             : p_a_grid_{p_out_grid},
               p_b_grid_{p_wei_grid},
               p_c_grid_{p_in_grid},
-              M01_{M01},
-              N01_{N01},
-              a_element_op_{out_element_op},
-              b_element_op_{wei_element_op},
-              c_element_op_{in_element_op},
               Conv_N_{N},
               Conv_K_{K},
               Conv_C_{C},
@@ -495,18 +482,6 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                     a_grid_desc_k0_m_k1_container_.push_back(descs[I0]);
                     b_grid_desc_k0_n_k1_container_.push_back(descs[I1]);
                     c_grid_desc_m_n_container_.push_back(descs[I2]);
-
-                    auto block_2_ctile_map =
-                        GridwiseGemm::MakeDefaultBlock2CTileMap(descs[I2], M01, N01);
-
-                    if(GridwiseGemm::CheckValidity(
-                           descs[I0], descs[I1], descs[I2], block_2_ctile_map))
-                    {
-                        c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_.push_back(
-                            GridwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(descs[I2]));
-
-                        block_2_ctile_map_container_.push_back(block_2_ctile_map);
-                    }
                 }
             }
         }
@@ -517,14 +492,6 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
         std::vector<AGridDesc_K0_M_K1> a_grid_desc_k0_m_k1_container_;
         std::vector<BGridDesc_K0_N_K1> b_grid_desc_k0_n_k1_container_;
         std::vector<CGridDesc_M_N> c_grid_desc_m_n_container_;
-        std::vector<typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>
-            c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_;
-        std::vector<typename GridwiseGemm::DefaultBlock2CTileMap> block_2_ctile_map_container_;
-        index_t M01_;
-        index_t N01_;
-        OutElementwiseOperation a_element_op_;
-        WeiElementwiseOperation b_element_op_;
-        InElementwiseOperation c_element_op_;
         // for checking IsSupportedArgument()
         index_t Conv_N_;
         index_t Conv_K_;
@@ -567,103 +534,68 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                               << arg.c_grid_desc_m_n_container_[i].GetLength(I0) << ", "
                               << arg.c_grid_desc_m_n_container_[i].GetLength(I1) << "}"
                               << std::endl;
-
-                    std::cout << "arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_( "
-                              << arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i].GetLength(I0)
-                              << ", "
-                              << arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i].GetLength(I1)
-                              << ", "
-                              << arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i].GetLength(I2)
-                              << ", "
-                              << arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i].GetLength(I3)
-                              << ", "
-                              << arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i].GetLength(I4)
-                              << ", "
-                              << arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i].GetLength(I5)
-                              << " ) " << std::endl;
                 }
 #endif
 
                 if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_container_[i],
                                                 arg.b_grid_desc_k0_n_k1_container_[i],
-                                                arg.c_grid_desc_m_n_container_[i],
-                                                arg.block_2_ctile_map_container_[i]))
+                                                arg.c_grid_desc_m_n_container_[i]))
                 {
                     throw std::runtime_error(
                         "wrong! GridwiseGemm_km_kn_m0m1n0n1_xdlops_v3r1 has invalid setting");
                 }
 
-                const index_t grid_size = arg.block_2_ctile_map_container_[i].CalculateGridSize(
-                    arg.c_grid_desc_m_n_container_[i]);
+                const auto [gdx, gdy, gdz] =
+                    GridwiseGemm::CalculateGridSize(arg.c_grid_desc_m_n_container_[i]);
 
                 const auto K = arg.a_grid_desc_k0_m_k1_container_[i].GetLength(I0) *
                                arg.a_grid_desc_k0_m_k1_container_[i].GetLength(I2);
 
                 if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
                 {
-                    const auto kernel = kernel_gemm_xdlops_v2r3<
-                        GridwiseGemm,
-                        ADataType, // TODO: distiguish A/B datatype
-                        CDataType,
-                        remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
-                        remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
-                        remove_reference_t<
-                            typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
-                        OutElementwiseOperation,
-                        WeiElementwiseOperation,
-                        InElementwiseOperation,
-                        remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
-                        true>;
-
-                    ave_time += launch_and_time_kernel(
-                        stream_config,
-                        kernel,
-                        dim3(grid_size),
-                        dim3(BlockSize),
-                        0,
-                        arg.p_a_grid_,
-                        arg.p_b_grid_,
-                        arg.p_c_grid_,
-                        arg.a_grid_desc_k0_m_k1_container_[i],
-                        arg.b_grid_desc_k0_n_k1_container_[i],
-                        arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i],
-                        arg.a_element_op_,
-                        arg.b_element_op_,
-                        arg.c_element_op_,
-                        arg.block_2_ctile_map_container_[i]);
+                    const auto kernel =
+                        kernel_gemm_xdlops_v2r3<GridwiseGemm,
+                                                ADataType, // TODO: distiguish A/B datatype
+                                                CDataType,
+                                                DeviceOp::AGridDesc_K0_M_K1,
+                                                DeviceOp::BGridDesc_K0_N_K1,
+                                                DeviceOp::CGridDesc_M_N,
+                                                true>;
+
+                    ave_time += launch_and_time_kernel(stream_config,
+                                                       kernel,
+                                                       dim3(gdx, gdy, gdz),
+                                                       dim3(BlockSize),
+                                                       0,
+                                                       arg.p_a_grid_,
+                                                       arg.p_b_grid_,
+                                                       arg.p_c_grid_,
+                                                       arg.a_grid_desc_k0_m_k1_container_[i],
+                                                       arg.b_grid_desc_k0_n_k1_container_[i],
+                                                       arg.c_grid_desc_m_n_container_[i]);
                 }
                 else
                 {
-                    const auto kernel = kernel_gemm_xdlops_v2r3<
-                        GridwiseGemm,
-                        ADataType, // TODO: distiguish A/B datatype
-                        CDataType,
-                        remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
-                        remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
-                        remove_reference_t<
-                            typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
-                        OutElementwiseOperation,
-                        WeiElementwiseOperation,
-                        InElementwiseOperation,
-                        remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
-                        false>;
-
-                    ave_time += launch_and_time_kernel(
-                        stream_config,
-                        kernel,
-                        dim3(grid_size),
-                        dim3(BlockSize),
-                        0,
-                        arg.p_a_grid_,
-                        arg.p_b_grid_,
-                        arg.p_c_grid_,
-                        arg.a_grid_desc_k0_m_k1_container_[i],
-                        arg.b_grid_desc_k0_n_k1_container_[i],
-                        arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i],
-                        arg.a_element_op_,
-                        arg.b_element_op_,
-                        arg.c_element_op_,
-                        arg.block_2_ctile_map_container_[i]);
+                    const auto kernel =
+                        kernel_gemm_xdlops_v2r3<GridwiseGemm,
+                                                ADataType, // TODO: distiguish A/B datatype
+                                                CDataType,
+                                                DeviceOp::AGridDesc_K0_M_K1,
+                                                DeviceOp::BGridDesc_K0_N_K1,
+                                                DeviceOp::CGridDesc_M_N,
+                                                false>;
+
+                    ave_time += launch_and_time_kernel(stream_config,
+                                                       kernel,
+                                                       dim3(gdx, gdy, gdz),
+                                                       dim3(BlockSize),
+                                                       0,
+                                                       arg.p_a_grid_,
+                                                       arg.p_b_grid_,
+                                                       arg.p_c_grid_,
+                                                       arg.a_grid_desc_k0_m_k1_container_[i],
+                                                       arg.b_grid_desc_k0_n_k1_container_[i],
+                                                       arg.c_grid_desc_m_n_container_[i]);
                 }
             }
             return ave_time;
@@ -716,8 +648,7 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
         {
             if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_container_[i],
                                             arg.b_grid_desc_k0_n_k1_container_[i],
-                                            arg.c_grid_desc_m_n_container_[i],
-                                            arg.block_2_ctile_map_container_[i]))
+                                            arg.c_grid_desc_m_n_container_[i]))
             {
                 return false;
             }
@@ -742,10 +673,7 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                              std::vector<ck::index_t> conv_filter_strides,
                              std::vector<ck::index_t> conv_filter_dilations,
                              std::vector<ck::index_t> input_left_pads,
-                             std::vector<ck::index_t> input_right_pads,
-                             InElementwiseOperation in_element_op,
-                             WeiElementwiseOperation wei_element_op,
-                             OutElementwiseOperation out_element_op)
+                             std::vector<ck::index_t> input_right_pads)
     {
         return Argument{p_in_grid,
                         p_wei_grid,
@@ -759,12 +687,7 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                         conv_filter_strides,
                         conv_filter_dilations,
                         input_left_pads,
-                        input_right_pads,
-                        1,
-                        1,
-                        in_element_op,
-                        wei_element_op,
-                        out_element_op};
+                        input_right_pads};
     }
 
     static auto MakeInvoker() { return Invoker{}; }
@@ -783,9 +706,9 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                         std::vector<ck::index_t> conv_filter_dilations,
                         std::vector<ck::index_t> input_left_pads,
                         std::vector<ck::index_t> input_right_pads,
-                        InElementwiseOperation in_element_op,
-                        WeiElementwiseOperation wei_element_op,
-                        OutElementwiseOperation out_element_op) override
+                        InElementwiseOperation,
+                        WeiElementwiseOperation,
+                        OutElementwiseOperation) override
     {
         return std::make_unique<Argument>(static_cast<InDataType*>(p_in_grid),
                                           static_cast<const WeiDataType*>(p_wei_grid),
@@ -799,12 +722,7 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                                           conv_filter_strides,
                                           conv_filter_dilations,
                                           input_left_pads,
-                                          input_right_pads,
-                                          1,
-                                          1,
-                                          in_element_op,
-                                          wei_element_op,
-                                          out_element_op);
+                                          input_right_pads);
     }
 
     std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp
index 710ea9176..88615bba3 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp
@@ -329,9 +329,6 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
         AccDataType,
         CDataType,
         InMemoryDataOperationEnum::Set,
-        AGridDesc_K0_M_K1,
-        BGridDesc_K0_N_K1,
-        CGridDesc_M_N,
         InElementwiseOperation,
         WeiElementwiseOperation,
         OutElementwiseOperation,
@@ -378,25 +375,13 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                  std::vector<ck::index_t> conv_filter_strides,
                  std::vector<ck::index_t> conv_filter_dilations,
                  std::vector<ck::index_t> input_left_pads,
-                 std::vector<ck::index_t> input_right_pads,
-                 ck::index_t M01,
-                 ck::index_t N01,
-                 InElementwiseOperation in_element_op,
-                 WeiElementwiseOperation wei_element_op,
-                 OutElementwiseOperation out_element_op)
+                 std::vector<ck::index_t> input_right_pads)
             : p_a_grid_{p_in_grid},
               p_b_grid_{p_wei_grid},
               p_c_grid_{p_out_grid},
               a_grid_desc_k0_m_k1_{},
               b_grid_desc_k0_n_k1_{},
               c_grid_desc_m_n_{},
-              c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_{},
-              block_2_ctile_map_{},
-              M01_{M01},
-              N01_{N01},
-              in_element_op_{in_element_op},
-              wei_element_op_{wei_element_op},
-              out_element_op_{out_element_op},
               Conv_N_{N},
               Conv_K_{K},
               Conv_C_{C},
@@ -420,17 +405,6 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
             a_grid_desc_k0_m_k1_ = descs[I0];
             b_grid_desc_k0_n_k1_ = descs[I1];
             c_grid_desc_m_n_     = descs[I2];
-            block_2_ctile_map_ =
-                GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_, M01, N01);
-
-            if(GridwiseGemm::CheckValidity(a_grid_desc_k0_m_k1_,
-                                           b_grid_desc_k0_n_k1_,
-                                           c_grid_desc_m_n_,
-                                           block_2_ctile_map_))
-            {
-                c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_ =
-                    GridwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_grid_desc_m_n_);
-            }
         }
 
         //  private:
@@ -440,14 +414,6 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
         AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1_;
         BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1_;
         CGridDesc_M_N c_grid_desc_m_n_;
-        typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2
-            c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_;
-        typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_;
-        index_t M01_;
-        index_t N01_;
-        InElementwiseOperation in_element_op_;
-        WeiElementwiseOperation wei_element_op_;
-        OutElementwiseOperation out_element_op_;
         // for checking IsSupportedArgument()
         index_t Conv_N_;
         index_t Conv_K_;
@@ -479,17 +445,14 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                           << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
             }
 #endif
-            if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_,
-                                            arg.b_grid_desc_k0_n_k1_,
-                                            arg.c_grid_desc_m_n_,
-                                            arg.block_2_ctile_map_))
+            if(!GridwiseGemm::CheckValidity(
+                   arg.a_grid_desc_k0_m_k1_, arg.b_grid_desc_k0_n_k1_, arg.c_grid_desc_m_n_))
             {
                 throw std::runtime_error(
                     "wrong! GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3 has invalid setting");
             }
 
-            const index_t grid_size =
-                arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_);
+            const auto [gdx, gdy, gdz] = GridwiseGemm::CalculateGridSize(arg.c_grid_desc_m_n_);
 
             const auto K =
                 arg.a_grid_desc_k0_m_k1_.GetLength(I0) * arg.a_grid_desc_k0_m_k1_.GetLength(I2);
@@ -498,22 +461,18 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
 
             if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
             {
-                const auto kernel = kernel_gemm_xdlops_v2r3<
-                    GridwiseGemm,
-                    ADataType, // TODO: distiguish A/B datatype
-                    CDataType,
-                    remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
-                    remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
-                    remove_reference_t<typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
-                    InElementwiseOperation,
-                    WeiElementwiseOperation,
-                    OutElementwiseOperation,
-                    remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
-                    true>;
+                const auto kernel =
+                    kernel_gemm_xdlops_v2r3<GridwiseGemm,
+                                            ADataType, // TODO: distiguish A/B datatype
+                                            CDataType,
+                                            DeviceOp::AGridDesc_K0_M_K1,
+                                            DeviceOp::BGridDesc_K0_N_K1,
+                                            DeviceOp::CGridDesc_M_N,
+                                            true>;
 
                 ave_time = launch_and_time_kernel(stream_config,
                                                   kernel,
-                                                  dim3(grid_size),
+                                                  dim3(gdx, gdy, gdz),
                                                   dim3(BlockSize),
                                                   0,
                                                   arg.p_a_grid_,
@@ -521,30 +480,22 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                                                   arg.p_c_grid_,
                                                   arg.a_grid_desc_k0_m_k1_,
                                                   arg.b_grid_desc_k0_n_k1_,
-                                                  arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_,
-                                                  arg.in_element_op_,
-                                                  arg.wei_element_op_,
-                                                  arg.out_element_op_,
-                                                  arg.block_2_ctile_map_);
+                                                  arg.c_grid_desc_m_n_);
             }
             else
             {
-                const auto kernel = kernel_gemm_xdlops_v2r3<
-                    GridwiseGemm,
-                    ADataType, // TODO: distiguish A/B datatype
-                    CDataType,
-                    remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
-                    remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
-                    remove_reference_t<typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
-                    InElementwiseOperation,
-                    WeiElementwiseOperation,
-                    OutElementwiseOperation,
-                    remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
-                    false>;
+                const auto kernel =
+                    kernel_gemm_xdlops_v2r3<GridwiseGemm,
+                                            ADataType, // TODO: distiguish A/B datatype
+                                            CDataType,
+                                            DeviceOp::AGridDesc_K0_M_K1,
+                                            DeviceOp::BGridDesc_K0_N_K1,
+                                            DeviceOp::CGridDesc_M_N,
+                                            false>;
 
                 ave_time = launch_and_time_kernel(stream_config,
                                                   kernel,
-                                                  dim3(grid_size),
+                                                  dim3(gdx, gdy, gdz),
                                                   dim3(BlockSize),
                                                   0,
                                                   arg.p_a_grid_,
@@ -552,11 +503,7 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                                                   arg.p_c_grid_,
                                                   arg.a_grid_desc_k0_m_k1_,
                                                   arg.b_grid_desc_k0_n_k1_,
-                                                  arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_,
-                                                  arg.in_element_op_,
-                                                  arg.wei_element_op_,
-                                                  arg.out_element_op_,
-                                                  arg.block_2_ctile_map_);
+                                                  arg.c_grid_desc_m_n_);
             }
 
             return ave_time;
@@ -616,10 +563,8 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
         }
 
         // Gridwise GEMM size
-        return GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_,
-                                           arg.b_grid_desc_k0_n_k1_,
-                                           arg.c_grid_desc_m_n_,
-                                           arg.block_2_ctile_map_);
+        return GridwiseGemm::CheckValidity(
+            arg.a_grid_desc_k0_m_k1_, arg.b_grid_desc_k0_n_k1_, arg.c_grid_desc_m_n_);
     }
 
     bool IsSupportedArgument(const BaseArgument* p_arg) override
@@ -639,10 +584,7 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                              std::vector<ck::index_t> conv_filter_strides,
                              std::vector<ck::index_t> conv_filter_dilations,
                              std::vector<ck::index_t> input_left_pads,
-                             std::vector<ck::index_t> input_right_pads,
-                             InElementwiseOperation in_element_op,
-                             WeiElementwiseOperation wei_element_op,
-                             OutElementwiseOperation out_element_op)
+                             std::vector<ck::index_t> input_right_pads)
     {
         return Argument{p_in_grid,
                         p_wei_grid,
@@ -656,12 +598,7 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                         conv_filter_strides,
                         conv_filter_dilations,
                         input_left_pads,
-                        input_right_pads,
-                        1,
-                        1,
-                        in_element_op,
-                        wei_element_op,
-                        out_element_op};
+                        input_right_pads};
     }
 
     static auto MakeInvoker() { return Invoker{}; }
@@ -680,9 +617,9 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                         std::vector<ck::index_t> conv_filter_dilations,
                         std::vector<ck::index_t> input_left_pads,
                         std::vector<ck::index_t> input_right_pads,
-                        InElementwiseOperation in_element_op,
-                        WeiElementwiseOperation wei_element_op,
-                        OutElementwiseOperation out_element_op) override
+                        InElementwiseOperation,
+                        WeiElementwiseOperation,
+                        OutElementwiseOperation) override
     {
         return std::make_unique<Argument>(static_cast<const InDataType*>(p_in_grid),
                                           static_cast<const WeiDataType*>(p_wei_grid),
@@ -696,12 +633,7 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                                           conv_filter_strides,
                                           conv_filter_dilations,
                                           input_left_pads,
-                                          input_right_pads,
-                                          1,
-                                          1,
-                                          in_element_op,
-                                          wei_element_op,
-                                          out_element_op);
+                                          input_right_pads);
     }
 
     std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp
index 822e1da4e..77ad61d7e 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp
@@ -980,9 +980,6 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Xdl
         AccDataType,
         CDataType,
         InMemoryDataOperationEnum::Set,
-        AGridDesc_K0_M_K1,
-        BGridDesc_K0_N_K1,
-        CGridDesc_M_N,
         InElementwiseOperation,
         WeiElementwiseOperation,
         OutElementwiseOperation,
@@ -1029,20 +1026,10 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Xdl
                  std::vector<ck::index_t> conv_filter_strides,
                  std::vector<ck::index_t> conv_filter_dilations,
                  std::vector<ck::index_t> input_left_pads,
-                 std::vector<ck::index_t> input_right_pads,
-                 ck::index_t M01,
-                 ck::index_t N01,
-                 InElementwiseOperation in_element_op,
-                 WeiElementwiseOperation wei_element_op,
-                 OutElementwiseOperation out_element_op)
+                 std::vector<ck::index_t> input_right_pads)
             : p_a_grid_{p_out_grid},
               p_b_grid_{p_wei_grid},
               p_c_grid_{p_in_grid},
-              M01_{M01},
-              N01_{N01},
-              a_element_op_{out_element_op},
-              b_element_op_{wei_element_op},
-              c_element_op_{in_element_op},
               Conv_N_{N},
               Conv_K_{K},
               Conv_C_{C},
@@ -1092,17 +1079,6 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Xdl
                 a_grid_desc_k0_m_k1_container_.push_back(descs[I0]);
                 b_grid_desc_k0_n_k1_container_.push_back(descs[I1]);
                 c_grid_desc_m_n_container_.push_back(descs[I2]);
-
-                auto block_2_ctile_map =
-                    GridwiseGemm::MakeDefaultBlock2CTileMap(descs[I2], M01_, N01_);
-
-                if(GridwiseGemm::CheckValidity(descs[I0], descs[I1], descs[I2], block_2_ctile_map))
-                {
-                    c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_.push_back(
-                        GridwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(descs[I2]));
-
-                    block_2_ctile_map_container_.push_back(block_2_ctile_map);
-                }
             }
         }
         template <ck::index_t NDim, typename ck::enable_if<NDim == 2, bool>::type = false>
@@ -1150,18 +1126,6 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Xdl
                     a_grid_desc_k0_m_k1_container_.push_back(descs[I0]);
                     b_grid_desc_k0_n_k1_container_.push_back(descs[I1]);
                     c_grid_desc_m_n_container_.push_back(descs[I2]);
-
-                    auto block_2_ctile_map =
-                        GridwiseGemm::MakeDefaultBlock2CTileMap(descs[I2], M01_, N01_);
-
-                    if(GridwiseGemm::CheckValidity(
-                           descs[I0], descs[I1], descs[I2], block_2_ctile_map))
-                    {
-                        c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_.push_back(
-                            GridwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(descs[I2]));
-
-                        block_2_ctile_map_container_.push_back(block_2_ctile_map);
-                    }
                 }
             }
         }
@@ -1218,19 +1182,6 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Xdl
                         a_grid_desc_k0_m_k1_container_.push_back(descs[I0]);
                         b_grid_desc_k0_n_k1_container_.push_back(descs[I1]);
                         c_grid_desc_m_n_container_.push_back(descs[I2]);
-
-                        auto block_2_ctile_map =
-                            GridwiseGemm::MakeDefaultBlock2CTileMap(descs[I2], M01_, N01_);
-
-                        if(GridwiseGemm::CheckValidity(
-                               descs[I0], descs[I1], descs[I2], block_2_ctile_map))
-                        {
-                            c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_.push_back(
-                                GridwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(
-                                    descs[I2]));
-
-                            block_2_ctile_map_container_.push_back(block_2_ctile_map);
-                        }
                     }
                 }
             }
@@ -1242,11 +1193,6 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Xdl
         std::vector<AGridDesc_K0_M_K1> a_grid_desc_k0_m_k1_container_;
         std::vector<BGridDesc_K0_N_K1> b_grid_desc_k0_n_k1_container_;
         std::vector<CGridDesc_M_N> c_grid_desc_m_n_container_;
-        std::vector<typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>
-            c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_;
-        std::vector<typename GridwiseGemm::DefaultBlock2CTileMap> block_2_ctile_map_container_;
-        index_t M01_;
-        index_t N01_;
         OutElementwiseOperation a_element_op_;
         WeiElementwiseOperation b_element_op_;
         InElementwiseOperation c_element_op_;
@@ -1276,123 +1222,84 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Xdl
             {
 #if DEBUG_LOG
                 {
-                    std::cout << "arg.a_grid_desc_k0_m_k1_container_{"
+                    std::cout << "arg.a_grid_desc_k0_m_k1{"
                               << arg.a_grid_desc_k0_m_k1_container_[i].GetLength(I0) << ", "
                               << arg.a_grid_desc_k0_m_k1_container_[i].GetLength(I1) << ", "
                               << arg.a_grid_desc_k0_m_k1_container_[i].GetLength(I2) << "}"
                               << std::endl;
 
-                    std::cout << "arg.b_grid_desc_k0_n_k1_container_{"
+                    std::cout << "arg.b_grid_desc_k0_n_k1{"
                               << arg.b_grid_desc_k0_n_k1_container_[i].GetLength(I0) << ", "
                               << arg.b_grid_desc_k0_n_k1_container_[i].GetLength(I1) << ", "
                               << arg.b_grid_desc_k0_n_k1_container_[i].GetLength(I2) << "}"
                               << std::endl;
 
-                    std::cout << "arg.c_grid_desc_m_n_container_{ "
+                    std::cout << "arg.c_grid_desc_m_n{"
                               << arg.c_grid_desc_m_n_container_[i].GetLength(I0) << ", "
                               << arg.c_grid_desc_m_n_container_[i].GetLength(I1) << "}"
                               << std::endl;
-
-                    std::cout << "arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_( "
-                              << arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i].GetLength(I0)
-                              << ", "
-                              << arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i].GetLength(I1)
-                              << ", "
-                              << arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i].GetLength(I2)
-                              << ", "
-                              << arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i].GetLength(I3)
-                              << ", "
-                              << arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i].GetLength(I4)
-                              << ", "
-                              << arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i].GetLength(I5)
-                              << ", "
-                              << arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i].GetLength(I6)
-                              << ", "
-                              << arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i].GetLength(I7)
-                              << " ) " << std::endl;
                 }
 #endif
 
                 if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_container_[i],
                                                 arg.b_grid_desc_k0_n_k1_container_[i],
-                                                arg.c_grid_desc_m_n_container_[i],
-                                                arg.block_2_ctile_map_container_[i]))
+                                                arg.c_grid_desc_m_n_container_[i]))
                 {
                     throw std::runtime_error(
-                        "wrong! GridwiseGemm_km_kn_m0m1n0n1_xdlops_v3r1 has invalid setting");
+                        "wrong! GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3 has invalid setting");
                 }
 
-                const index_t grid_size = arg.block_2_ctile_map_container_[i].CalculateGridSize(
-                    arg.c_grid_desc_m_n_container_[i]);
+                const auto [gdx, gdy, gdz] =
+                    GridwiseGemm::CalculateGridSize(arg.c_grid_desc_m_n_container_[i]);
 
                 const auto K = arg.a_grid_desc_k0_m_k1_container_[i].GetLength(I0) *
                                arg.a_grid_desc_k0_m_k1_container_[i].GetLength(I2);
 
                 if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
                 {
-                    const auto kernel = kernel_gemm_xdlops_v2r3<
-                        GridwiseGemm,
-                        ADataType, // TODO: distiguish A/B datatype
-                        CDataType,
-                        remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
-                        remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
-                        remove_reference_t<
-                            typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
-                        OutElementwiseOperation,
-                        WeiElementwiseOperation,
-                        InElementwiseOperation,
-                        remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
-                        true>;
-
-                    ave_time += launch_and_time_kernel(
-                        stream_config,
-                        kernel,
-                        dim3(grid_size),
-                        dim3(BlockSize),
-                        0,
-                        arg.p_a_grid_,
-                        arg.p_b_grid_,
-                        arg.p_c_grid_,
-                        arg.a_grid_desc_k0_m_k1_container_[i],
-                        arg.b_grid_desc_k0_n_k1_container_[i],
-                        arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i],
-                        arg.a_element_op_,
-                        arg.b_element_op_,
-                        arg.c_element_op_,
-                        arg.block_2_ctile_map_container_[i]);
+                    const auto kernel =
+                        kernel_gemm_xdlops_v2r3<GridwiseGemm,
+                                                ADataType, // TODO: distiguish A/B datatype
+                                                CDataType,
+                                                DeviceOp::AGridDesc_K0_M_K1,
+                                                DeviceOp::BGridDesc_K0_N_K1,
+                                                DeviceOp::CGridDesc_M_N,
+                                                true>;
+
+                    ave_time += launch_and_time_kernel(stream_config,
+                                                       kernel,
+                                                       dim3(gdx, gdy, gdz),
+                                                       dim3(BlockSize),
+                                                       0,
+                                                       arg.p_a_grid_,
+                                                       arg.p_b_grid_,
+                                                       arg.p_c_grid_,
+                                                       arg.a_grid_desc_k0_m_k1_container_[i],
+                                                       arg.b_grid_desc_k0_n_k1_container_[i],
+                                                       arg.c_grid_desc_m_n_container_[i]);
                 }
                 else
                 {
-                    const auto kernel = kernel_gemm_xdlops_v2r3<
-                        GridwiseGemm,
-                        ADataType, // TODO: distiguish A/B datatype
-                        CDataType,
-                        remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
-                        remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
-                        remove_reference_t<
-                            typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
-                        OutElementwiseOperation,
-                        WeiElementwiseOperation,
-                        InElementwiseOperation,
-                        remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
-                        false>;
-
-                    ave_time += launch_and_time_kernel(
-                        stream_config,
-                        kernel,
-                        dim3(grid_size),
-                        dim3(BlockSize),
-                        0,
-                        arg.p_a_grid_,
-                        arg.p_b_grid_,
-                        arg.p_c_grid_,
-                        arg.a_grid_desc_k0_m_k1_container_[i],
-                        arg.b_grid_desc_k0_n_k1_container_[i],
-                        arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i],
-                        arg.a_element_op_,
-                        arg.b_element_op_,
-                        arg.c_element_op_,
-                        arg.block_2_ctile_map_container_[i]);
+                    const auto kernel =
+                        kernel_gemm_xdlops_v2r3<GridwiseGemm,
+                                                ADataType, // TODO: distiguish A/B datatype
+                                                CDataType,
+                                                DeviceOp::AGridDesc_K0_M_K1,
+                                                DeviceOp::BGridDesc_K0_N_K1,
+                                                DeviceOp::CGridDesc_M_N,
+                                                false>;
+
+                    ave_time += launch_and_time_kernel(stream_config,
+                                                       kernel,
+                                                       dim3(gdx, gdy, gdz),
+                                                       dim3(BlockSize),
+                                                       0,
+                                                       arg.p_a_grid_,
+                                                       arg.p_b_grid_,
+                                                       arg.p_c_grid_,
+                                                       arg.a_grid_desc_k0_m_k1_container_[i],
+                                                       arg.b_grid_desc_k0_n_k1_container_[i],
+                                                       arg.c_grid_desc_m_n_container_[i]);
                 }
             }
             return ave_time;
@@ -1446,8 +1353,7 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Xdl
         {
             if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_container_[i],
                                             arg.b_grid_desc_k0_n_k1_container_[i],
-                                            arg.c_grid_desc_m_n_container_[i],
-                                            arg.block_2_ctile_map_container_[i]))
+                                            arg.c_grid_desc_m_n_container_[i]))
             {
                 return false;
             }
@@ -1472,10 +1378,7 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Xdl
                              std::vector<ck::index_t> conv_filter_strides,
                              std::vector<ck::index_t> conv_filter_dilations,
                              std::vector<ck::index_t> input_left_pads,
-                             std::vector<ck::index_t> input_right_pads,
-                             InElementwiseOperation in_element_op,
-                             WeiElementwiseOperation wei_element_op,
-                             OutElementwiseOperation out_element_op)
+                             std::vector<ck::index_t> input_right_pads)
     {
         return Argument{p_in_grid,
                         p_wei_grid,
@@ -1489,12 +1392,7 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Xdl
                         conv_filter_strides,
                         conv_filter_dilations,
                         input_left_pads,
-                        input_right_pads,
-                        1,
-                        1,
-                        in_element_op,
-                        wei_element_op,
-                        out_element_op};
+                        input_right_pads};
     }
 
     static auto MakeInvoker() { return Invoker{}; }
@@ -1513,9 +1411,9 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Xdl
                         std::vector<ck::index_t> conv_filter_dilations,
                         std::vector<ck::index_t> input_left_pads,
                         std::vector<ck::index_t> input_right_pads,
-                        InElementwiseOperation in_element_op,
-                        WeiElementwiseOperation wei_element_op,
-                        OutElementwiseOperation out_element_op) override
+                        InElementwiseOperation,
+                        WeiElementwiseOperation,
+                        OutElementwiseOperation) override
     {
         return std::make_unique<Argument>(static_cast<InDataType*>(p_in_grid),
                                           static_cast<const WeiDataType*>(p_wei_grid),
@@ -1529,12 +1427,7 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Xdl
                                           conv_filter_strides,
                                           conv_filter_dilations,
                                           input_left_pads,
-                                          input_right_pads,
-                                          1,
-                                          1,
-                                          in_element_op,
-                                          wei_element_op,
-                                          out_element_op);
+                                          input_right_pads);
     }
 
     std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp
index 528d9bf42..ebbb1d38c 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp
@@ -75,132 +75,20 @@ struct DeviceGemmXdl : public DeviceGemm<ALayout,
 
     static constexpr auto K1Number = Number<K1>{};
 
-    static auto MakeAGridDescriptor_K0_M_K1(index_t M, index_t K, index_t StrideA)
-    {
-        const index_t K0 = K / K1;
-
-        const auto a_grid_desc_m_k = [&]() {
-            if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1));
-            }
-            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, ALayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA));
-            }
-        }();
-
-        if constexpr(GemmSpec == GemmSpecialization::MNPadding)
-        {
-            const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock;
-
-            return transform_tensor_descriptor(
-                a_grid_desc_m_k,
-                make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
-                           make_right_pad_transform(M, PadM)),
-                make_tuple(Sequence<1>{}, Sequence<0>{}),
-                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-        }
-        else
-        {
-            return transform_tensor_descriptor(
-                a_grid_desc_m_k,
-                make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
-                           make_pass_through_transform(M)),
-                make_tuple(Sequence<1>{}, Sequence<0>{}),
-                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-        }
-    }
-
-    static auto MakeBGridDescriptor_K0_N_K1(index_t K, index_t N, index_t StrideB)
-    {
-        const index_t K0 = K / K1;
-
-        const auto b_grid_desc_k_n = [&]() {
-            if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(StrideB, I1));
-            }
-            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(I1, StrideB));
-            }
-        }();
-
-        if constexpr(GemmSpec == GemmSpecialization::MNPadding)
-        {
-            const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock;
-
-            return transform_tensor_descriptor(
-                b_grid_desc_k_n,
-                make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
-                           make_right_pad_transform(N, PadN)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-        }
-        else
-        {
-            return transform_tensor_descriptor(
-                b_grid_desc_k_n,
-                make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
-                           make_pass_through_transform(N)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-        }
-    }
-
-    static auto MakeCGridDescriptor_M_N(index_t M, index_t N, index_t StrideC)
-    {
-        const auto c_grid_desc_m_n = [&]() {
-            if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1));
-            }
-            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, CLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideC));
-            }
-        }();
-
-        if constexpr(GemmSpec == GemmSpecialization::MNPadding)
-        {
-            const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock;
-            const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock;
-
-            return transform_tensor_descriptor(
-                c_grid_desc_m_n,
-                make_tuple(make_right_pad_transform(M, PadM), make_right_pad_transform(N, PadN)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else
-        {
-
-            return transform_tensor_descriptor(
-                c_grid_desc_m_n,
-                make_tuple(make_pass_through_transform(M), make_pass_through_transform(N)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-    }
-
-    using AGridDesc_K0_M_K1 = decltype(MakeAGridDescriptor_K0_M_K1(1, 1, 1));
-    using BGridDesc_K0_N_K1 = decltype(MakeBGridDescriptor_K0_N_K1(1, 1, 1));
-    using CGridDesc_M_N     = decltype(MakeCGridDescriptor_M_N(1, 1, 1));
-
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<
+    using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3_ext<
         BlockSize,
         ADataType, // TODO: distinguish A/B datatype
         AccDataType,
         CDataType,
         InMemoryDataOperationEnum::Set,
-        AGridDesc_K0_M_K1,
-        BGridDesc_K0_N_K1,
-        CGridDesc_M_N,
+        ALayout,
+        BLayout,
+        CLayout,
         AElementwiseOperation,
         BElementwiseOperation,
         CElementwiseOperation,
+        GemmSpec,
         MPerBlock,
         NPerBlock,
         K0PerBlock,
@@ -232,173 +120,41 @@ struct DeviceGemmXdl : public DeviceGemm<ALayout,
         LoopSched,
         PipelineVer>;
 
-    // Argument
-    struct Argument : public BaseArgument
-    {
-        Argument(const ADataType* p_a_grid,
-                 const BDataType* p_b_grid,
-                 CDataType* p_c_grid,
-                 index_t M,
-                 index_t N,
-                 index_t K,
-                 index_t StrideA,
-                 index_t StrideB,
-                 index_t StrideC,
-                 index_t M01,
-                 index_t N01,
-                 AElementwiseOperation a_element_op,
-                 BElementwiseOperation b_element_op,
-                 CElementwiseOperation c_element_op)
-            : p_a_grid_{p_a_grid},
-              p_b_grid_{p_b_grid},
-              p_c_grid_{p_c_grid},
-              a_grid_desc_k0_m_k1_{},
-              b_grid_desc_k0_n_k1_{},
-              c_grid_desc_m_n_{},
-              c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_{},
-              block_2_ctile_map_{},
-              M01_{M01},
-              N01_{N01},
-              a_element_op_{a_element_op},
-              b_element_op_{b_element_op},
-              c_element_op_{c_element_op},
-              kraw_{K}
-        {
-            a_grid_desc_k0_m_k1_ = DeviceGemmXdl::MakeAGridDescriptor_K0_M_K1(M, K, StrideA);
-            b_grid_desc_k0_n_k1_ = DeviceGemmXdl::MakeBGridDescriptor_K0_N_K1(K, N, StrideB);
-            c_grid_desc_m_n_     = DeviceGemmXdl::MakeCGridDescriptor_M_N(M, N, StrideC);
-
-            block_2_ctile_map_ =
-                GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_, M01, N01);
-
-            if(GridwiseGemm::CheckValidity(a_grid_desc_k0_m_k1_,
-                                           b_grid_desc_k0_n_k1_,
-                                           c_grid_desc_m_n_,
-                                           block_2_ctile_map_))
-            {
-                c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_ =
-                    GridwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_grid_desc_m_n_);
-            }
-        }
-
-        //  private:
-        const ADataType* p_a_grid_;
-        const BDataType* p_b_grid_;
-        CDataType* p_c_grid_;
-        AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1_;
-        BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1_;
-        CGridDesc_M_N c_grid_desc_m_n_;
-        typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2
-            c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_;
-        typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_;
-        index_t M01_;
-        index_t N01_;
-        AElementwiseOperation a_element_op_;
-        BElementwiseOperation b_element_op_;
-        CElementwiseOperation c_element_op_;
-        index_t kraw_;
-    };
+    using Argument = typename GridwiseGemm::Argument;
 
     // Invoker
     struct Invoker : public BaseInvoker
     {
-        using Argument = DeviceGemmXdl::Argument;
-
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        float Run(const Argument& karg, const StreamConfig& stream_config = StreamConfig{})
         {
-#if DEBUG_LOG
+            if(stream_config.log_level_ > 0)
             {
-                std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0)
-                          << ", " << arg.a_grid_desc_k0_m_k1_.GetLength(I1) << ", "
-                          << arg.a_grid_desc_k0_m_k1_.GetLength(I2) << "}" << std::endl;
-
-                std::cout << "arg.b_grid_desc_k0_n_k1_{" << arg.b_grid_desc_k0_n_k1_.GetLength(I0)
-                          << ", " << arg.b_grid_desc_k0_n_k1_.GetLength(I1) << ", "
-                          << arg.b_grid_desc_k0_n_k1_.GetLength(I2) << "}" << std::endl;
-
-                std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", "
-                          << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
+                karg.Print();
             }
-#endif
 
-            if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_,
-                                            arg.b_grid_desc_k0_n_k1_,
-                                            arg.c_grid_desc_m_n_,
-                                            arg.block_2_ctile_map_))
+            if(!GridwiseGemm::CheckValidity(karg))
             {
                 throw std::runtime_error(
-                    "wrong! GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3 has invalid setting");
+                    "wrong! GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3_ext has invalid setting");
             }
 
-            const index_t grid_size =
-                arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_);
-
-            const auto K =
-                arg.a_grid_desc_k0_m_k1_.GetLength(I0) * arg.a_grid_desc_k0_m_k1_.GetLength(I2);
+            const auto [gdx, gdy, gdz] = GridwiseGemm::CalculateGridSize(karg.M, karg.N);
 
             float ave_time = 0;
 
-            if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
+            if(GridwiseGemm::CalculateHasMainKBlockLoop(karg.K))
             {
-                const auto kernel = kernel_gemm_xdlops_v2r3<
-                    GridwiseGemm,
-                    ADataType, // TODO: distiguish A/B datatype
-                    CDataType,
-                    remove_reference_t<DeviceGemmXdl::AGridDesc_K0_M_K1>,
-                    remove_reference_t<DeviceGemmXdl::BGridDesc_K0_N_K1>,
-                    remove_reference_t<typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
-                    AElementwiseOperation,
-                    BElementwiseOperation,
-                    CElementwiseOperation,
-                    remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
-                    true>;
-
-                ave_time = launch_and_time_kernel(stream_config,
-                                                  kernel,
-                                                  dim3(grid_size),
-                                                  dim3(BlockSize),
-                                                  0,
-                                                  arg.p_a_grid_,
-                                                  arg.p_b_grid_,
-                                                  arg.p_c_grid_,
-                                                  arg.a_grid_desc_k0_m_k1_,
-                                                  arg.b_grid_desc_k0_n_k1_,
-                                                  arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_,
-                                                  arg.a_element_op_,
-                                                  arg.b_element_op_,
-                                                  arg.c_element_op_,
-                                                  arg.block_2_ctile_map_);
+                const auto kernel = kernel_gemm_xdlops_v2r3<GridwiseGemm, true>;
+
+                ave_time = launch_and_time_kernel(
+                    stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, karg);
             }
             else
             {
-                const auto kernel = kernel_gemm_xdlops_v2r3<
-                    GridwiseGemm,
-                    ADataType, // TODO: distiguish A/B datatype
-                    CDataType,
-                    remove_reference_t<DeviceGemmXdl::AGridDesc_K0_M_K1>,
-                    remove_reference_t<DeviceGemmXdl::BGridDesc_K0_N_K1>,
-                    remove_reference_t<typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
-                    AElementwiseOperation,
-                    BElementwiseOperation,
-                    CElementwiseOperation,
-                    remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
-                    false>;
-
-                ave_time = launch_and_time_kernel(stream_config,
-                                                  kernel,
-                                                  dim3(grid_size),
-                                                  dim3(BlockSize),
-                                                  0,
-                                                  arg.p_a_grid_,
-                                                  arg.p_b_grid_,
-                                                  arg.p_c_grid_,
-                                                  arg.a_grid_desc_k0_m_k1_,
-                                                  arg.b_grid_desc_k0_n_k1_,
-                                                  arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_,
-                                                  arg.a_element_op_,
-                                                  arg.b_element_op_,
-                                                  arg.c_element_op_,
-                                                  arg.block_2_ctile_map_);
+                const auto kernel = kernel_gemm_xdlops_v2r3<GridwiseGemm, false>;
+
+                ave_time = launch_and_time_kernel(
+                    stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, karg);
             }
 
             return ave_time;
@@ -418,7 +174,7 @@ struct DeviceGemmXdl : public DeviceGemm<ALayout,
         return true;
     }
 
-    static bool IsSupportedArgument(const Argument& arg)
+    static bool IsSupportedArgument(const Argument& karg)
     {
         if(ck::get_device_name() == "gfx908")
         {
@@ -441,15 +197,12 @@ struct DeviceGemmXdl : public DeviceGemm<ALayout,
             return false;
         }
 
-        if(arg.kraw_ % K1 != 0)
+        if(karg.K % K1 != 0)
         {
             return false;
         }
 
-        return GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_,
-                                           arg.b_grid_desc_k0_n_k1_,
-                                           arg.c_grid_desc_m_n_,
-                                           arg.block_2_ctile_map_);
+        return GridwiseGemm::CheckValidity(karg);
     }
 
     // polymorphic
@@ -467,24 +220,11 @@ struct DeviceGemmXdl : public DeviceGemm<ALayout,
                              index_t StrideA,
                              index_t StrideB,
                              index_t StrideC,
-                             AElementwiseOperation a_element_op,
-                             BElementwiseOperation b_element_op,
-                             CElementwiseOperation c_element_op)
+                             AElementwiseOperation,
+                             BElementwiseOperation,
+                             CElementwiseOperation)
     {
-        return Argument{p_a,
-                        p_b,
-                        p_c,
-                        M,
-                        N,
-                        K,
-                        StrideA,
-                        StrideB,
-                        StrideC,
-                        1,
-                        1,
-                        a_element_op,
-                        b_element_op,
-                        c_element_op};
+        return Argument{p_a, p_b, p_c, M, N, K, StrideA, StrideB, StrideC};
     }
 
     static auto MakeInvoker() { return Invoker{}; }
@@ -499,9 +239,9 @@ struct DeviceGemmXdl : public DeviceGemm<ALayout,
                                                       index_t StrideA,
                                                       index_t StrideB,
                                                       index_t StrideC,
-                                                      AElementwiseOperation a_element_op,
-                                                      BElementwiseOperation b_element_op,
-                                                      CElementwiseOperation c_element_op) override
+                                                      AElementwiseOperation,
+                                                      BElementwiseOperation,
+                                                      CElementwiseOperation) override
     {
         return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
                                           static_cast<const BDataType*>(p_b),
@@ -511,12 +251,7 @@ struct DeviceGemmXdl : public DeviceGemm<ALayout,
                                           K,
                                           StrideA,
                                           StrideB,
-                                          StrideC,
-                                          1,
-                                          1,
-                                          a_element_op,
-                                          b_element_op,
-                                          c_element_op);
+                                          StrideC);
     }
 
     // polymorphic
diff --git a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
index 91ed1112f..5c8b9f419 100644
--- a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
+++ b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
@@ -134,6 +134,14 @@ struct BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, void>
     {
     }
 
+    template <typename CGridDesc_M_N> // any type providing GetLength(I0) and GetLength(I1)
+    __host__ __device__ BlockToCTileMap_M00_N0_M01Adapt(const CGridDesc_M_N& c_grid_desc_m_n,
+                                                        index_t M01 = 8)
+        : BlockToCTileMap_M00_N0_M01Adapt( // delegates, passing the two lengths read below
+              c_grid_desc_m_n.GetLength(I0), c_grid_desc_m_n.GetLength(I1), M01)
+    {
+    }
+
     __host__ static constexpr index_t CalculateGridSize(index_t M, index_t N)
     {
         const auto M0 = math::integer_divide_ceil(M, MPerBlock);
@@ -142,6 +150,18 @@ struct BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, void>
         return M0 * N0;
     }
 
+    template <typename CGridDesc_M_N> // descriptor overload of CalculateGridSize(M, N)
+    __host__ static constexpr index_t CalculateGridSize(const CGridDesc_M_N& c_grid_desc_m_n)
+    {
+        // M and N are taken from the descriptor's lengths along dimensions I0 and I1
+        return CalculateGridSize(c_grid_desc_m_n.GetLength(I0), c_grid_desc_m_n.GetLength(I1));
+    }
+
+    template <typename CGridDesc_M_N> // accepts any descriptor type; the argument is unnamed
+    __host__ bool CheckValidity(const CGridDesc_M_N& /* c_grid_desc_m_n */) const
+    {
+        return true; // unconditional: the descriptor is never inspected
+    }
+
     template <typename TopIdx>
     __host__ __device__ constexpr auto CalculateBottomIndex(const TopIdx& idx_top) const
     {
@@ -222,30 +242,12 @@ struct BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, void>
     index_t M01_;
 };
 
+// keep the redundant type argument for backward compatibility
 template <index_t MPerBlock, index_t NPerBlock, typename CGridDesc_M_N>
 struct BlockToCTileMap_M00_N0_M01Adapt : BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, void>
 {
-    using Parent = BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, void>;
-
-    using Parent::I0;
-    using Parent::I1;
-
-    using Parent::Parent;
-    using Parent::operator=;
-
-    __host__ __device__ BlockToCTileMap_M00_N0_M01Adapt(const CGridDesc_M_N& c_grid_desc_m_n,
-                                                        index_t M01 = 8)
-        : Parent(c_grid_desc_m_n.GetLength(I0), c_grid_desc_m_n.GetLength(I1), M01)
-    {
-    }
-
-    __host__ static constexpr index_t CalculateGridSize(const CGridDesc_M_N& c_grid_desc_m_n)
-    {
-        return Parent::CalculateGridSize(c_grid_desc_m_n.GetLength(I0),
-                                         c_grid_desc_m_n.GetLength(I1));
-    }
-
-    __host__ bool CheckValidity(const CGridDesc_M_N& /* c_grid_desc_m_n */) const { return true; }
+    using BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, void>::
+        BlockToCTileMap_M00_N0_M01Adapt;
 };
 
 // 2D slices of column-vectors in 3D space
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
index f4504a940..bd18fdb10 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
@@ -7,6 +7,7 @@
 #include "ck/tensor_description/multi_index_transform_helper.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp"
 #include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp"
@@ -21,27 +22,18 @@ template <typename GridwiseGemm,
           typename FloatC,
           typename AGridDesc_K0_M_K1,
           typename BGridDesc_K0_N_K1,
-          typename CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2,
-          typename AElementwiseOperation,
-          typename BElementwiseOperation,
-          typename CElementwiseOperation,
-          typename Block2CTileMap,
+          typename CGridDesc_M_N,
           bool HasMainKBlockLoop>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
     __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
-        kernel_gemm_xdlops_v2r3(
-            const FloatAB* __restrict__ p_a_grid,
-            const FloatAB* __restrict__ p_b_grid,
-            FloatC* __restrict__ p_c_grid,
-            const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1,
-            const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1,
-            const CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2,
-            const AElementwiseOperation a_element_op,
-            const BElementwiseOperation b_element_op,
-            const CElementwiseOperation c_element_op,
-            const Block2CTileMap block_2_ctile_map)
+        kernel_gemm_xdlops_v2r3(const FloatAB* __restrict__ p_a_grid,
+                                const FloatAB* __restrict__ p_b_grid,
+                                FloatC* __restrict__ p_c_grid,
+                                const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1,
+                                const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1,
+                                const CGridDesc_M_N c_grid_desc_m_n)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
     defined(__gfx940__))
@@ -53,22 +45,46 @@ __global__ void
                                                   p_shared,
                                                   a_grid_desc_k0_m_k1,
                                                   b_grid_desc_k0_n_k1,
-                                                  c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2,
-                                                  a_element_op,
-                                                  b_element_op,
-                                                  c_element_op,
-                                                  block_2_ctile_map);
+                                                  c_grid_desc_m_n);
 #else
     ignore                = p_a_grid;
     ignore                = p_b_grid;
     ignore                = p_c_grid;
     ignore                = a_grid_desc_k0_m_k1;
     ignore                = b_grid_desc_k0_n_k1;
-    ignore                = c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2;
-    ignore                = a_element_op;
-    ignore                = b_element_op;
-    ignore                = c_element_op;
-    ignore                = block_2_ctile_map;
+    ignore                = c_grid_desc_m_n;
+#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
+}
+
+template <typename GridwiseGemm, bool HasMainKBlockLoop> // overload taking one Argument struct
+__global__ void
+#if CK_USE_LAUNCH_BOUNDS
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+#endif
+        kernel_gemm_xdlops_v2r3(const typename GridwiseGemm::Argument karg) // karg passed by value
+{
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
+    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; // forwarded to Run()
+
+    // The A/B/C descriptors are built here, inside the kernel, from the scalars stored in karg.
+    const auto a_grid_desc_k0_m_k1 =
+        amd_wave_read_first_lane(GridwiseGemm::MakeAGridDescriptor_K0_M_K1(
+            karg.M, karg.MPadded, karg.K, karg.K0, karg.StrideA));
+    const auto b_grid_desc_k0_n_k1 =
+        amd_wave_read_first_lane(GridwiseGemm::MakeBGridDescriptor_K0_N_K1(
+            karg.K, karg.N, karg.NPadded, karg.K0, karg.StrideB));
+    const auto c_grid_desc_m_n = amd_wave_read_first_lane(GridwiseGemm::MakeCGridDescriptor_M_N(
+        karg.M, karg.MPadded, karg.N, karg.NPadded, karg.StrideC));
+
+    GridwiseGemm::template Run<HasMainKBlockLoop>(karg.p_a_grid,
+                                                  karg.p_b_grid,
+                                                  karg.p_c_grid,
+                                                  p_shared,
+                                                  a_grid_desc_k0_m_k1,
+                                                  b_grid_desc_k0_n_k1,
+                                                  c_grid_desc_m_n);
+#else
+    ignore                = karg; // device targets not listed in the #if: body only consumes karg
 #endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
 }
 
@@ -77,9 +93,6 @@ template <index_t BlockSize,
           typename FloatAcc,
           typename FloatC,
           InMemoryDataOperationEnum CGlobalMemoryDataOperation,
-          typename AGridDesc_K0_M_K1,
-          typename BGridDesc_K0_N_K1,
-          typename CGridDesc_M_N,
           typename AElementwiseOperation,
           typename BElementwiseOperation,
           typename CElementwiseOperation,
@@ -129,6 +142,105 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
 
     using ThisThreadBlock = ThisThreadBlock<BlockSize>;
 
+    __host__ static auto CalculateGridSize(index_t M, index_t N) // (M, N) overload
+    {
+        // tuple of (Block2CTileMap grid size, 1, 1)
+        return std::make_tuple(Block2CTileMap::CalculateGridSize(M, N), 1, 1);
+    }
+
+    template <typename CGridDesc_M_N> // descriptor overload
+    __host__ static auto CalculateGridSize(const CGridDesc_M_N& c_grid_desc_m_n)
+    {
+        // tuple of (Block2CTileMap grid size for the descriptor, 1, 1)
+        return std::make_tuple(Block2CTileMap::CalculateGridSize(c_grid_desc_m_n), 1, 1);
+    }
+
+    template <typename> // unnamed, unused type parameter; callers must spell it explicitly
+    __host__ static auto CalculateGridSize(index_t M, index_t N)
+    {
+        // NOTE(review): body is identical to the non-template (M, N) overload
+        return std::make_tuple(Block2CTileMap::CalculateGridSize(M, N), 1, 1);
+    }
+
+    __host__ static auto CalculateMPadded(index_t M) // stored as Problem::MPadded
+    {
+        // M rounded up to a multiple of MPerBlock (assuming integer_divide_ceil is ceil-division)
+        return math::integer_divide_ceil(M, MPerBlock) * MPerBlock;
+    }
+
+    __host__ static auto CalculateNPadded(index_t N) // stored as Problem::NPadded
+    {
+        // N rounded up to a multiple of NPerBlock (assuming integer_divide_ceil is ceil-division)
+        return math::integer_divide_ceil(N, NPerBlock) * NPerBlock;
+    }
+
+    __host__ static auto CalculateK0(index_t K) { return math::integer_divide_floor(K, K1Value); } // K / K1Value, rounded down per the helper's name
+
+    // Problem: GEMM sizes and strides plus the derived MPadded / NPadded / K0 values
+    struct Problem
+    {
+        __host__ Problem(index_t M_,
+                         index_t N_,
+                         index_t K_,
+                         index_t StrideA_,
+                         index_t StrideB_,
+                         index_t StrideC_)
+            : M{M_},
+              N{N_},
+              K{K_},
+              StrideA{StrideA_},
+              StrideB{StrideB_},
+              StrideC{StrideC_},
+              MPadded{CalculateMPadded(M_)},
+              NPadded{CalculateNPadded(N_)},
+              K0{CalculateK0(K)} // reads member K (not K_); valid only because K is declared before K0
+        {
+        }
+
+        __host__ void Print() const // writes all nine fields to std::cout on one line
+        {
+            std::cout << "problem {"
+                      << "M:" << M << ", "
+                      << "N:" << N << ", "
+                      << "K:" << K << ", "
+                      << "SA:" << StrideA << ", "
+                      << "SB:" << StrideB << ", "
+                      << "SC:" << StrideC << ", "
+                      << "MP:" << MPadded << ", "
+                      << "NP:" << NPadded << ", "
+                      << "K0:" << K0 << "}" << std::endl;
+        }
+
+        index_t M;
+        index_t N;
+        index_t K;
+        index_t StrideA;
+        index_t StrideB;
+        index_t StrideC;
+        index_t MPadded; // CalculateMPadded(M)
+        index_t NPadded; // CalculateNPadded(N)
+        index_t K0;      // CalculateK0(K); must stay declared after K (see constructor)
+    };
+
+    // Argument: a Problem plus the A/B/C grid pointers
+    struct Argument : public Problem, public tensor_operation::device::BaseArgument
+    {
+        __host__ Argument(const FloatAB* p_a_grid_,
+                          const FloatAB* p_b_grid_,
+                          FloatC* p_c_grid_,
+                          index_t M_,
+                          index_t N_,
+                          index_t K_,
+                          index_t StrideA_,
+                          index_t StrideB_,
+                          index_t StrideC_)
+            : Problem{M_, N_, K_, StrideA_, StrideB_, StrideC_}, // sizes/strides live in the base
+              p_a_grid{p_a_grid_},
+              p_b_grid{p_b_grid_},
+              p_c_grid{p_c_grid_}
+        {
+        }
+
+        const FloatAB* p_a_grid; // A input (read-only)
+        const FloatAB* p_b_grid; // B input (read-only); shares the FloatAB element type with A
+        FloatC* p_c_grid;        // C output (non-const pointer)
+    };
+
     using GridwiseGemmPipe = remove_cvref_t<decltype(
         GridwiseGemmPipeline_Selector<PipelineVer, NumGemmKPrefetchStage, LoopSched>())>;
 
@@ -204,13 +316,11 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
         return (a_block_space_size_aligned + b_block_space_size_aligned) * sizeof(FloatAB);
     }
 
-    // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
-    template <typename Block2CTileMap>
+    template <typename AGridDesc_K0_M_K1, typename BGridDesc_K0_N_K1, typename CGridDesc_M_N>
     __host__ __device__ static constexpr bool
     CheckValidity(const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1,
                   const BGridDesc_K0_N_K1& b_grid_desc_k0_n_k1,
-                  const CGridDesc_M_N& c_grid_desc_m_n,
-                  const Block2CTileMap& block_2_ctile_map)
+                  const CGridDesc_M_N& c_grid_desc_m_n)
     {
         static_assert(is_known_at_compile_time<remove_cv_t<decltype(K1)>>::value,
                       "wrong! K1 need to be known at compile-time");
@@ -239,7 +349,24 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
             return false;
         }
 
-        if(!block_2_ctile_map.CheckValidity(c_grid_desc_m_n))
+        // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
+        return true;
+    }
+
+    __host__ static constexpr bool CheckValidity(const Problem& problem)
+    {
+        static_assert(is_known_at_compile_time<remove_cv_t<decltype(K1)>>::value,
+                      "wrong! K1 need to be known at compile-time");
+
+        static_assert((MPerBlock % (MPerXDL * MXdlPerWave) == 0) &&
+                          (NPerBlock % (NXdlPerWave * NPerXDL)) == 0,
+                      "Invalid tuning param!");
+
+        // check gridwise gemm pipeline
+        const index_t K0      = problem.K / K1Value;
+        const auto num_k_loop = K0 / K0PerBlock;
+
+        if(!GridwiseGemmPipe::IsSupported(num_k_loop))
         {
             return false;
         }
@@ -248,15 +375,16 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
         return true;
     }
 
-    __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K)
+    __host__ static constexpr bool CalculateHasMainKBlockLoop(index_t K)
     {
         const index_t num_loop = K / (K0PerBlock * K1);
 
         return GridwiseGemmPipe::CalculateHasMainLoop(num_loop);
     }
 
+    template <typename CGridDesc>
     __host__ __device__ static constexpr auto
-    MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(const CGridDesc_M_N& c_grid_desc_m_n)
+    MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(const CGridDesc& c_grid_desc_m_n)
     {
         constexpr auto max_lds_align = K1;
 
@@ -306,31 +434,23 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
     }
 
     // return block_id to C matrix tile idx (m0, n0) mapping
-    __host__ __device__ static constexpr auto MakeDefaultBlock2CTileMap(
-        const CGridDesc_M_N& c_grid_desc_m_n, index_t /* M01 */, index_t /* N01 */)
+    using Block2CTileMap = BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock>;
+
+    template <bool HasMainKBlockLoop,
+              typename AGridDesc_K0_M_K1,
+              typename BGridDesc_K0_N_K1,
+              typename CGridDesc_M_N>
+    __device__ static void Run(const FloatAB* p_a_grid,
+                               const FloatAB* p_b_grid,
+                               FloatC* p_c_grid,
+                               void* __restrict__ p_shared,
+                               const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1,
+                               const BGridDesc_K0_N_K1& b_grid_desc_k0_n_k1,
+                               const CGridDesc_M_N& c_grid_desc_m_n)
     {
-        return BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, CGridDesc_M_N>(
-            c_grid_desc_m_n);
-    }
+        const auto c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2 =
+            MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_grid_desc_m_n);
 
-    using CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 =
-        decltype(MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(CGridDesc_M_N{}));
-    using DefaultBlock2CTileMap = decltype(MakeDefaultBlock2CTileMap(CGridDesc_M_N{}, 1, 1));
-
-    template <bool HasMainKBlockLoop, typename Block2CTileMap = DefaultBlock2CTileMap>
-    __device__ static void
-    Run(const FloatAB* __restrict__ p_a_grid,
-        const FloatAB* __restrict__ p_b_grid,
-        FloatC* __restrict__ p_c_grid,
-        void* __restrict__ p_shared,
-        const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1,
-        const BGridDesc_K0_N_K1& b_grid_desc_k0_n_k1,
-        const CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2& c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2,
-        const AElementwiseOperation& a_element_op,
-        const BElementwiseOperation& b_element_op,
-        const CElementwiseOperation& c_element_op,
-        const Block2CTileMap& block_2_ctile_map)
-    {
         const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_a_grid, a_grid_desc_k0_m_k1.GetElementSpaceSize());
         const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
@@ -338,7 +458,12 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
         auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_c_grid, c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetElementSpaceSize());
 
-        const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0);
+        const AElementwiseOperation a_element_op{};
+        const BElementwiseOperation b_element_op{};
+        const CElementwiseOperation c_element_op{};
+
+        const auto block_2_ctile_map =
+            Block2CTileMap{c_grid_desc_m_n.GetLength(I0), c_grid_desc_m_n.GetLength(I1)};
 
         // divide block work by [M, N]
         const auto block_work_idx =
@@ -467,6 +592,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
         constexpr auto b_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0);
 
         // gridwise GEMM pipeline
+        const auto K0                       = a_grid_desc_k0_m_k1.GetLength(I0);
         const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(K0 / K0PerBlock);
 
         GridwiseGemmPipe::template Run<HasMainKBlockLoop>(a_grid_desc_k0_m_k1,
@@ -565,4 +691,309 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
     }
 };
 
+template <index_t BlockSize,
+          typename FloatAB,
+          typename FloatAcc,
+          typename FloatC,
+          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation,
+          tensor_operation::device::GemmSpecialization GemmSpec,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t K0PerBlock,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t K1Value,
+          index_t MXdlPerWave,
+          index_t NXdlPerWave,
+          typename ABlockTransferThreadClusterLengths_K0_M_K1,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          index_t ABlockTransferSrcVectorDim,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t ABlockTransferDstScalarPerVector_K1,
+          bool AThreadTransferSrcResetCoordinateAfterRun,
+          bool ABlockLdsExtraM,
+          typename BBlockTransferThreadClusterLengths_K0_N_K1,
+          typename BBlockTransferThreadClusterArrangeOrder,
+          typename BBlockTransferSrcAccessOrder,
+          index_t BBlockTransferSrcVectorDim,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t BBlockTransferDstScalarPerVector_K1,
+          bool BThreadTransferSrcResetCoordinateAfterRun,
+          bool BBlockLdsExtraN,
+          typename CThreadTransferSrcDstAccessOrder,
+          index_t CThreadTransferSrcDstVectorDim,
+          index_t CThreadTransferDstScalarPerVector,
+          index_t NumGemmKPrefetchStage = 1,
+          LoopScheduler LoopSched       = make_default_loop_scheduler(),
+          PipelineVersion PipelineVer   = PipelineVersion::v1>
+struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3_ext
+    : GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<BlockSize,
+                                              FloatAB,
+                                              FloatAcc,
+                                              FloatC,
+                                              CGlobalMemoryDataOperation,
+                                              AElementwiseOperation,
+                                              BElementwiseOperation,
+                                              CElementwiseOperation,
+                                              MPerBlock,
+                                              NPerBlock,
+                                              K0PerBlock,
+                                              MPerXDL,
+                                              NPerXDL,
+                                              K1Value,
+                                              MXdlPerWave,
+                                              NXdlPerWave,
+                                              ABlockTransferThreadClusterLengths_K0_M_K1,
+                                              ABlockTransferThreadClusterArrangeOrder,
+                                              ABlockTransferSrcAccessOrder,
+                                              ABlockTransferSrcVectorDim,
+                                              ABlockTransferSrcScalarPerVector,
+                                              ABlockTransferDstScalarPerVector_K1,
+                                              AThreadTransferSrcResetCoordinateAfterRun,
+                                              ABlockLdsExtraM,
+                                              BBlockTransferThreadClusterLengths_K0_N_K1,
+                                              BBlockTransferThreadClusterArrangeOrder,
+                                              BBlockTransferSrcAccessOrder,
+                                              BBlockTransferSrcVectorDim,
+                                              BBlockTransferSrcScalarPerVector,
+                                              BBlockTransferDstScalarPerVector_K1,
+                                              BThreadTransferSrcResetCoordinateAfterRun,
+                                              BBlockLdsExtraN,
+                                              CThreadTransferSrcDstAccessOrder,
+                                              CThreadTransferSrcDstVectorDim,
+                                              CThreadTransferDstScalarPerVector,
+                                              NumGemmKPrefetchStage,
+                                              LoopSched,
+                                              PipelineVer>
+{
+    using Parent =
+        GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<BlockSize,
+                                                FloatAB,
+                                                FloatAcc,
+                                                FloatC,
+                                                CGlobalMemoryDataOperation,
+                                                AElementwiseOperation,
+                                                BElementwiseOperation,
+                                                CElementwiseOperation,
+                                                MPerBlock,
+                                                NPerBlock,
+                                                K0PerBlock,
+                                                MPerXDL,
+                                                NPerXDL,
+                                                K1Value,
+                                                MXdlPerWave,
+                                                NXdlPerWave,
+                                                ABlockTransferThreadClusterLengths_K0_M_K1,
+                                                ABlockTransferThreadClusterArrangeOrder,
+                                                ABlockTransferSrcAccessOrder,
+                                                ABlockTransferSrcVectorDim,
+                                                ABlockTransferSrcScalarPerVector,
+                                                ABlockTransferDstScalarPerVector_K1,
+                                                AThreadTransferSrcResetCoordinateAfterRun,
+                                                ABlockLdsExtraM,
+                                                BBlockTransferThreadClusterLengths_K0_N_K1,
+                                                BBlockTransferThreadClusterArrangeOrder,
+                                                BBlockTransferSrcAccessOrder,
+                                                BBlockTransferSrcVectorDim,
+                                                BBlockTransferSrcScalarPerVector,
+                                                BBlockTransferDstScalarPerVector_K1,
+                                                BThreadTransferSrcResetCoordinateAfterRun,
+                                                BBlockLdsExtraN,
+                                                CThreadTransferSrcDstAccessOrder,
+                                                CThreadTransferSrcDstVectorDim,
+                                                CThreadTransferDstScalarPerVector,
+                                                NumGemmKPrefetchStage,
+                                                LoopSched,
+                                                PipelineVer>;
+
+    using typename Parent::GridwiseGemmPipe;
+    using typename Parent::Problem;
+
+    using Parent::I1;
+
+    using Parent::K1;
+
+    __device__ static auto // device-only: there is no __host__ qualifier
+    MakeAGridDescriptor_K0_M_K1(index_t M, index_t MPad, index_t K, index_t K0, index_t StrideA)
+    {
+        const auto a_grid_desc_m_k = [&]() { // (M, K) descriptor of A; strides chosen by ALayout
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, ALayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA));
+            }
+        }(); // NOTE(review): no fallback branch for layouts other than RowMajor/ColumnMajor
+
+        if constexpr(GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding)
+        {
+            return transform_tensor_descriptor( // MNPadding: M is right-padded by MPad - M
+                a_grid_desc_m_k,
+                make_tuple(make_unmerge_transform(make_tuple(K0, K1Value)),
+                           make_right_pad_transform(M, MPad - M)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+        }
+        else
+        {
+            return transform_tensor_descriptor( // every other GemmSpec: M passed through, MPad unused
+                a_grid_desc_m_k,
+                make_tuple(make_unmerge_transform(make_tuple(K0, K1Value)),
+                           make_pass_through_transform(M)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+        }
+    }
+
+    __device__ static auto // device-only: there is no __host__ qualifier
+    MakeBGridDescriptor_K0_N_K1(index_t K, index_t N, index_t NPad, index_t K0, index_t StrideB)
+    {
+        const auto b_grid_desc_k_n = [&]() { // (K, N) descriptor of B; strides chosen by BLayout
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(StrideB, I1));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(I1, StrideB));
+            }
+        }(); // NOTE(review): no fallback branch for layouts other than RowMajor/ColumnMajor
+
+        if constexpr(GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding)
+        {
+            return transform_tensor_descriptor( // MNPadding: N is right-padded by NPad - N
+                b_grid_desc_k_n,
+                make_tuple(make_unmerge_transform(make_tuple(K0, K1Value)),
+                           make_right_pad_transform(N, NPad - N)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+        }
+        else
+        {
+            return transform_tensor_descriptor( // every other GemmSpec: N passed through, NPad unused
+                b_grid_desc_k_n,
+                make_tuple(make_unmerge_transform(make_tuple(K0, K1Value)),
+                           make_pass_through_transform(N)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+        }
+    }
+
+    __device__ static auto
+    MakeCGridDescriptor_M_N(index_t M, index_t MPad, index_t N, index_t NPad, index_t StrideC)
+    {
+        const auto c_grid_desc_m_n = [&]() {
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, CLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideC));
+            }
+        }();
+
+        if constexpr(GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding)
+        {
+            return transform_tensor_descriptor(c_grid_desc_m_n,
+                                               make_tuple(make_right_pad_transform(M, MPad - M),
+                                                          make_right_pad_transform(N, NPad - N)),
+                                               make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                               make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+        else
+        {
+
+            return transform_tensor_descriptor(
+                c_grid_desc_m_n,
+                make_tuple(make_pass_through_transform(M), make_pass_through_transform(N)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+    }
+
+    __host__ static constexpr bool CheckValidity(const Problem& problem)
+    {
+        static_assert(is_known_at_compile_time<remove_cv_t<decltype(K1)>>::value,
+                      "wrong! K1 need to be known at compile-time");
+
+        static_assert((MPerBlock % (MPerXDL * MXdlPerWave) == 0) &&
+                          (NPerBlock % (NXdlPerWave * NPerXDL)) == 0,
+                      "Invalid tuning param!");
+
+        if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::MPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding))
+        {
+            if(!(problem.M % MPerBlock == 0))
+            {
+                return false;
+            }
+        }
+
+        if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::NPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::NKPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding))
+        {
+            if(!(problem.N % NPerBlock == 0))
+            {
+                return false;
+            }
+        }
+
+        if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
+        {
+            if(problem.K % ABlockTransferSrcScalarPerVector != 0)
+            {
+                return false;
+            }
+        }
+        else
+        {
+            if(problem.M % ABlockTransferSrcScalarPerVector != 0)
+            {
+                return false;
+            }
+        }
+
+        if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
+        {
+            if(problem.N % BBlockTransferSrcScalarPerVector != 0)
+            {
+                return false;
+            }
+        }
+        else
+        {
+            if(problem.K % BBlockTransferSrcScalarPerVector != 0)
+            {
+                return false;
+            }
+        }
+
+        // check gridwise gemm pipeline
+        const index_t K0      = problem.K / K1;
+        const auto num_k_loop = K0 / K0PerBlock;
+
+        if(!GridwiseGemmPipe::IsSupported(num_k_loop))
+        {
+            return false;
+        }
+
+        // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
+        return true;
+    }
+};
+
 } // namespace ck
-- 
GitLab


From e2ebc8e79599531e2b837c27b8aaa0404c5c58ce Mon Sep 17 00:00:00 2001
From: who who who <fsx950223@outlook.com>
Date: Fri, 2 Jun 2023 05:23:41 +0800
Subject: [PATCH 50/71] replace hipMemcpy with hipMemcpyWithStream (#734)

---
 ...grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp |  9 +++++----
 ...ce_grouped_contraction_multiple_d_xdl_cshuffle.hpp | 11 ++++++-----
 .../device/impl/device_grouped_gemm_multiple_d_dl.hpp |  9 +++++----
 .../gpu/device/impl/device_grouped_gemm_xdl.hpp       |  9 +++++----
 .../impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp  |  9 +++++----
 5 files changed, 26 insertions(+), 21 deletions(-)

diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
index 9c8b5f462..a2940fa75 100644
--- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
@@ -611,10 +611,11 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
                 some_has_main_k_block_loop |= y;
             }
 
-            hipGetErrorString(hipMemcpy(arg.p_workspace_,
-                                        arg.group_kernel_args_.data(),
-                                        arg.group_kernel_args_.size() * sizeof(GroupKernelArg),
-                                        hipMemcpyHostToDevice));
+            hipGetErrorString(hipMemcpyWithStream(arg.p_workspace_,
+                                                  arg.group_kernel_args_.data(),
+                                                  arg.group_kernel_args_.size() * sizeof(GroupKernelArg),
+                                                  hipMemcpyHostToDevice,
+                                                  stream_config.stream_id_));
 
             float ave_time = 0;
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp
index a275ee102..efe3a69ac 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp
@@ -652,11 +652,12 @@ struct DeviceGroupedContractionMultipleD_Xdl_CShuffle
                 }
             }
 
-            hipGetErrorString(hipMemcpy(arg.p_workspace_,
-                                        arg.contraction_multi_d_kernel_args_.data(),
-                                        arg.contraction_multi_d_kernel_args_.size() *
-                                            sizeof(ContractionMultiDKernelArg),
-                                        hipMemcpyHostToDevice));
+            hipGetErrorString(hipMemcpyWithStream(arg.p_workspace_,
+                                                  arg.contraction_multi_d_kernel_args_.data(),
+                                                  arg.contraction_multi_d_kernel_args_.size() *
+                                                  sizeof(ContractionMultiDKernelArg),
+                                                  hipMemcpyHostToDevice,
+                                                  stream_config.stream_id_));
 
             float ave_time = 0;
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp
index d1f1b7fcc..2a3e2b6cf 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp
@@ -597,10 +597,11 @@ struct DeviceGroupedGemmMultipleD_Dl : public DeviceGroupedGemm<ALayout,
                 }
             }
 
-            hipGetErrorString(hipMemcpy(arg.p_workspace_,
-                                        arg.gemm_desc_kernel_arg_.data(),
-                                        arg.gemm_desc_kernel_arg_.size() * sizeof(GemmKernelArg),
-                                        hipMemcpyHostToDevice));
+            hipGetErrorString(hipMemcpyWithStream(arg.p_workspace_,
+                                                  arg.gemm_desc_kernel_arg_.data(),
+                                                  arg.gemm_desc_kernel_arg_.size() * sizeof(GemmKernelArg),
+                                                  hipMemcpyHostToDevice,
+                                                  stream_config.stream_id_));
 
             auto launch_kernel = [&](auto has_main_k_block_loop,
                                      auto has_double_tail_k_block_loop) {
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp
index 77a8e6ecb..e5bf395be 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp
@@ -549,10 +549,11 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm<ALayout,
             }
 
             hipGetErrorString(
-                hipMemcpy(arg.p_workspace_,
-                          arg.gemm_desc_kernel_arg_.data(),
-                          arg.gemm_desc_kernel_arg_.size() * sizeof(GemmBiasTransKernelArg),
-                          hipMemcpyHostToDevice));
+                hipMemcpyWithStream(arg.p_workspace_,
+                                    arg.gemm_desc_kernel_arg_.data(),
+                                    arg.gemm_desc_kernel_arg_.size() * sizeof(GemmBiasTransKernelArg),
+                                    hipMemcpyHostToDevice,
+                                    stream_config.stream_id_));
 
             float ave_time = 0;
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
index cd39cc983..ebeea71db 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
@@ -406,10 +406,11 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
                 }
             }
 
-            hip_check_error(hipMemcpy(arg.p_workspace_,
-                                      arg.gemm_kernel_args_.data(),
-                                      arg.gemm_kernel_args_.size() * sizeof(GemmTransKernelArg),
-                                      hipMemcpyHostToDevice));
+            hip_check_error(hipMemcpyWithStream(arg.p_workspace_,
+                                                arg.gemm_kernel_args_.data(),
+                                                arg.gemm_kernel_args_.size() * sizeof(GemmTransKernelArg),
+                                                hipMemcpyHostToDevice,
+                                                stream_config.stream_id_));
 
             float ave_time = 0;
 
-- 
GitLab


From 403659040105e6877cba1d454ccfd85bece7fc09 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Fri, 2 Jun 2023 14:10:02 -0700
Subject: [PATCH 51/71] fix clang format (#740)

---
 ...rouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp | 11 ++++++-----
 ...e_grouped_contraction_multiple_d_xdl_cshuffle.hpp |  2 +-
 .../impl/device_grouped_gemm_multiple_d_dl.hpp       | 11 ++++++-----
 .../gpu/device/impl/device_grouped_gemm_xdl.hpp      | 12 ++++++------
 .../impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp | 11 ++++++-----
 5 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
index a2940fa75..30e29cc8e 100644
--- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
@@ -611,11 +611,12 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
                 some_has_main_k_block_loop |= y;
             }
 
-            hipGetErrorString(hipMemcpyWithStream(arg.p_workspace_,
-                                                  arg.group_kernel_args_.data(),
-                                                  arg.group_kernel_args_.size() * sizeof(GroupKernelArg),
-                                                  hipMemcpyHostToDevice,
-                                                  stream_config.stream_id_));
+            hipGetErrorString(
+                hipMemcpyWithStream(arg.p_workspace_,
+                                    arg.group_kernel_args_.data(),
+                                    arg.group_kernel_args_.size() * sizeof(GroupKernelArg),
+                                    hipMemcpyHostToDevice,
+                                    stream_config.stream_id_));
 
             float ave_time = 0;
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp
index efe3a69ac..5775ff397 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp
@@ -655,7 +655,7 @@ struct DeviceGroupedContractionMultipleD_Xdl_CShuffle
             hipGetErrorString(hipMemcpyWithStream(arg.p_workspace_,
                                                   arg.contraction_multi_d_kernel_args_.data(),
                                                   arg.contraction_multi_d_kernel_args_.size() *
-                                                  sizeof(ContractionMultiDKernelArg),
+                                                      sizeof(ContractionMultiDKernelArg),
                                                   hipMemcpyHostToDevice,
                                                   stream_config.stream_id_));
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp
index 2a3e2b6cf..22be58259 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp
@@ -597,11 +597,12 @@ struct DeviceGroupedGemmMultipleD_Dl : public DeviceGroupedGemm<ALayout,
                 }
             }
 
-            hipGetErrorString(hipMemcpyWithStream(arg.p_workspace_,
-                                                  arg.gemm_desc_kernel_arg_.data(),
-                                                  arg.gemm_desc_kernel_arg_.size() * sizeof(GemmKernelArg),
-                                                  hipMemcpyHostToDevice,
-                                                  stream_config.stream_id_));
+            hipGetErrorString(
+                hipMemcpyWithStream(arg.p_workspace_,
+                                    arg.gemm_desc_kernel_arg_.data(),
+                                    arg.gemm_desc_kernel_arg_.size() * sizeof(GemmKernelArg),
+                                    hipMemcpyHostToDevice,
+                                    stream_config.stream_id_));
 
             auto launch_kernel = [&](auto has_main_k_block_loop,
                                      auto has_double_tail_k_block_loop) {
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp
index e5bf395be..390004756 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp
@@ -548,12 +548,12 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm<ALayout,
                 }
             }
 
-            hipGetErrorString(
-                hipMemcpyWithStream(arg.p_workspace_,
-                                    arg.gemm_desc_kernel_arg_.data(),
-                                    arg.gemm_desc_kernel_arg_.size() * sizeof(GemmBiasTransKernelArg),
-                                    hipMemcpyHostToDevice,
-                                    stream_config.stream_id_));
+            hipGetErrorString(hipMemcpyWithStream(arg.p_workspace_,
+                                                  arg.gemm_desc_kernel_arg_.data(),
+                                                  arg.gemm_desc_kernel_arg_.size() *
+                                                      sizeof(GemmBiasTransKernelArg),
+                                                  hipMemcpyHostToDevice,
+                                                  stream_config.stream_id_));
 
             float ave_time = 0;
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
index ebeea71db..1ac9969f8 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
@@ -406,11 +406,12 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
                 }
             }
 
-            hip_check_error(hipMemcpyWithStream(arg.p_workspace_,
-                                                arg.gemm_kernel_args_.data(),
-                                                arg.gemm_kernel_args_.size() * sizeof(GemmTransKernelArg),
-                                                hipMemcpyHostToDevice,
-                                                stream_config.stream_id_));
+            hip_check_error(
+                hipMemcpyWithStream(arg.p_workspace_,
+                                    arg.gemm_kernel_args_.data(),
+                                    arg.gemm_kernel_args_.size() * sizeof(GemmTransKernelArg),
+                                    hipMemcpyHostToDevice,
+                                    stream_config.stream_id_));
 
             float ave_time = 0;
 
-- 
GitLab


From 1dd455d6337cf7ed3ec6cb82a1618701a5c17351 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Wed, 7 Jun 2023 09:35:14 -0700
Subject: [PATCH 52/71] Update docker (#744)

* update dockerfile to build rocm5.6 rc3

* fix a couple of docker issues
---
 Dockerfile | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 8e6ddb1eb..710db05c0 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,5 +1,5 @@
 FROM ubuntu:20.04
-
+ARG DEBIAN_FRONTEND=noninteractive
 ARG ROCMVERSION=5.6
 ARG compiler_version=""
 ARG compiler_commit=""
@@ -9,23 +9,30 @@ RUN set -xe
 ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/
 RUN useradd -rm -d /home/jenkins -s /bin/bash -u 1004 jenkins
 # Add rocm repository
+RUN chmod 1777 /tmp
 RUN apt-get update
-RUN apt-get install -y wget gnupg curl
-RUN --mount=type=ssh if [ "$ROCMVERSION" != "5.6"]; then \
-	wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
+RUN apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl
+RUN --mount=type=ssh if [ "$ROCMVERSION" != "5.6" ]; then \
+        wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
         sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list"; \
-    else sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amd-nonfree-radeon_20.04-1_all.deb" && \
+    elif [ "$ROCMVERSION" = "5.6" ] && [ "$compiler_version" = "" ]; then \
+         sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amd-nonfree-radeon_20.04-1_all.deb" && \
          apt update && apt-get install -y ./amd-nonfree-radeon_20.04-1_all.deb && \
-         amdgpu-repo --amdgpu-build=1567752 --rocm-build=compute-rocm-dkms-no-npi-hipclang/11914 && \
-         DEBIAN_FRONTEND=noninteractive amdgpu-install -y --usecase=rocm ; \
+         amdgpu-repo --amdgpu-build=1567752 --rocm-build=compute-rocm-dkms-no-npi-hipclang/11914; \
+    elif [ "$ROCMVERSION" = "5.6" ] && [ "$compiler_version" = "rc3" ]; then \
+         sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_5.6-20.04-1_all.deb" && \
+         apt update && apt-get install -y ./amdgpu-install-internal_5.6-20.04-1_all.deb && \
+         sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 5.6 rel-45  > /etc/apt/sources.list.d/rocm-build.list' && \
+         amdgpu-repo --amdgpu-build=1602498; \
     fi
+RUN amdgpu-install -y --usecase=rocm --no-dkms
+
 RUN wget --no-check-certificate -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add -
 RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
 RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
 
 # Install dependencies
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
-    apt-utils \
     build-essential \
     ccache \
     cmake \
@@ -38,16 +45,11 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
     libpthread-stubs0-dev \
     llvm-amdgpu \
     pkg-config \
-    python \
     python3 \
-    python-dev \
     python3-dev \
     python3-pip \
     sshpass \
     software-properties-common \
-    rocm-dev \
-    rocm-device-libs \
-    rocm-cmake \
     vim \
     nano \
     zlib1g-dev \
-- 
GitLab


From 016ebaa7f33d2c3e86cd617210bd636fe7c99b42 Mon Sep 17 00:00:00 2001
From: carlushuang <carlus.huang@amd.com>
Date: Thu, 8 Jun 2023 20:40:29 +0800
Subject: [PATCH 53/71] support dynamic buffer using memory coherence glc_slc
 bit from template (#725)

---
 include/ck/utility/amd_buffer_addressing.hpp | 284 ++++++++++++-------
 include/ck/utility/dynamic_buffer.hpp        |  23 +-
 2 files changed, 201 insertions(+), 106 deletions(-)

diff --git a/include/ck/utility/amd_buffer_addressing.hpp b/include/ck/utility/amd_buffer_addressing.hpp
index ef3d2032c..38ee76d88 100644
--- a/include/ck/utility/amd_buffer_addressing.hpp
+++ b/include/ck/utility/amd_buffer_addressing.hpp
@@ -286,7 +286,22 @@ llvm_amdgcn_raw_buffer_atomic_max_fp64(double vdata,
                                        int soffset,    // dst_wave_addr_offset
                                        int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64");
 
-template <typename T, index_t N>
+// memory coherency bit for buffer store/load instruction
+// check ISA manual for each GFX target
+// e.g. for
+// https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf,
+// page 67~68
+enum struct AmdBufferCoherenceEnum
+{
+    DefaultCoherence = 0, // default value
+    GLC              = 1,
+    SLC              = 2,
+    GLC_SLC          = 3,
+};
+
+template <typename T,
+          index_t N,
+          AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
 __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_wave_buffer_resource,
                                                                  index_t src_thread_addr_offset,
                                                                  index_t src_wave_addr_offset)
@@ -305,28 +320,37 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w
         // use fp32 load to mimic fp64 load
         if constexpr(N == 1)
         {
-            const float2_t tmp = llvm_amdgcn_raw_buffer_load_fp32x2(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            const float2_t tmp =
+                llvm_amdgcn_raw_buffer_load_fp32x2(src_wave_buffer_resource,
+                                                   src_thread_addr_offset,
+                                                   src_wave_addr_offset,
+                                                   static_cast<index_t>(coherence));
 
             return bit_cast<double>(tmp);
         }
         else if constexpr(N == 2)
         {
-            const float4_t tmp = llvm_amdgcn_raw_buffer_load_fp32x4(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            const float4_t tmp =
+                llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
+                                                   src_thread_addr_offset,
+                                                   src_wave_addr_offset,
+                                                   static_cast<index_t>(coherence));
 
             return bit_cast<double2_t>(tmp);
         }
         else if constexpr(N == 4)
         {
-            const float4_t f32_0 = llvm_amdgcn_raw_buffer_load_fp32x4(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            const float4_t f32_0 =
+                llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
+                                                   src_thread_addr_offset,
+                                                   src_wave_addr_offset,
+                                                   static_cast<index_t>(coherence));
 
             const float4_t f32_1 =
                 llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
                                                    src_thread_addr_offset,
                                                    src_wave_addr_offset + 4 * sizeof(float),
-                                                   0);
+                                                   static_cast<index_t>(coherence));
             vector_type<double, 4> tmp;
 
             tmp.AsType<double2_t>()(Number<0>{}) = bit_cast<double2_t>(f32_0);
@@ -339,31 +363,40 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w
     {
         if constexpr(N == 1)
         {
-            return llvm_amdgcn_raw_buffer_load_fp32(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            return llvm_amdgcn_raw_buffer_load_fp32(src_wave_buffer_resource,
+                                                    src_thread_addr_offset,
+                                                    src_wave_addr_offset,
+                                                    static_cast<index_t>(coherence));
         }
         else if constexpr(N == 2)
         {
-            return llvm_amdgcn_raw_buffer_load_fp32x2(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            return llvm_amdgcn_raw_buffer_load_fp32x2(src_wave_buffer_resource,
+                                                      src_thread_addr_offset,
+                                                      src_wave_addr_offset,
+                                                      static_cast<index_t>(coherence));
         }
         else if constexpr(N == 4)
         {
-            return llvm_amdgcn_raw_buffer_load_fp32x4(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            return llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
+                                                      src_thread_addr_offset,
+                                                      src_wave_addr_offset,
+                                                      static_cast<index_t>(coherence));
         }
         else if constexpr(N == 8)
         {
             vector_type<float, 8> tmp;
 
-            tmp.AsType<float4_t>()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_fp32x4(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            tmp.AsType<float4_t>()(Number<0>{}) =
+                llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
+                                                   src_thread_addr_offset,
+                                                   src_wave_addr_offset,
+                                                   static_cast<index_t>(coherence));
 
             tmp.AsType<float4_t>()(Number<1>{}) =
                 llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
                                                    src_thread_addr_offset,
                                                    src_wave_addr_offset + 4 * sizeof(float),
-                                                   0);
+                                                   static_cast<index_t>(coherence));
 
             return tmp.AsType<float8_t>()(Number<0>{});
         }
@@ -372,24 +405,32 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w
     {
         if constexpr(N == 1)
         {
-            return llvm_amdgcn_raw_buffer_load_fp16(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            return llvm_amdgcn_raw_buffer_load_fp16(src_wave_buffer_resource,
+                                                    src_thread_addr_offset,
+                                                    src_wave_addr_offset,
+                                                    static_cast<index_t>(coherence));
         }
         else if constexpr(N == 2)
         {
-            return llvm_amdgcn_raw_buffer_load_fp16x2(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            return llvm_amdgcn_raw_buffer_load_fp16x2(src_wave_buffer_resource,
+                                                      src_thread_addr_offset,
+                                                      src_wave_addr_offset,
+                                                      static_cast<index_t>(coherence));
         }
         else if constexpr(N == 4)
         {
-            return llvm_amdgcn_raw_buffer_load_fp16x4(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            return llvm_amdgcn_raw_buffer_load_fp16x4(src_wave_buffer_resource,
+                                                      src_thread_addr_offset,
+                                                      src_wave_addr_offset,
+                                                      static_cast<index_t>(coherence));
         }
         else if constexpr(N == 8)
         {
             // use fp32 load to mimic fp16 load
-            float4_t tmp = llvm_amdgcn_raw_buffer_load_fp32x4(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            float4_t tmp = llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
+                                                              src_thread_addr_offset,
+                                                              src_wave_addr_offset,
+                                                              static_cast<index_t>(coherence));
 
             return bit_cast<half8_t>(tmp);
         }
@@ -398,23 +439,31 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w
     {
         if constexpr(N == 1)
         {
-            return llvm_amdgcn_raw_buffer_load_i16(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            return llvm_amdgcn_raw_buffer_load_i16(src_wave_buffer_resource,
+                                                   src_thread_addr_offset,
+                                                   src_wave_addr_offset,
+                                                   static_cast<index_t>(coherence));
         }
         else if constexpr(N == 2)
         {
-            return llvm_amdgcn_raw_buffer_load_i16x2(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            return llvm_amdgcn_raw_buffer_load_i16x2(src_wave_buffer_resource,
+                                                     src_thread_addr_offset,
+                                                     src_wave_addr_offset,
+                                                     static_cast<index_t>(coherence));
         }
         else if constexpr(N == 4)
         {
-            return llvm_amdgcn_raw_buffer_load_i16x4(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            return llvm_amdgcn_raw_buffer_load_i16x4(src_wave_buffer_resource,
+                                                     src_thread_addr_offset,
+                                                     src_wave_addr_offset,
+                                                     static_cast<index_t>(coherence));
         }
         else if constexpr(N == 8)
         {
-            int32x4_t tmp = llvm_amdgcn_raw_buffer_load_i32x4(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            int32x4_t tmp = llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
+                                                              src_thread_addr_offset,
+                                                              src_wave_addr_offset,
+                                                              static_cast<index_t>(coherence));
 
             return bit_cast<bhalf8_t>(tmp);
         }
@@ -423,31 +472,40 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w
     {
         if constexpr(N == 1)
         {
-            return llvm_amdgcn_raw_buffer_load_i32(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            return llvm_amdgcn_raw_buffer_load_i32(src_wave_buffer_resource,
+                                                   src_thread_addr_offset,
+                                                   src_wave_addr_offset,
+                                                   static_cast<index_t>(coherence));
         }
         else if constexpr(N == 2)
         {
-            return llvm_amdgcn_raw_buffer_load_i32x2(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            return llvm_amdgcn_raw_buffer_load_i32x2(src_wave_buffer_resource,
+                                                     src_thread_addr_offset,
+                                                     src_wave_addr_offset,
+                                                     static_cast<index_t>(coherence));
         }
         else if constexpr(N == 4)
         {
-            return llvm_amdgcn_raw_buffer_load_i32x4(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            return llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
+                                                     src_thread_addr_offset,
+                                                     src_wave_addr_offset,
+                                                     static_cast<index_t>(coherence));
         }
         else if constexpr(N == 8)
         {
             vector_type<int32_t, 8> tmp;
 
-            tmp.AsType<int32x4_t>()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_i32x4(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            tmp.AsType<int32x4_t>()(Number<0>{}) =
+                llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
+                                                  src_thread_addr_offset,
+                                                  src_wave_addr_offset,
+                                                  static_cast<index_t>(coherence));
 
             tmp.AsType<int32x4_t>()(Number<1>{}) =
                 llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
                                                   src_thread_addr_offset,
                                                   src_wave_addr_offset + 4 * sizeof(int32_t),
-                                                  0);
+                                                  static_cast<index_t>(coherence));
             return tmp.AsType<int32x8_t>()(Number<0>{});
         }
     }
@@ -455,17 +513,23 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w
     {
         if constexpr(N == 1)
         {
-            return llvm_amdgcn_raw_buffer_load_i8(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            return llvm_amdgcn_raw_buffer_load_i8(src_wave_buffer_resource,
+                                                  src_thread_addr_offset,
+                                                  src_wave_addr_offset,
+                                                  static_cast<index_t>(coherence));
         }
         else if constexpr(N == 2)
         {
 #if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
-            return llvm_amdgcn_raw_buffer_load_i8x2(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            return llvm_amdgcn_raw_buffer_load_i8x2(src_wave_buffer_resource,
+                                                    src_thread_addr_offset,
+                                                    src_wave_addr_offset,
+                                                    static_cast<index_t>(coherence));
 #else
-            int16_t tmp = llvm_amdgcn_raw_buffer_load_i16(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            int16_t tmp = llvm_amdgcn_raw_buffer_load_i16(src_wave_buffer_resource,
+                                                          src_thread_addr_offset,
+                                                          src_wave_addr_offset,
+                                                          static_cast<index_t>(coherence));
 
             return bit_cast<int8x2_t>(tmp);
 #endif
@@ -473,11 +537,15 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w
         else if constexpr(N == 4)
         {
 #if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
-            return llvm_amdgcn_raw_buffer_load_i8x4(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            return llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
+                                                    src_thread_addr_offset,
+                                                    src_wave_addr_offset,
+                                                    static_cast<index_t>(coherence));
 #else
-            int32_t tmp = llvm_amdgcn_raw_buffer_load_i32(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            int32_t tmp = llvm_amdgcn_raw_buffer_load_i32(src_wave_buffer_resource,
+                                                          src_thread_addr_offset,
+                                                          src_wave_addr_offset,
+                                                          static_cast<index_t>(coherence));
 
             return bit_cast<int8x4_t>(tmp);
 #endif
@@ -487,19 +555,24 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w
 #if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
             vector_type<int8_t, 8> tmp;
 
-            tmp.AsType<int8x4_t>()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_i8x4(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            tmp.AsType<int8x4_t>()(Number<0>{}) =
+                llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
+                                                 src_thread_addr_offset,
+                                                 src_wave_addr_offset,
+                                                 static_cast<index_t>(coherence));
 
             tmp.AsType<int8x4_t>()(Number<1>{}) =
                 llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
                                                  src_thread_addr_offset,
                                                  src_wave_addr_offset + 4 * sizeof(int8_t),
-                                                 0);
+                                                 static_cast<index_t>(coherence));
 
             return tmp.AsType<int8x8_t>()(Number<0>{});
 #else
-            int32x2_t tmp = llvm_amdgcn_raw_buffer_load_i32x2(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            int32x2_t tmp = llvm_amdgcn_raw_buffer_load_i32x2(src_wave_buffer_resource,
+                                                              src_thread_addr_offset,
+                                                              src_wave_addr_offset,
+                                                              static_cast<index_t>(coherence));
 
             return bit_cast<int8x8_t>(tmp);
 #endif
@@ -509,31 +582,36 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w
 #if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
             vector_type<int8_t, 16> tmp;
 
-            tmp.AsType<int8x4_t>()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_i8x4(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            tmp.AsType<int8x4_t>()(Number<0>{}) =
+                llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
+                                                 src_thread_addr_offset,
+                                                 src_wave_addr_offset,
+                                                 static_cast<index_t>(coherence));
 
             tmp.AsType<int8x4_t>()(Number<1>{}) =
                 llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
                                                  src_thread_addr_offset,
                                                  src_wave_addr_offset + 4 * sizeof(int8_t),
-                                                 0);
+                                                 static_cast<index_t>(coherence));
 
             tmp.AsType<int8x4_t>()(Number<2>{}) =
                 llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
                                                  src_thread_addr_offset,
                                                  src_wave_addr_offset + 8 * sizeof(int8_t),
-                                                 0);
+                                                 static_cast<index_t>(coherence));
 
             tmp.AsType<int8x4_t>()(Number<3>{}) =
                 llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
                                                  src_thread_addr_offset,
                                                  src_wave_addr_offset + 12 * sizeof(int8_t),
-                                                 0);
+                                                 static_cast<index_t>(coherence));
 
             return tmp.AsType<int8x16_t>()(Number<0>{});
 #else
-            int32x4_t tmp = llvm_amdgcn_raw_buffer_load_i32x4(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            int32x4_t tmp = llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
+                                                              src_thread_addr_offset,
+                                                              src_wave_addr_offset,
+                                                              static_cast<index_t>(coherence));
 
             return bit_cast<int8x16_t>(tmp);
 #endif
@@ -541,7 +619,9 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w
     }
 }
 
-template <typename T, index_t N>
+template <typename T,
+          index_t N,
+          AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
 __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src_thread_data,
                                       int32x4_t dst_wave_buffer_resource,
                                       index_t dst_thread_addr_offset,
@@ -565,7 +645,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                                 dst_wave_buffer_resource,
                                                 dst_thread_addr_offset,
                                                 dst_wave_addr_offset,
-                                                0);
+                                                static_cast<index_t>(coherence));
         }
         else if constexpr(N == 2)
         {
@@ -573,7 +653,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                                 dst_wave_buffer_resource,
                                                 dst_thread_addr_offset,
                                                 dst_wave_addr_offset,
-                                                0);
+                                                static_cast<index_t>(coherence));
         }
     }
     else if constexpr(is_same<T, float>::value)
@@ -584,7 +664,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset,
-                                              0);
+                                              static_cast<index_t>(coherence));
         }
         else if constexpr(N == 2)
         {
@@ -592,7 +672,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                                 dst_wave_buffer_resource,
                                                 dst_thread_addr_offset,
                                                 dst_wave_addr_offset,
-                                                0);
+                                                static_cast<index_t>(coherence));
         }
         else if constexpr(N == 4)
         {
@@ -600,7 +680,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                                 dst_wave_buffer_resource,
                                                 dst_thread_addr_offset,
                                                 dst_wave_addr_offset,
-                                                0);
+                                                static_cast<index_t>(coherence));
         }
     }
     else if constexpr(is_same<T, half_t>::value)
@@ -611,7 +691,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset,
-                                              0);
+                                              static_cast<index_t>(coherence));
         }
         else if constexpr(N == 2)
         {
@@ -619,7 +699,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                                 dst_wave_buffer_resource,
                                                 dst_thread_addr_offset,
                                                 dst_wave_addr_offset,
-                                                0);
+                                                static_cast<index_t>(coherence));
         }
         else if constexpr(N == 4)
         {
@@ -627,7 +707,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                                 dst_wave_buffer_resource,
                                                 dst_thread_addr_offset,
                                                 dst_wave_addr_offset,
-                                                0);
+                                                static_cast<index_t>(coherence));
         }
         else if constexpr(N == 8)
         {
@@ -638,19 +718,19 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                                 dst_wave_buffer_resource,
                                                 dst_thread_addr_offset,
                                                 dst_wave_addr_offset,
-                                                0);
+                                                static_cast<index_t>(coherence));
 
             llvm_amdgcn_raw_buffer_store_fp16x4(tmp.AsType<half4_t>()[Number<1>{}],
                                                 dst_wave_buffer_resource,
                                                 dst_thread_addr_offset,
                                                 dst_wave_addr_offset + 4 * sizeof(half_t),
-                                                0);
+                                                static_cast<index_t>(coherence));
 #else
             llvm_amdgcn_raw_buffer_store_fp32x4(bit_cast<float4_t>(src_thread_data),
                                                 dst_wave_buffer_resource,
                                                 dst_thread_addr_offset,
                                                 dst_wave_addr_offset,
-                                                0);
+                                                static_cast<index_t>(coherence));
 #endif
         }
     }
@@ -662,7 +742,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                              dst_wave_buffer_resource,
                                              dst_thread_addr_offset,
                                              dst_wave_addr_offset,
-                                             0);
+                                             static_cast<index_t>(coherence));
         }
         else if constexpr(N == 2)
         {
@@ -670,7 +750,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset,
-                                               0);
+                                               static_cast<index_t>(coherence));
         }
         else if constexpr(N == 4)
         {
@@ -678,7 +758,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset,
-                                               0);
+                                               static_cast<index_t>(coherence));
         }
         else if constexpr(N == 8)
         {
@@ -688,13 +768,13 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset,
-                                               0);
+                                               static_cast<index_t>(coherence));
 
             llvm_amdgcn_raw_buffer_store_i16x4(tmp.AsType<bhalf4_t>()[Number<1>{}],
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset + 4 * sizeof(bhalf_t),
-                                               0);
+                                               static_cast<index_t>(coherence));
         }
     }
     else if constexpr(is_same<T, int32_t>::value)
@@ -705,7 +785,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                              dst_wave_buffer_resource,
                                              dst_thread_addr_offset,
                                              dst_wave_addr_offset,
-                                             0);
+                                             static_cast<index_t>(coherence));
         }
         else if constexpr(N == 2)
         {
@@ -713,7 +793,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset,
-                                               0);
+                                               static_cast<index_t>(coherence));
         }
         else if constexpr(N == 4)
         {
@@ -721,7 +801,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset,
-                                               0);
+                                               static_cast<index_t>(coherence));
         }
     }
     else if constexpr(is_same<T, int8_t>::value)
@@ -732,7 +812,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                             dst_wave_buffer_resource,
                                             dst_thread_addr_offset,
                                             dst_wave_addr_offset,
-                                            0);
+                                            static_cast<index_t>(coherence));
         }
         else if constexpr(N == 2)
         {
@@ -741,13 +821,13 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset,
-                                              0);
+                                              static_cast<index_t>(coherence));
 #else
             llvm_amdgcn_raw_buffer_store_i16(bit_cast<int16_t>(src_thread_data),
                                              dst_wave_buffer_resource,
                                              dst_thread_addr_offset,
                                              dst_wave_addr_offset,
-                                             0);
+                                             static_cast<index_t>(coherence));
 #endif
         }
         else if constexpr(N == 4)
@@ -757,13 +837,13 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset,
-                                              0);
+                                              static_cast<index_t>(coherence));
 #else
             llvm_amdgcn_raw_buffer_store_i32(bit_cast<int32_t>(src_thread_data),
                                              dst_wave_buffer_resource,
                                              dst_thread_addr_offset,
                                              dst_wave_addr_offset,
-                                             0);
+                                             static_cast<index_t>(coherence));
 #endif
         }
         else if constexpr(N == 8)
@@ -772,7 +852,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset,
-                                               0);
+                                               static_cast<index_t>(coherence));
         }
         else if constexpr(N == 16)
         {
@@ -780,7 +860,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset,
-                                               0);
+                                               static_cast<index_t>(coherence));
         }
     }
 }
@@ -1012,7 +1092,9 @@ __device__ void amd_buffer_atomic_max_impl(const typename vector_type<T, N>::typ
 //   1) p_src_wave must point to global memory space
 //   2) p_src_wave must be a wavewise pointer.
 // It is user's responsibility to make sure that is true.
-template <typename T, index_t N>
+template <typename T,
+          index_t N,
+          AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
 __device__ typename vector_type_maker<T, N>::type::type
 amd_buffer_load_invalid_element_return_zero(const T* p_src_wave,
                                             index_t src_thread_element_offset,
@@ -1032,10 +1114,10 @@ amd_buffer_load_invalid_element_return_zero(const T* p_src_wave,
 #if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
     uint32_t src_addr_shift = src_thread_element_valid ? 0 : 0x80000000;
 
-    return amd_buffer_load_impl<scalar_t, vector_size>(
+    return amd_buffer_load_impl<scalar_t, vector_size, coherence>(
         src_wave_buffer_resource, src_addr_shift + src_thread_addr_offset, 0);
 #else
-    vector_t tmp = amd_buffer_load_impl<scalar_t, vector_size>(
+    vector_t tmp = amd_buffer_load_impl<scalar_t, vector_size, coherence>(
         src_wave_buffer_resource, src_thread_addr_offset, 0);
 
     return src_thread_element_valid ? tmp : vector_t(0);
@@ -1046,7 +1128,9 @@ amd_buffer_load_invalid_element_return_zero(const T* p_src_wave,
 //   1) p_src_wave must point to global memory space
 //   2) p_src_wave must be a wavewise pointer.
 // It is user's responsibility to make sure that is true.
-template <typename T, index_t N>
+template <typename T,
+          index_t N,
+          AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
 __device__ typename vector_type_maker<T, N>::type::type
 amd_buffer_load_invalid_element_return_customized_value(const T* p_src_wave,
                                                         index_t src_thread_element_offset,
@@ -1064,7 +1148,7 @@ amd_buffer_load_invalid_element_return_customized_value(const T* p_src_wave,
 
     constexpr index_t vector_size = scalar_type<vector_t>::vector_size;
 
-    vector_t tmp = amd_buffer_load_impl<scalar_t, vector_size>(
+    vector_t tmp = amd_buffer_load_impl<scalar_t, vector_size, coherence>(
         src_wave_buffer_resource, src_thread_addr_offset, 0);
 
     return src_thread_element_valid ? tmp : vector_t(customized_value);
@@ -1074,7 +1158,9 @@ amd_buffer_load_invalid_element_return_customized_value(const T* p_src_wave,
 //   1) p_dst_wave must point to global memory
 //   2) p_dst_wave must be a wavewise pointer.
 // It is user's responsibility to make sure that is true.
-template <typename T, index_t N>
+template <typename T,
+          index_t N,
+          AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
 __device__ void amd_buffer_store(const typename vector_type_maker<T, N>::type::type src_thread_data,
                                  T* p_dst_wave,
                                  const index_t dst_thread_element_offset,
@@ -1093,12 +1179,12 @@ __device__ void amd_buffer_store(const typename vector_type_maker<T, N>::type::t
 #if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
     uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
 
-    amd_buffer_store_impl<scalar_t, vector_size>(
+    amd_buffer_store_impl<scalar_t, vector_size, coherence>(
         src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
 #else
     if(dst_thread_element_valid)
     {
-        amd_buffer_store_impl<scalar_t, vector_size>(
+        amd_buffer_store_impl<scalar_t, vector_size, coherence>(
             src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
     }
 #endif
diff --git a/include/ck/utility/dynamic_buffer.hpp b/include/ck/utility/dynamic_buffer.hpp
index 9ea0d6c00..02d61f34e 100644
--- a/include/ck/utility/dynamic_buffer.hpp
+++ b/include/ck/utility/dynamic_buffer.hpp
@@ -19,7 +19,8 @@ namespace ck {
 template <AddressSpaceEnum BufferAddressSpace,
           typename T,
           typename ElementSpaceSize,
-          bool InvalidElementUseNumericalZeroValue>
+          bool InvalidElementUseNumericalZeroValue,
+          AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
 struct DynamicBuffer
 {
     using type = T;
@@ -77,13 +78,16 @@ struct DynamicBuffer
 
             if constexpr(InvalidElementUseNumericalZeroValue)
             {
-                return amd_buffer_load_invalid_element_return_zero<remove_cvref_t<T>, t_per_x>(
+                return amd_buffer_load_invalid_element_return_zero<remove_cvref_t<T>,
+                                                                   t_per_x,
+                                                                   coherence>(
                     p_data_, i, is_valid_element, element_space_size_);
             }
             else
             {
                 return amd_buffer_load_invalid_element_return_customized_value<remove_cvref_t<T>,
-                                                                               t_per_x>(
+                                                                               t_per_x,
+                                                                               coherence>(
                     p_data_, i, is_valid_element, element_space_size_, invalid_element_value_);
             }
         }
@@ -173,7 +177,7 @@ struct DynamicBuffer
         {
             constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
 
-            amd_buffer_store<remove_cvref_t<T>, t_per_x>(
+            amd_buffer_store<remove_cvref_t<T>, t_per_x, coherence>(
                 x, p_data_, i, is_valid_element, element_space_size_);
         }
         else if constexpr(GetAddressSpace() == AddressSpaceEnum::Lds &&
@@ -376,14 +380,19 @@ struct DynamicBuffer
     __host__ __device__ static constexpr bool IsDynamicBuffer() { return true; }
 };
 
-template <AddressSpaceEnum BufferAddressSpace, typename T, typename ElementSpaceSize>
+template <AddressSpaceEnum BufferAddressSpace,
+          AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence,
+          typename T,
+          typename ElementSpaceSize>
 __host__ __device__ constexpr auto make_dynamic_buffer(T* p, ElementSpaceSize element_space_size)
 {
-    return DynamicBuffer<BufferAddressSpace, T, ElementSpaceSize, true>{p, element_space_size};
+    return DynamicBuffer<BufferAddressSpace, T, ElementSpaceSize, true, coherence>{
+        p, element_space_size};
 }
 
 template <
     AddressSpaceEnum BufferAddressSpace,
+    AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence,
     typename T,
     typename ElementSpaceSize,
     typename X,
@@ -391,7 +400,7 @@ template <
 __host__ __device__ constexpr auto
 make_dynamic_buffer(T* p, ElementSpaceSize element_space_size, X invalid_element_value)
 {
-    return DynamicBuffer<BufferAddressSpace, T, ElementSpaceSize, false>{
+    return DynamicBuffer<BufferAddressSpace, T, ElementSpaceSize, false, coherence>{
         p, element_space_size, invalid_element_value};
 }
 
-- 
GitLab


From 0ede66de54477a2c723e314f1ae934f8b51c40f8 Mon Sep 17 00:00:00 2001
From: ltqin <letao.qin@amd.com>
Date: Mon, 12 Jun 2023 21:35:31 +0800
Subject: [PATCH 54/71] Fix flash attn mask bug (#733)

* add input parameter check

* add instance for vector load = 1

* move general instance to first position

* fix read bias code

* regular code for bias load

---------

Co-authored-by: zjing14 <zhangjing14@gmail.com>
---
 .../gemm_bias_softmax_gemm_permute.cpp        |   3 +-
 ...gemm_softmax_gemm_permute_xdl_cshuffle.hpp |  26 ++++-
 ...ultiple_d_softmax_gemm_xdl_cshuffle_v1.hpp | 110 +++++++-----------
 ...f16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp |  14 ++-
 ...6_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp |  10 +-
 ...f16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp |   4 +-
 ...6_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp |   4 +-
 7 files changed, 89 insertions(+), 82 deletions(-)

diff --git a/example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute.cpp b/example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute.cpp
index cfb42c6e1..a90a6340a 100644
--- a/example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute.cpp
+++ b/example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute.cpp
@@ -121,7 +121,8 @@ using DeviceOpInstance =
         2,              // CShuffleNXdlPerWavePerShuffle
         S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
         8,              // CShuffleBlockTransferScalarPerVector_NPerBlock
-        MaskingSpec>;   // MaskingSpecialization
+        MaskingSpec,    // MaskingSpecialization
+        1>;
 
 // Ref Gemm0: fp16 in, fp32 out
 using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
index e54c013cf..3fad319e9 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
@@ -197,7 +197,8 @@ template <index_t NumDimG,
           typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
           index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
           MaskingSpecialization MaskingSpec,
-          LoopScheduler LoopSched = LoopScheduler::Default>
+          int D0sTransferSrcScalarPerVector = 4,
+          LoopScheduler LoopSched           = LoopScheduler::Default>
 struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
     : public DeviceBatchedGemmSoftmaxGemmPermute<NumDimG,
                                                  NumDimM,
@@ -438,7 +439,8 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
         CShuffleBlockTransferScalarPerVector_NPerBlock,
         LoopSched,
         Transform::matrix_padder.PadN,
-        MaskingSpec == MaskingSpecialization::MaskOutUpperTriangle>;
+        MaskingSpec == MaskingSpecialization::MaskOutUpperTriangle,
+        D0sTransferSrcScalarPerVector>;
 
     // Argument
     // FIXME: constness
@@ -530,6 +532,11 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
                 using D0DataType = remove_cvref_t<tuple_element_t<i.value, D0sDataType>>;
                 // D0 pointer
                 p_d0s_grid_(i) = static_cast<const D0DataType*>(p_acc0_biases[i]);
+                // for  check
+                d0s_nl_ns_lengths_strides_[i].push_back(
+                    acc0_biases_gs_ms_ns_lengths[i][NumDimG + NumDimM]);
+                d0s_nl_ns_lengths_strides_[i].push_back(
+                    acc0_biases_gs_ms_ns_strides[i][NumDimG + NumDimM]);
             });
 
             if(GridwiseGemm::CheckValidity(a_grid_desc_ak0_m_ak1_,
@@ -608,6 +615,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
         std::vector<index_t> b_nz_kz_strides_;
         std::vector<index_t> b1_nz_kz_strides_;
         std::vector<index_t> c_mz_gemm1nz_strides_;
+        std::array<std::vector<ck::index_t>, NumD0Tensor> d0s_nl_ns_lengths_strides_;
 
         index_t batch_count_;
         ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch_;
@@ -772,6 +780,20 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
         {
             return false;
         }
+        for(int i = 0; i < NumD0Tensor; i++)
+        {
+            if(arg.d0s_nl_ns_lengths_strides_[i][1] == 1 &&
+               arg.d0s_nl_ns_lengths_strides_[i][0] % D0sTransferSrcScalarPerVector != 0)
+            {
+                std::cout << "first" << std::endl;
+                return false;
+            }
+            if(arg.d0s_nl_ns_lengths_strides_[i][1] != 1 && D0sTransferSrcScalarPerVector != 1)
+            {
+                std::cout << "second" << std::endl;
+                return false;
+            }
+        }
 
         return GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
                                            arg.b_grid_desc_bk0_n_bk1_,
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_softmax_gemm_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_softmax_gemm_xdl_cshuffle_v1.hpp
index 74171ea9d..135b9da6a 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_softmax_gemm_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_softmax_gemm_xdl_cshuffle_v1.hpp
@@ -80,7 +80,8 @@ template <typename FloatAB,
           LoopScheduler LoopSched,
           bool PadN,
           bool MaskOutUpperTriangle,
-          PipelineVersion PipelineVer = PipelineVersion::v1>
+          int D0sTransferSrcScalarPerVector = 4,
+          PipelineVersion PipelineVer       = PipelineVersion::v1>
 struct GridwiseBatchedGemmMultipleDSoftmaxGemm_Xdl_CShuffle
 {
     static_assert(LoopSched == LoopScheduler::Default,
@@ -621,13 +622,13 @@ struct GridwiseBatchedGemmMultipleDSoftmaxGemm_Xdl_CShuffle
         constexpr auto d0_thread_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5 =
             make_naive_tensor_descriptor_packed(make_tuple(I1,   // MBlockId
                                                            I1,   // NBlockID
-                                                           I1,   // MRepeat
-                                                           I1,   // NRepeat
-                                                           I1,   // MWaveId
-                                                           I1,   // NWaveId
-                                                           I1,   // MPerXdl
-                                                           I1,   // NGroupNum
-                                                           I1,   // NInputNum
+                                                           m0,   // MRepeat
+                                                           n0,   // NRepeat
+                                                           m1,   // MWaveId
+                                                           n1,   // NWaveId
+                                                           m2,   // MPerXdl
+                                                           n2,   // NGroupNum
+                                                           n3,   // NInputNum
                                                            n4)); // registerNum
 
         auto d0s_thread_buf = generate_tuple(
@@ -644,9 +645,6 @@ struct GridwiseBatchedGemmMultipleDSoftmaxGemm_Xdl_CShuffle
         const auto wave_id     = GetGemm0WaveIdx();
         const auto wave_m_n_id = GetGemm0WaveMNIdx(wave_id[I2]); // I2: 0~63
 
-        constexpr auto acc0_thread_desc = make_naive_tensor_descriptor_packed(
-            make_tuple(Number<MXdlPerWave>{}, Number<NXdlPerWave>{}, n2, n4));
-
         auto d0s_threadwise_copy = generate_tuple(
             [&](auto i) {
                 using D0DataType = remove_cvref_t<tuple_element_t<i.value, D0sDataType>>;
@@ -655,10 +653,19 @@ struct GridwiseBatchedGemmMultipleDSoftmaxGemm_Xdl_CShuffle
                     D0DataType,
                     decltype(d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5[i]),
                     decltype(d0_thread_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5),
-                    Sequence<I1, I1, I1, I1, I1, I1, I1, I1, I1, n4>,
+                    Sequence<I1, // MBlockId
+                             I1, // NBlockID
+                             m0, // MRepeat
+                             n0, // NRepeat
+                             m1, // MWaveId
+                             n1, // NWaveId
+                             m2, // MPerXdl
+                             n2, // NGroupNum
+                             n3, // NInputNum
+                             n4>,
                     Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9>,
                     9,
-                    n4,
+                    D0sTransferSrcScalarPerVector,
                     1,
                     false>(d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5[i],
                            make_multi_index(block_work_idx[I0], // MBlockId
@@ -884,62 +891,35 @@ struct GridwiseBatchedGemmMultipleDSoftmaxGemm_Xdl_CShuffle
             // multiple d
             if constexpr(NumD0Tensor)
             {
-                static_for<0, MXdlPerWave, 1>{}([&](auto mr) {
-                    static_for<0, NXdlPerWave, 1>{}([&](auto nr) {
-                        static_for<0, n2, 1>{}([&](auto groupid) {
-                            static_for<0, NumD0Tensor, 1>{}([&](auto i) {
-                                d0s_threadwise_copy(i).Run(
-                                    d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5[i],
-                                    d0s_grid_buf[i],
-                                    d0_thread_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5,
-                                    make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
-                                    d0s_thread_buf(i));
-                            });
-
-                            static_for<0, n4, 1>{}([&](auto i) {
-                                constexpr index_t c_offset = acc0_thread_desc.CalculateOffset(
-                                    make_tuple(mr, nr, groupid, i));
-
-                                // get reference to src data
-                                const auto src_data_refs = generate_tie(
-                                    // return type should be lvalue
-                                    [&](auto iSrc) -> const auto& {
-                                        return d0s_thread_buf[iSrc][i];
-                                    },
-                                    Number<NumD0Tensor>{});
-
-                                // get reference to dst data
-                                auto dst_data_refs = generate_tie(
-                                    // return type should be lvalue
-                                    [&](auto) -> auto& {
-                                        return acc_thread_buf(Number<c_offset>{});
-                                    },
-                                    Number<2>{});
-
-                                unpack2(c0de_element_op, dst_data_refs, src_data_refs);
-                            });
-                            static_for<0, NumD0Tensor, 1>{}([&](auto i) {
-                                d0s_threadwise_copy(i).MoveSrcSliceWindow(
-                                    d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5[i],
-                                    make_multi_index(0, 0, 0, 0, 0, 0, 0, 1, 0, 0));
-                            });
-                        });
-                        static_for<0, NumD0Tensor, 1>{}([&](auto i) {
-                            d0s_threadwise_copy(i).MoveSrcSliceWindow(
-                                d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5[i],
-                                make_multi_index(0, 0, 0, 1, 0, 0, 0, -n2.value, 0, 0));
-                        });
-                    });
-                    static_for<0, NumD0Tensor, 1>{}([&](auto i) {
-                        d0s_threadwise_copy(i).MoveSrcSliceWindow(
-                            d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5[i],
-                            make_multi_index(0, 0, 1, -NXdlPerWave, 0, 0, 0, 0, 0, 0));
-                    });
+                static_assert(NXdlPerWave == n0);
+                static_assert(MXdlPerWave == m0);
+
+                static_for<0, NumD0Tensor, 1>{}([&](auto i) {
+                    d0s_threadwise_copy(i).Run(d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5[i],
+                                               d0s_grid_buf[i],
+                                               d0_thread_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5,
+                                               make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
+                                               d0s_thread_buf(i));
+                });
+                static_for<0, m0 * n0 * n2 * n4, 1>{}([&](auto i) {
+                    // get reference to src data
+                    const auto src_data_refs = generate_tie(
+                        // return type should be lvalue
+                        [&](auto iSrc) -> const auto& { return d0s_thread_buf[iSrc][i]; },
+                        Number<NumD0Tensor>{});
+
+                    // get reference to dst data
+                    auto dst_data_refs = generate_tie(
+                        // return type should be lvalue
+                        [&](auto) -> auto& { return acc_thread_buf(i); },
+                        Number<2>{});
+
+                    unpack2(c0de_element_op, dst_data_refs, src_data_refs);
                 });
                 static_for<0, NumD0Tensor, 1>{}([&](auto i) {
                     d0s_threadwise_copy(i).MoveSrcSliceWindow(
                         d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5[i],
-                        make_multi_index(0, 1, -MXdlPerWave, 0, 0, 0, 0, 0, 0, 0));
+                        make_multi_index(0, 1, 0, 0, 0, 0, 0, 0, 0, 0));
                 });
             }
             else
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp
index a541a2d22..498bf58fb 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp
@@ -41,10 +41,11 @@ template <index_t NumDimG,
 using device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instances =
     std::tuple<
         // clang-format off
-        // #############################################|  NumDimG| NumDimM| NumDimN| NumDimK| NumDimO|  AData|  B0Data|  B1Data|  CData|     Acc0BiasData| Acc1BiasData| AccData| CShuffle|           A|          B0|        Acc0|          B1|           C|           GEMM|   ATensorSpec|  B0TensorSpec|  B1TensorSpec|   CTensorSpec| NumGemmK| Block| Gemm01| Gemm0| Gemm0| Gemm1| Gemm1| AK1| BK1| B1K1| MPer| NPer| Gemm0| Gemm0| Gemm1|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockLds|  B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer| MaskingSpec|
-        // #############################################|         |        |        |        |        |   Type|    Type|    Type|   Type|             Type|         Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Specialization|              |              |              |              | Prefetch|  Size|   MPer|  NPer|  KPer|  NPer|  KPer|    |    |     |  XDL|  XDL|  MXdl|  NXdl|  NXdl|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|            |
-        // #############################################|         |        |        |        |        |       |        |        |       |                 |             |        |         |   Operation|   Operation|   Operation|   Operation|   Operation|               |              |              |              |              |    Stage|      |  Block| Block| Block| Block| Block|    |    |     |     |     |   Per|   Per|   Per| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|            |
-        // #############################################|         |        |        |        |        |       |        |        |       |                 |             |        |         |            |            |            |            |            |               |              |              |              |              |         |      |       |      |      |      |      |    |    |     |     |     |  Wave|  Wave|  Wave|                |               |               |               |               |               |          |                 |                |                |                |                |                |           |                 |                |                |                |                |                |           |            |            |                             |                |            |
+        // #############################################|  NumDimG| NumDimM| NumDimN| NumDimK| NumDimO|  AData|  B0Data|  B1Data|  CData|     Acc0BiasData| Acc1BiasData| AccData| CShuffle|           A|          B0|        Acc0|          B1|           C|           GEMM|   ATensorSpec|  B0TensorSpec|  B1TensorSpec|   CTensorSpec| NumGemmK| Block| Gemm01| Gemm0| Gemm0| Gemm1| Gemm1| AK1| BK1| B1K1| MPer| NPer| Gemm0| Gemm0| Gemm1|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockLds|  B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer| MaskingSpec|  D0s Bias|
+        // #############################################|         |        |        |        |        |   Type|    Type|    Type|   Type|             Type|         Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Specialization|              |              |              |              | Prefetch|  Size|   MPer|  NPer|  KPer|  NPer|  KPer|    |    |     |  XDL|  XDL|  MXdl|  NXdl|  NXdl|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|            | SrcScalar|
+        // #############################################|         |        |        |        |        |       |        |        |       |                 |             |        |         |   Operation|   Operation|   Operation|   Operation|   Operation|               |              |              |              |              |    Stage|      |  Block| Block| Block| Block| Block|    |    |     |     |     |   Per|   Per|   Per| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|            | PerVector|
+        // #############################################|         |        |        |        |        |       |        |        |       |                 |             |        |         |            |            |            |            |            |               |              |              |              |              |         |      |       |      |      |      |      |    |    |     |     |     |  Wave|  Wave|  Wave|                |               |               |               |               |               |          |                 |                |                |                |                |                |           |                 |                |                |                |                |                |           |            |            |                             |                |            |          |
+        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<BF16>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,     GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,    64,    32,   128,    32,   8,   8,    2,   32,   32,     1,     2,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec,        1>,
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<BF16>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    256,   128,    32,    64,    32,   8,   8,    2,   32,   32,     2,     4,     2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<BF16>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    256,   128,    32,   128,    32,   8,   8,    2,   32,   32,     2,     4,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<BF16>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,   256,    32,    64,    32,   8,   8,    2,   32,   32,     1,     8,     2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
@@ -58,8 +59,9 @@ using device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<BF16>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,     64,   256,    64,   128,    32,   8,   8,    2,   16,   16,     1,    16,     8,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           8,               S<1, 16, 1,16>,               8, MaskingSpec>,
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<BF16>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,     64,   256,    64,    64,    32,   8,   8,    2,   16,   16,     1,    16,     4,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           4,               S<1, 32, 1, 8>,               8, MaskingSpec>,
         // Padded fallback kernel
-        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<BF16>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,    GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,   128,    64,   128,    32,   8,   8,    2,   32,   32,     1,     4,     4,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
-        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<BF16>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,    GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,    64,    32,   128,    32,   8,   8,    2,   32,   32,     1,     2,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>
+        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<BF16>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,     GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,   128,    64,   128,    32,   8,   8,    2,   32,   32,     1,     4,     4,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec,        1>,  
+        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<BF16>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,     GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,   128,    64,   128,    32,   8,   8,    2,   32,   32,     1,     4,     4,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
+        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<BF16>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,     GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,    64,    32,   128,    32,   8,   8,    2,   32,   32,     1,     2,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>
         // clang-format on
         >;
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
index 5e481603c..744bd6456 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
@@ -41,10 +41,11 @@ template <index_t NumDimG,
 using device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instances =
     std::tuple<
         // clang-format off
-        // #############################################|  NumDimG| NumDimM| NumDimN| NumDimK| NumDimO| AData| B0Data| B1Data| CData|    Acc0BiasData| Acc1BiasData| AccData| CShuffle|           A|          B0|        Acc0|          B1|           C|           GEMM|   ATensorSpec|  B0TensorSpec|  B1TensorSpec|   CTensorSpec| NumGemmK| Block| Gemm01| Gemm0| Gemm0| Gemm1| Gemm1| AK1| BK1| B1K1| MPer| NPer| Gemm0| Gemm0| Gemm1|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockLds|  B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer| MaskingSpec|
-        // #############################################|         |        |        |        |        |  Type|   Type|   Type|  Type|            Type|         Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Specialization|              |              |              |              | Prefetch|  Size|   MPer|  NPer|  KPer|  NPer|  KPer|    |    |     |  XDL|  XDL|  MXdl|  NXdl|  NXdl|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|            |
-        // #############################################|         |        |        |        |        |      |       |       |      |                |             |        |         |   Operation|   Operation|   Operation|   Operation|   Operation|               |              |              |              |              |    Stage|      |  Block| Block| Block| Block| Block|    |    |     |     |     |   Per|   Per|   Per| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|            |
-        // #############################################|         |        |        |        |        |      |       |       |      |                |             |        |         |            |            |            |            |            |               |              |              |              |              |         |      |       |      |      |      |      |    |    |     |     |     |  Wave|  Wave|  Wave|                |               |               |               |               |               |          |                 |                |                |                |                |                |           |                 |                |                |                |                |                |           |            |            |                             |                |            |
+        // #############################################|  NumDimG| NumDimM| NumDimN| NumDimK| NumDimO| AData| B0Data| B1Data| CData|    Acc0BiasData| Acc1BiasData| AccData| CShuffle|           A|          B0|        Acc0|          B1|           C|           GEMM|   ATensorSpec|  B0TensorSpec|  B1TensorSpec|   CTensorSpec| NumGemmK| Block| Gemm01| Gemm0| Gemm0| Gemm1| Gemm1| AK1| BK1| B1K1| MPer| NPer| Gemm0| Gemm0| Gemm1|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockLds|  B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer| MaskingSpec|  D0s Bias|
+        // #############################################|         |        |        |        |        |  Type|   Type|   Type|  Type|            Type|         Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Specialization|              |              |              |              | Prefetch|  Size|   MPer|  NPer|  KPer|  NPer|  KPer|    |    |     |  XDL|  XDL|  MXdl|  NXdl|  NXdl|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|            | SrcScalar|
+        // #############################################|         |        |        |        |        |      |       |       |      |                |             |        |         |   Operation|   Operation|   Operation|   Operation|   Operation|               |              |              |              |              |    Stage|      |  Block| Block| Block| Block| Block|    |    |     |     |     |   Per|   Per|   Per| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|            | PerVector|
+        // #############################################|         |        |        |        |        |      |       |       |      |                |             |        |         |            |            |            |            |            |               |              |              |              |              |         |      |       |      |      |      |      |    |    |     |     |     |  Wave|  Wave|  Wave|                |               |               |               |               |               |          |                 |                |                |                |                |                |           |                 |                |                |                |                |                |           |            |            |                             |                |            |          |
+        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<F16>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,     GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,    64,    32,   128,    32,   8,   8,    2,   32,   32,     1,     2,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec,       1>,
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<F16>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    256,   128,    32,    64,    32,   8,   8,    2,   32,   32,     2,     4,     2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<F16>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    256,   128,    32,   128,    32,   8,   8,    2,   32,   32,     2,     4,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
  #if CK_WORKAROUND_SWDEV_388832
@@ -60,6 +61,7 @@ using device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<F16>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,     64,   256,    64,   128,    32,   8,   8,    2,   16,   16,     1,    16,     8,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           8,               S<1, 16, 1,16>,               8, MaskingSpec>,
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<F16>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,     64,   256,    64,    64,    32,   8,   8,    2,   16,   16,     1,    16,     4,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           4,               S<1, 32, 1, 8>,               8, MaskingSpec>,
         // Padded fallback kernel
+        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<F16>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,     GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,   128,    64,   128,    32,   8,   8,    2,   32,   32,     1,     4,     4,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec,       1>,
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<F16>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,     GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,   128,    64,   128,    32,   8,   8,    2,   32,   32,     1,     4,     4,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<F16>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,     GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,    64,    32,   128,    32,   8,   8,    2,   32,   32,     1,     2,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>
         // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp
index 501ea85f6..b342612d1 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp
@@ -45,6 +45,7 @@ using device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_
         // #############################################|         |        |        |        |        |   Type|    Type|    Type|   Type|         Type|         Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Specialization|              |              |              |              | Prefetch|  Size|   MPer|  NPer|  KPer|  NPer|  KPer|    |    |     |  XDL|  XDL|  MXdl|  NXdl|  NXdl|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|            |
         // #############################################|         |        |        |        |        |       |        |        |       |             |             |        |         |   Operation|   Operation|   Operation|   Operation|   Operation|               |              |              |              |              |    Stage|      |  Block| Block| Block| Block| Block|    |    |     |     |     |   Per|   Per|   Per| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|            |
         // #############################################|         |        |        |        |        |       |        |        |       |             |             |        |         |            |            |            |            |            |               |              |              |              |              |         |      |       |      |      |      |      |    |    |     |     |     |  Wave|  Wave|  Wave|                |               |               |               |               |               |          |                 |                |                |                |                |                |           |                 |                |                |                |                |                |           |            |            |                             |                |            |
+        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,     GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,    64,    32,   128,    32,   8,   8,    2,   32,   32,     1,     2,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    256,   128,    32,    64,    32,   8,   8,    2,   32,   32,     2,     4,     2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    256,   128,    32,   128,    32,   8,   8,    2,   32,   32,     2,     4,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,   256,    32,    64,    32,   8,   8,    2,   32,   32,     1,     8,     2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
@@ -58,8 +59,7 @@ using device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,     64,   256,    64,   128,    32,   8,   8,    2,   16,   16,     1,    16,     8,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           8,               S<1, 16, 1,16>,               8, MaskingSpec>,
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,     64,   256,    64,    64,    32,   8,   8,    2,   16,   16,     1,    16,     4,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           4,               S<1, 32, 1, 8>,               8, MaskingSpec>,
         // Padded fallback kernel
-        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,     GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,   128,    64,   128,    32,   8,   8,    2,   32,   32,     1,     4,     4,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
-        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,     GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,    64,    32,   128,    32,   8,   8,    2,   32,   32,     1,     2,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>
+        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,     GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,   128,    64,   128,    32,   8,   8,    2,   32,   32,     1,     4,     4,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>
         // clang-format on
         >;
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
index 91ab541bf..3fd0c0737 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
@@ -45,6 +45,7 @@ using device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_
         // #############################################|         |        |        |        |        |  Type|   Type|   Type|  Type|         Type|         Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Specialization|              |              |              |              | Prefetch|  Size|   MPer|  NPer|  KPer|  NPer|  KPer|    |    |     |  XDL|  XDL|  MXdl|  NXdl|  NXdl|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|            |
         // #############################################|         |        |        |        |        |      |       |       |      |             |             |        |         |   Operation|   Operation|   Operation|   Operation|   Operation|               |              |              |              |              |    Stage|      |  Block| Block| Block| Block| Block|    |    |     |     |     |   Per|   Per|   Per| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|            |
         // #############################################|         |        |        |        |        |      |       |       |      |             |             |        |         |            |            |            |            |            |               |              |              |              |              |         |      |       |      |      |      |      |    |    |     |     |     |  Wave|  Wave|  Wave|                |               |               |               |               |               |          |                 |                |                |                |                |                |           |                 |                |                |                |                |                |           |            |            |                             |                |            |
+        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,     GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,    64,    32,   128,    32,   8,   8,    2,   32,   32,     1,     2,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    256,   128,    32,    64,    32,   8,   8,    2,   32,   32,     2,     4,     2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    256,   128,    32,   128,    32,   8,   8,    2,   32,   32,     2,     4,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
  #if CK_WORKAROUND_SWDEV_388832
@@ -60,8 +61,7 @@ using device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,     64,   256,    64,   128,    32,   8,   8,    2,   16,   16,     1,    16,     8,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           8,               S<1, 16, 1,16>,               8, MaskingSpec>,
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,     64,   256,    64,    64,    32,   8,   8,    2,   16,   16,     1,    16,     4,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           4,               S<1, 32, 1, 8>,               8, MaskingSpec>,
         // Padded fallback kernel
-        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,     GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,   128,    64,   128,    32,   8,   8,    2,   32,   32,     1,     4,     4,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
-        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,     GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,    64,    32,   128,    32,   8,   8,    2,   32,   32,     1,     2,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>
+        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,     GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,   128,    64,   128,    32,   8,   8,    2,   32,   32,     1,     4,     4,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>
         // clang-format on
         >;
 
-- 
GitLab


From 7c24654c248bbcc003062302a64c3f08d8422906 Mon Sep 17 00:00:00 2001
From: Po Yen Chen <PoYen.Chen@amd.com>
Date: Mon, 12 Jun 2023 21:36:40 +0800
Subject: [PATCH 55/71] Fix incomplete object size (=4n + 3) support of
 amd_wave_read_first_lane() (#738)

* Fix wrong pointer type

* Rename type trait get_unsigned_int<> to get_carrier<>

* Add 3-bytes carrier type

* Add missing __device__ specifier

* Rename template non-type parameter

* Leave the rest byte uninitialized

* Avoid invoking (host) STL algorithms

* Remove unnecessary 'inline' specifier

* Extract common logic out as helper method

* Hide dummy member function

* Add missing __device__ specifier
---
 .../ck/utility/amd_wave_read_first_lane.hpp   | 75 ++++++++++++++++---
 1 file changed, 65 insertions(+), 10 deletions(-)

diff --git a/include/ck/utility/amd_wave_read_first_lane.hpp b/include/ck/utility/amd_wave_read_first_lane.hpp
index 4652ce7a7..741b2975a 100644
--- a/include/ck/utility/amd_wave_read_first_lane.hpp
+++ b/include/ck/utility/amd_wave_read_first_lane.hpp
@@ -7,6 +7,7 @@
 #include "ck/utility/functional2.hpp"
 #include "ck/utility/math.hpp"
 
+#include <array>
 #include <cstddef>
 #include <cstdint>
 #include <type_traits>
@@ -14,29 +15,83 @@
 namespace ck {
 namespace detail {
 
-template <unsigned Size>
-struct get_unsigned_int;
+template <unsigned SizeInBytes>
+struct get_carrier;
 
 template <>
-struct get_unsigned_int<1>
+struct get_carrier<1>
 {
     using type = uint8_t;
 };
 
 template <>
-struct get_unsigned_int<2>
+struct get_carrier<2>
 {
     using type = uint16_t;
 };
 
 template <>
-struct get_unsigned_int<4>
+struct get_carrier<3>
+{
+    using type = class carrier
+    {
+        using value_type = uint32_t;
+
+        std::array<std::byte, 3> bytes;
+        static_assert(sizeof(bytes) <= sizeof(value_type));
+
+        // replacement of host std::copy_n()
+        template <typename InputIterator, typename Size, typename OutputIterator>
+        __device__ static OutputIterator copy_n(InputIterator from, Size size, OutputIterator to)
+        {
+            if(0 < size)
+            {
+                *to = *from;
+                ++to;
+                for(Size count = 1; count < size; ++count)
+                {
+                    *to = *++from;
+                    ++to;
+                }
+            }
+
+            return to;
+        }
+
+        // method to trigger template substitution failure
+        __device__ carrier(const carrier& other) noexcept
+        {
+            copy_n(other.bytes.begin(), bytes.size(), bytes.begin());
+        }
+
+        public:
+        __device__ carrier& operator=(value_type value) noexcept
+        {
+            copy_n(reinterpret_cast<const std::byte*>(&value), bytes.size(), bytes.begin());
+
+            return *this;
+        }
+
+        __device__ operator value_type() const noexcept
+        {
+            std::byte result[sizeof(value_type)];
+
+            copy_n(bytes.begin(), bytes.size(), result);
+
+            return *reinterpret_cast<const value_type*>(result);
+        }
+    };
+};
+static_assert(sizeof(get_carrier<3>::type) == 3);
+
+template <>
+struct get_carrier<4>
 {
     using type = uint32_t;
 };
 
-template <unsigned Size>
-using get_unsigned_int_t = typename get_unsigned_int<Size>::type;
+template <unsigned SizeInBytes>
+using get_carrier_t = typename get_carrier<SizeInBytes>::type;
 
 } // namespace detail
 
@@ -61,7 +116,7 @@ __device__ auto amd_wave_read_first_lane(const Object& obj)
     constexpr Size CompleteSgprCopyBoundary = ObjectSize - RemainedSize;
     for(Size offset = 0; offset < CompleteSgprCopyBoundary; offset += SgprSize)
     {
-        using Sgpr = detail::get_unsigned_int_t<SgprSize>;
+        using Sgpr = detail::get_carrier_t<SgprSize>;
 
         *reinterpret_cast<Sgpr*>(to_obj + offset) =
             amd_wave_read_first_lane(*reinterpret_cast<const Sgpr*>(from_obj + offset));
@@ -69,9 +124,9 @@ __device__ auto amd_wave_read_first_lane(const Object& obj)
 
     if constexpr(0 < RemainedSize)
     {
-        using Carrier = detail::get_unsigned_int_t<RemainedSize>;
+        using Carrier = detail::get_carrier_t<RemainedSize>;
 
-        *reinterpret_cast<Carrier>(to_obj + CompleteSgprCopyBoundary) = amd_wave_read_first_lane(
+        *reinterpret_cast<Carrier*>(to_obj + CompleteSgprCopyBoundary) = amd_wave_read_first_lane(
             *reinterpret_cast<const Carrier*>(from_obj + CompleteSgprCopyBoundary));
     }
 
-- 
GitLab


From fc9f97568ffcecf36c3f59f4b0d6680720ddd099 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= <barkocot@amd.com>
Date: Mon, 12 Jun 2023 15:37:15 +0200
Subject: [PATCH 56/71] Add DeviceBatchedGemmMultipleD_Dl (#732)

* Add DeviceBatchedGemmMultipleD_Dl

* Fix batched_gemm tests

* Fix comments

* test_batched_gemm_multi_d fixes

* Fix args for isSupported batchedGemmMultipleDDl

* Disable tests for gfx90a
---
 .../device_batched_gemm_multiple_d_dl.hpp     | 796 ++++++++++++++++++
 .../gpu/batched_gemm_multi_d.hpp              | 337 ++++++++
 .../gpu/batched_gemm_multi_d/CMakeLists.txt   |  18 +
 ..._d_dl_f16_f16_f16_gkm_gkn_gmn_instance.cpp |  95 +++
 ...f16_f16_gkm_gkn_gmn_irregular_instance.cpp |  84 ++
 ..._d_dl_f16_f16_f16_gkm_gnk_gmn_instance.cpp |  95 +++
 ...f16_f16_gkm_gnk_gmn_irregular_instance.cpp |  83 ++
 ..._d_dl_f16_f16_f16_gmk_gkn_gmn_instance.cpp |  95 +++
 ...f16_f16_gmk_gkn_gmn_irregular_instance.cpp |  83 ++
 ..._d_dl_f16_f16_f16_gmk_gnk_gmn_instance.cpp |  95 +++
 ...f16_f16_gmk_gnk_gmn_irregular_instance.cpp |  83 ++
 ...lti_d_dl_i8_i8_i8_gkm_gkn_gmn_instance.cpp |  93 ++
 ...8_i8_i8_gkm_gkn_gmn_irregular_instance.cpp |  90 ++
 ...lti_d_dl_i8_i8_i8_gkm_gnk_gmn_instance.cpp |  93 ++
 ...8_i8_i8_gkm_gnk_gmn_irregular_instance.cpp |  90 ++
 ...lti_d_dl_i8_i8_i8_gmk_gkn_gmn_instance.cpp |  93 ++
 ...8_i8_i8_gmk_gkn_gmn_irregular_instance.cpp |  90 ++
 ...lti_d_dl_i8_i8_i8_gmk_gnk_gmn_instance.cpp |  93 ++
 ...8_i8_i8_gmk_gnk_gmn_irregular_instance.cpp |  90 ++
 profiler/README.md                            |  27 +
 .../profiler/profile_batched_gemm_impl.hpp    |  95 ++-
 profiler/src/CMakeLists.txt                   |   3 +-
 profiler/src/profile_batched_gemm.cpp         | 117 +--
 profiler/src/profile_batched_gemm_multi_d.cpp | 190 +++++
 test/CMakeLists.txt                           |   1 +
 test/batched_gemm/batched_gemm_bf16.cpp       |  94 ++-
 test/batched_gemm/batched_gemm_fp16.cpp       |  94 ++-
 test/batched_gemm/batched_gemm_fp32.cpp       |  94 ++-
 test/batched_gemm/batched_gemm_int8.cpp       |  94 ++-
 test/batched_gemm_multi_d/CMakeLists.txt      |   5 +
 .../test_batched_gemm_multi_d.cpp             |  74 ++
 31 files changed, 3354 insertions(+), 130 deletions(-)
 create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp
 create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp
 create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/CMakeLists.txt
 create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_irregular_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_irregular_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_irregular_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_irregular_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_irregular_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_irregular_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_irregular_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_irregular_instance.cpp
 create mode 100644 profiler/src/profile_batched_gemm_multi_d.cpp
 create mode 100644 test/batched_gemm_multi_d/CMakeLists.txt
 create mode 100644 test/batched_gemm_multi_d/test_batched_gemm_multi_d.cpp

diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp
new file mode 100644
index 000000000..0df365ceb
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp
@@ -0,0 +1,796 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_dl_multiple_d.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+/*
+ * \tparam ComputePtrOffsetOfBatch Class that computes the base pointer offsets of A, B, C matrix
+ * given the batch. For example, ComputePtrOffsetOfStridedBatch() computes the offsets of evenly
+ * strided batches, but we can easily extend to other layouts. The returned offset can be either \p
+ * index_t or \p long_index_t. If it returns \p long_index_t, we are not subject to the 2GB
+ * limitations.
+ *
+ * \note Using \p ComputePtrOffsetOfBatch gives us the flexibility that 2 workgroups can compute 2
+ * tiles from different matrices. Keep in mind that these 2 matrices can share the same grid
+ * descriptor (like in BatchedGEMM), or use their own grid descriptors (in GroupedGemm). \link
+ * impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp kernel_gemm_xdlops_v2r3_for_conv3d \endlink for
+ * \link DeviceConv3d \endlink uses the same concept, but currently does NOT encapsulate the
+ * computing of pointer offset into \p ComputePtrOffsetOfStridedBatch.
+ */
+
+template <typename GridwiseGemm,
+          typename ABDataType,
+          typename DsPointer,
+          typename EDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation,
+          typename AGridDesc_K0_M0_M1_K1,
+          typename BGridDesc_K0_N0_N1_K1,
+          typename DsGridDesc_M0_M10_M11_N0_N10_N11,
+          typename CGridDesc_M0_M10_M11_N0_N10_N11,
+          typename ComputePtrOffsetOfBatch,
+          typename Block2CTileMap,
+          bool HasMainKBlockLoop,
+          bool HasDoubleTailKBlockLoop>
+__global__ void
+#if CK_USE_LAUNCH_BOUNDS
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+#endif
+        kernel_gemm_dl_multiple_d(
+            const ABDataType* __restrict__ p_a_grid,
+            const ABDataType* __restrict__ p_b_grid,
+            DsPointer p_ds_grid,
+            EDataType* __restrict__ p_e_grid,
+            const index_t batch_count,
+            const AElementwiseOperation a_element_op,
+            const BElementwiseOperation b_element_op,
+            const CDEElementwiseOperation cde_element_op,
+            const AGridDesc_K0_M0_M1_K1 a_grid_desc_k0_m0_m1_k1,
+            const BGridDesc_K0_N0_N1_K1 b_grid_desc_k0_n0_n1_k1,
+            const DsGridDesc_M0_M10_M11_N0_N10_N11 ds_grid_desc_m0_m10_m11_n0_n10_n11,
+            const CGridDesc_M0_M10_M11_N0_N10_N11 e_grid_desc_m0_m10_m11_n0_n10_n11,
+            const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
+            const Block2CTileMap block_2_ctile_map)
+{
+// TODO: Enable for gfx90a after compiler fix
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) ||              \
+    defined(__gfx940__) || defined(__gfx1030__) || defined(__gfx1100__) || defined(__gfx1101__) || \
+    defined(__gfx1102__))
+
+    const index_t num_blocks_per_batch =
+        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
+    const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
+
+    const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
+    const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
+    const long_index_t e_batch_offset = __builtin_amdgcn_readfirstlane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
+
+    const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
+
+    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+
+    DsPointer p_ds_grid_grp;
+
+    static constexpr index_t NumDTensor = DsGridDesc_M0_M10_M11_N0_N10_N11::Size();
+
+    static_for<0, NumDTensor, 1>{}(
+        [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; });
+
+    GridwiseGemm::Run(p_a_grid + a_batch_offset,
+                      p_b_grid + b_batch_offset,
+                      p_ds_grid_grp,
+                      p_e_grid + e_batch_offset,
+                      p_shared,
+                      a_element_op,
+                      b_element_op,
+                      cde_element_op,
+                      a_grid_desc_k0_m0_m1_k1,
+                      b_grid_desc_k0_n0_n1_k1,
+                      ds_grid_desc_m0_m10_m11_n0_n10_n11,
+                      e_grid_desc_m0_m10_m11_n0_n10_n11,
+                      block_2_ctile_map,
+                      integral_constant<bool, HasMainKBlockLoop>{},
+                      integral_constant<bool, HasDoubleTailKBlockLoop>{});
+#else
+    ignore = p_a_grid;
+    ignore = p_b_grid;
+    ignore = p_ds_grid;
+    ignore = p_e_grid;
+    ignore = batch_count;
+    ignore = a_element_op;
+    ignore = b_element_op;
+    ignore = cde_element_op;
+    ignore = a_grid_desc_k0_m0_m1_k1;
+    ignore = b_grid_desc_k0_n0_n1_k1;
+    ignore = ds_grid_desc_m0_m10_m11_n0_n10_n11;
+    ignore = e_grid_desc_m0_m10_m11_n0_n10_n11;
+    ignore = compute_ptr_offset_of_batch;
+    ignore = block_2_ctile_map;
+
+#endif
+}
+
+template <typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation,
+          GemmSpecialization GemmSpec,
+          index_t BlockSize,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t K0PerBlock,
+          index_t K1,
+          index_t M1PerThread,
+          index_t N1PerThread,
+          index_t KPerThread,
+          typename M1N1ThreadClusterM1Xs,
+          typename M1N1ThreadClusterN1Xs,
+          typename ABlockTransferThreadSliceLengths_K0_M0_M1_K1,
+          typename ABlockTransferThreadClusterLengths_K0_M0_M1_K1,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          typename ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1,
+          typename ABlockTransferSrcVectorTensorContiguousDimOrder,
+          typename ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1,
+          typename BBlockTransferThreadSliceLengths_K0_N0_N1_K1,
+          typename BBlockTransferThreadClusterLengths_K0_N0_N1_K1,
+          typename BBlockTransferThreadClusterArrangeOrder,
+          typename BBlockTransferSrcAccessOrder,
+          typename BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1,
+          typename BBlockTransferSrcVectorTensorContiguousDimOrder,
+          typename BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1,
+          typename CThreadTransferSrcDstAccessOrder,
+          index_t CThreadTransferSrcDstVectorDim,
+          index_t CThreadTransferDstScalarPerVector,
+          enable_if_t<
+              is_same_v<AElementwiseOperation, ck::tensor_operation::element_wise::PassThrough> &&
+                  is_same_v<BElementwiseOperation, ck::tensor_operation::element_wise::PassThrough>,
+              bool> = false>
+struct DeviceBatchedGemmMultipleD_Dl : public DeviceBatchedGemmMultiD<ALayout,
+                                                                      BLayout,
+                                                                      DsLayout,
+                                                                      ELayout,
+                                                                      ADataType,
+                                                                      BDataType,
+                                                                      DsDataType,
+                                                                      EDataType,
+                                                                      AElementwiseOperation,
+                                                                      BElementwiseOperation,
+                                                                      CDEElementwiseOperation>
+
+{
+    using DeviceOp                      = DeviceBatchedGemmMultipleD_Dl;
+    static constexpr index_t NumDTensor = DsDataType::Size(); // number of D tensors
+
+    static constexpr auto I0 = Number<0>{}; // I0..I5: compile-time index constants
+    static constexpr auto I1 = Number<1>{};
+    static constexpr auto I2 = Number<2>{};
+    static constexpr auto I3 = Number<3>{};
+    static constexpr auto I4 = Number<4>{};
+    static constexpr auto I5 = Number<5>{};
+
+    static constexpr auto K1Number = Number<K1>{}; // K1 as a compile-time Number
+
+    // Builds the A grid descriptor with dimensions [K0, M, K1], where K0 = K / K1.
+    static auto MakeAGridDescriptor_K0_M_K1(index_t M, index_t K, index_t StrideA)
+    {
+        const index_t K0 = K / K1;
+        // Plain [M, K] view of A; ALayout selects which dimension carries StrideA.
+        const auto a_desc_raw = [&]() {
+            if constexpr(is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1));
+            }
+            else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA));
+            }
+        }();
+
+        // Common tail: K (dim 1) is unmerged into output dims {0, 2}; M (dim 0) becomes dim 1.
+        const auto with_m_transform = [&](const auto& m_transform) {
+            return transform_tensor_descriptor(
+                a_desc_raw,
+                make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)), m_transform),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+        };
+
+        if constexpr(GemmSpec == GemmSpecialization::MNPadding)
+        {
+            // Right-pad M up to the next multiple of MPerBlock (no padding when aligned).
+            const auto pad_m = (MPerBlock - M % MPerBlock) % MPerBlock;
+            return with_m_transform(make_right_pad_transform(M, pad_m));
+        }
+        else
+        {
+            return with_m_transform(make_pass_through_transform(M));
+        }
+    }
+
+    // Builds the B grid descriptor with dimensions [K0, N, K1], where K0 = K / K1.
+    static auto MakeBGridDescriptor_K0_N_K1(index_t K, index_t N, index_t StrideB)
+    {
+        const index_t K0 = K / K1;
+        // Plain [K, N] view of B; BLayout selects which dimension carries StrideB.
+        const auto b_desc_raw = [&]() {
+            if constexpr(is_same_v<tensor_layout::gemm::RowMajor, BLayout>)
+            {
+                return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(StrideB, I1));
+            }
+            else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, BLayout>)
+            {
+                return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(I1, StrideB));
+            }
+        }();
+
+        // Common tail: K (dim 0) is unmerged into output dims {0, 2}; N (dim 1) stays dim 1.
+        const auto with_n_transform = [&](const auto& n_transform) {
+            return transform_tensor_descriptor(
+                b_desc_raw,
+                make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)), n_transform),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+        };
+
+        if constexpr(GemmSpec == GemmSpecialization::MNPadding)
+        {
+            // Right-pad N up to the next multiple of NPerBlock (no padding when aligned).
+            const auto pad_n = (NPerBlock - N % NPerBlock) % NPerBlock;
+            return with_n_transform(make_right_pad_transform(N, pad_n));
+        }
+        else
+        {
+            return with_n_transform(make_pass_through_transform(N));
+        }
+    }
+
+    // Builds an [M, N] grid descriptor for E, or for a D tensor, laid out according to ELay.
+    template <typename ELay>
+    static auto MakeEGridDescriptor_M_N(index_t M, index_t N, index_t StrideE)
+    {
+        // Plain [M, N] view; ELay selects which dimension carries StrideE.
+        const auto e_desc_raw = [&]() {
+            if constexpr(is_same_v<tensor_layout::gemm::RowMajor, ELay>)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideE, I1));
+            }
+            else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ELay>)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideE));
+            }
+        }();
+
+        // Applies one transform per dimension; the dimension order is left unchanged.
+        const auto transformed = [&](const auto& m_transform, const auto& n_transform) {
+            return transform_tensor_descriptor(e_desc_raw,
+                                               make_tuple(m_transform, n_transform),
+                                               make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                               make_tuple(Sequence<0>{}, Sequence<1>{}));
+        };
+        if constexpr(GemmSpec == GemmSpecialization::MNPadding)
+        {
+            const auto pad_m = (MPerBlock - M % MPerBlock) % MPerBlock;
+            const auto pad_n = (NPerBlock - N % NPerBlock) % NPerBlock;
+            return transformed(make_right_pad_transform(M, pad_m),
+                               make_right_pad_transform(N, pad_n));
+        }
+        else
+        {
+            return transformed(make_pass_through_transform(M), make_pass_through_transform(N));
+        }
+    }
+
+    // Builds one [M, N] descriptor per D tensor, each using its own layout from DsLayout.
+    static auto MakeDsGridDescriptor_M_N(const std::array<index_t, NumDTensor>& MRaws,
+                                         const std::array<index_t, NumDTensor>& NRaws,
+                                         const std::array<index_t, NumDTensor>& DsStride)
+    {
+        const auto make_d_desc = [&](auto i) {
+            using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
+            return DeviceOp::MakeEGridDescriptor_M_N<DLayout>(MRaws[i], NRaws[i], DsStride[i]);
+        };
+
+        return generate_tuple(make_d_desc, Number<NumDTensor>{});
+    }
+    // Descriptor types, deduced by calling the makers above with placeholder arguments.
+    using AGridDesc_K0_M_K1 = decltype(MakeAGridDescriptor_K0_M_K1(1, 1, 1));
+    using BGridDesc_K0_N_K1 = decltype(MakeBGridDescriptor_K0_N_K1(1, 1, 1));
+    using DsGridDesc_M_N    = decltype(MakeDsGridDescriptor_M_N({}, {}, {}));
+    using EGridDesc_M_N     = decltype(MakeEGridDescriptor_M_N<ELayout>(1, 1, 1));
+
+    // Maps a batch index to A/B/D/E pointer offsets; strides are cast to long_index_t first.
+    struct ComputePtrOffsetOfStridedBatch
+    {
+        ComputePtrOffsetOfStridedBatch(index_t BatchStrideA,
+                                       index_t BatchStrideB,
+                                       std::array<ck::index_t, NumDTensor> BatchStrideDs,
+                                       index_t BatchStrideE)
+            : BatchStrideA_(BatchStrideA),
+              BatchStrideB_(BatchStrideB),
+              BatchStrideDs_(BatchStrideDs),
+              BatchStrideE_(BatchStrideE)
+        {
+        }
+
+        __host__ __device__ constexpr long_index_t GetAPtrOffset(index_t g_idx) const
+        {
+            return g_idx * static_cast<long_index_t>(BatchStrideA_);
+        }
+
+        __host__ __device__ constexpr long_index_t GetBPtrOffset(index_t g_idx) const
+        {
+            return g_idx * static_cast<long_index_t>(BatchStrideB_);
+        }
+
+        __host__ __device__ constexpr auto GetDsPtrOffset(index_t g_idx) const
+        {
+            std::array<long_index_t, NumDTensor> ds_offset{}; // initialized: needed for constexpr
+            static_for<0, NumDTensor, 1>{}([&](auto i) {
+                ds_offset[i] = g_idx * static_cast<long_index_t>(BatchStrideDs_[i]);
+            });
+            return ds_offset;
+        }
+
+        __host__ __device__ constexpr long_index_t GetEPtrOffset(index_t g_idx) const
+        {
+            return g_idx * static_cast<long_index_t>(BatchStrideE_);
+        }
+        private:
+        index_t BatchStrideA_;
+        index_t BatchStrideB_;
+        std::array<ck::index_t, NumDTensor> BatchStrideDs_;
+        index_t BatchStrideE_;
+    };
+
+    // GridwiseGemm used by the Invoker. NOTE(review): BDataType is not forwarded — confirm.
+    using GridwiseGemm =
+        GridwiseGemmDlMultipleD_km_kn_mn<BlockSize,
+                                         ADataType,
+                                         AccDataType,
+                                         DsDataType,
+                                         EDataType,
+                                         AElementwiseOperation,
+                                         BElementwiseOperation,
+                                         CDEElementwiseOperation,
+                                         InMemoryDataOperationEnum::Set,
+                                         AGridDesc_K0_M_K1,
+                                         BGridDesc_K0_N_K1,
+                                         EGridDesc_M_N,
+                                         MPerBlock,
+                                         NPerBlock,
+                                         K0PerBlock,
+                                         K1,
+                                         M1PerThread,
+                                         N1PerThread,
+                                         KPerThread,
+                                         M1N1ThreadClusterM1Xs,
+                                         M1N1ThreadClusterN1Xs,
+                                         ABlockTransferThreadSliceLengths_K0_M0_M1_K1,
+                                         ABlockTransferThreadClusterLengths_K0_M0_M1_K1,
+                                         ABlockTransferThreadClusterArrangeOrder,
+                                         ABlockTransferSrcAccessOrder,
+                                         ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1,
+                                         ABlockTransferSrcVectorTensorContiguousDimOrder,
+                                         ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1,
+                                         BBlockTransferThreadSliceLengths_K0_N0_N1_K1,
+                                         BBlockTransferThreadClusterLengths_K0_N0_N1_K1,
+                                         BBlockTransferThreadClusterArrangeOrder,
+                                         BBlockTransferSrcAccessOrder,
+                                         BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1,
+                                         BBlockTransferSrcVectorTensorContiguousDimOrder,
+                                         BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1,
+                                         CThreadTransferSrcDstAccessOrder,
+                                         CThreadTransferSrcDstVectorDim,
+                                         CThreadTransferDstScalarPerVector>;
+    // Tiled descriptor and tile-map types, deduced from GridwiseGemm's factory functions.
+    using AGridDesc_K0_M0_M1_K1 =
+        decltype(GridwiseGemm::MakeAGridDescriptor_K0_M0_M1_K1(AGridDesc_K0_M_K1{}));
+    using BGridDesc_K0_N0_N1_K1 =
+        decltype(GridwiseGemm::MakeBGridDescriptor_K0_N0_N1_K1(BGridDesc_K0_N_K1{}));
+    using DsGridDesc_M0_M10_M11_N0_N10_N11 =
+        decltype(GridwiseGemm::MakeDsGridDescriptor_M0_M10_M11_N0_N10_N11(DsGridDesc_M_N{}));
+    using EGridDesc_M0_M10_M11_N0_N10_N11 =
+        decltype(GridwiseGemm::MakeCGridDescriptor_M0_M10_M11_N0_N10_N11(EGridDesc_M_N{}));
+    using DefaultBlock2CTileMap =
+        decltype(GridwiseGemm::MakeDefaultBlock2CTileMap(EGridDesc_M_N{}));
+
+    // Argument
+    // Argument: typed pointers, grid descriptors and batch strides for one kernel invocation.
+    struct Argument : public BaseArgument
+    {
+        Argument(const void* p_a_grid,
+                 const void* p_b_grid,
+                 std::array<const void*, NumDTensor> p_ds_grid,
+                 void* p_e_grid,
+                 index_t M,
+                 index_t N,
+                 index_t K,
+                 index_t Batch,
+                 index_t StrideA,
+                 index_t StrideB,
+                 std::array<index_t, NumDTensor> StrideDs,
+                 index_t StrideE,
+                 index_t BatchStrideA,
+                 index_t BatchStrideB,
+                 const std::array<ck::index_t, NumDTensor>& BatchStrideDs,
+                 index_t BatchStrideE,
+                 AElementwiseOperation a_element_op,
+                 BElementwiseOperation b_element_op,
+                 CDEElementwiseOperation cde_element_op)
+            : p_a_grid_{static_cast<const ADataType*>(p_a_grid)},
+              p_b_grid_{static_cast<const BDataType*>(p_b_grid)},
+              p_ds_grid_{},
+              p_e_grid_{static_cast<EDataType*>(p_e_grid)},
+              K_(K),
+              Batch_(Batch),
+              a_grid_desc_k0_m0_m1_k1_{},
+              b_grid_desc_k0_n0_n1_k1_{},
+              e_grid_desc_m0_m10_m11_n0_n10_n11_{},
+              compute_ptr_offset_of_batch_{BatchStrideA, BatchStrideB, BatchStrideDs, BatchStrideE},
+              block_2_ctile_map_{},
+              a_element_op_{a_element_op},
+              b_element_op_{b_element_op},
+              cde_element_op_{cde_element_op}
+        {
+            // Untiled descriptors built directly from the problem sizes and strides.
+            a_grid_desc_k0_m_k1_ = DeviceOp::MakeAGridDescriptor_K0_M_K1(M, K, StrideA);
+            b_grid_desc_k0_n_k1_ = DeviceOp::MakeBGridDescriptor_K0_N_K1(K, N, StrideB);
+            e_grid_desc_m_n_     = DeviceOp::MakeEGridDescriptor_M_N<ELayout>(M, N, StrideE);
+
+            // Typed pointer and [M, N] descriptor for every D tensor.
+            static_for<0, NumDTensor, 1>{}([&](auto i) {
+                using DLayout   = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
+                using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
+
+                p_ds_grid_(i) = static_cast<const DDataType*>(p_ds_grid[i]);
+                ds_grid_desc_m_n_(i) =
+                    DeviceOp::MakeEGridDescriptor_M_N<DLayout>(M, N, StrideDs[i]);
+            });
+
+            // Invalid problems keep the default-constructed tiled descriptors and tile map.
+            const bool is_valid = GridwiseGemm::CheckValidity(
+                a_grid_desc_k0_m_k1_, b_grid_desc_k0_n_k1_, e_grid_desc_m_n_);
+            if(!is_valid)
+            {
+                return;
+            }
+
+            a_grid_desc_k0_m0_m1_k1_ =
+                GridwiseGemm::MakeAGridDescriptor_K0_M0_M1_K1(a_grid_desc_k0_m_k1_);
+            b_grid_desc_k0_n0_n1_k1_ =
+                GridwiseGemm::MakeBGridDescriptor_K0_N0_N1_K1(b_grid_desc_k0_n_k1_);
+            ds_grid_desc_m0_m10_m11_n0_n10_n11_ =
+                GridwiseGemm::MakeDsGridDescriptor_M0_M10_M11_N0_N10_N11(ds_grid_desc_m_n_);
+            e_grid_desc_m0_m10_m11_n0_n10_n11_ =
+                GridwiseGemm::MakeCGridDescriptor_M0_M10_M11_N0_N10_N11(e_grid_desc_m_n_);
+            block_2_ctile_map_ = GridwiseGemm::MakeDefaultBlock2CTileMap(e_grid_desc_m_n_);
+        }
+
+        //  private:
+        const ADataType* p_a_grid_;
+        const BDataType* p_b_grid_;
+        typename GridwiseGemm::DsGridPointer p_ds_grid_;
+        EDataType* p_e_grid_;
+
+        // K is kept so IsSupportedArgument can check K % K1 == 0
+        index_t K_;
+
+        // Batch
+        index_t Batch_;
+
+        AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1_;
+        BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1_;
+        DsGridDesc_M_N ds_grid_desc_m_n_;
+        EGridDesc_M_N e_grid_desc_m_n_;
+
+        AGridDesc_K0_M0_M1_K1 a_grid_desc_k0_m0_m1_k1_;
+        BGridDesc_K0_N0_N1_K1 b_grid_desc_k0_n0_n1_k1_;
+        DsGridDesc_M0_M10_M11_N0_N10_N11 ds_grid_desc_m0_m10_m11_n0_n10_n11_;
+        EGridDesc_M0_M10_M11_N0_N10_N11 e_grid_desc_m0_m10_m11_n0_n10_n11_;
+
+        // for calculating batch offset
+        ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch_;
+
+        DefaultBlock2CTileMap block_2_ctile_map_;
+
+        // TODO: unused since gridwise_gemm_dl_v1r3 does NOT support prologue for the time being.
+        AElementwiseOperation a_element_op_;
+        BElementwiseOperation b_element_op_;
+        CDEElementwiseOperation cde_element_op_;
+    };
+
+    // Invoker
+    // Invoker: validates an Argument, selects the kernel instantiation and launches it.
+    struct Invoker : public BaseInvoker
+    {
+        using Argument = DeviceBatchedGemmMultipleD_Dl::Argument;
+
+        // Launches the batched GEMM described by arg and returns launch_and_time_kernel's
+        // result. Throws std::runtime_error when GridwiseGemm::CheckValidity rejects arg.
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            // Print the untiled descriptor lengths; each label names the member it prints.
+            std::cout << "arg.a_grid_desc_k0_m_k1_{"
+                      << arg.a_grid_desc_k0_m_k1_.GetLength(I0) << ", "
+                      << arg.a_grid_desc_k0_m_k1_.GetLength(I1) << ", "
+                      << arg.a_grid_desc_k0_m_k1_.GetLength(I2) << "}" << std::endl;
+
+            std::cout << "arg.b_grid_desc_k0_n_k1_{"
+                      << arg.b_grid_desc_k0_n_k1_.GetLength(I0) << ", "
+                      << arg.b_grid_desc_k0_n_k1_.GetLength(I1) << ", "
+                      << arg.b_grid_desc_k0_n_k1_.GetLength(I2) << "}" << std::endl;
+
+            std::cout << "arg.e_grid_desc_m_n_{ " << arg.e_grid_desc_m_n_.GetLength(I0) << ", "
+                      << arg.e_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
+
+            if(!GridwiseGemm::CheckValidity(
+                   arg.a_grid_desc_k0_m_k1_, arg.b_grid_desc_k0_n_k1_, arg.e_grid_desc_m_n_))
+            {
+                throw std::runtime_error(
+                    "wrong! GridwiseGemmDlMultipleD_km_kn_mn has invalid setting");
+            }
+
+            // Grid size of a single GEMM multiplied by the number of batches.
+            const index_t grid_size =
+                GridwiseGemm::CalculateGridSize(arg.e_grid_desc_m_n_.GetLength(I0),
+                                                arg.e_grid_desc_m_n_.GetLength(I1)) *
+                arg.Batch_;
+
+            // Instantiates the kernel for the given compile-time loop flags and launches it.
+            auto launch_kernel = [&](auto has_main_k_block_loop,
+                                     auto has_double_tail_k_block_loop) {
+                constexpr bool has_main_loop   = has_main_k_block_loop.value;
+                constexpr bool has_double_loop = has_double_tail_k_block_loop.value;
+
+                const auto kernel =
+                    kernel_gemm_dl_multiple_d<GridwiseGemm,
+                                              ADataType,
+                                              typename GridwiseGemm::DsGridPointer,
+                                              EDataType,
+                                              AElementwiseOperation,
+                                              BElementwiseOperation,
+                                              CDEElementwiseOperation,
+                                              DeviceOp::AGridDesc_K0_M0_M1_K1,
+                                              DeviceOp::BGridDesc_K0_N0_N1_K1,
+                                              DeviceOp::DsGridDesc_M0_M10_M11_N0_N10_N11,
+                                              DeviceOp::EGridDesc_M0_M10_M11_N0_N10_N11,
+                                              ComputePtrOffsetOfStridedBatch,
+                                              DefaultBlock2CTileMap,
+                                              has_main_loop,
+                                              has_double_loop>;
+
+                return launch_and_time_kernel(stream_config,
+                                              kernel,
+                                              dim3(grid_size),
+                                              dim3(BlockSize),
+                                              0,
+                                              arg.p_a_grid_,
+                                              arg.p_b_grid_,
+                                              arg.p_ds_grid_,
+                                              arg.p_e_grid_,
+                                              arg.Batch_,
+                                              arg.a_element_op_,
+                                              arg.b_element_op_,
+                                              arg.cde_element_op_,
+                                              arg.a_grid_desc_k0_m0_m1_k1_,
+                                              arg.b_grid_desc_k0_n0_n1_k1_,
+                                              arg.ds_grid_desc_m0_m10_m11_n0_n10_n11_,
+                                              arg.e_grid_desc_m0_m10_m11_n0_n10_n11_,
+                                              arg.compute_ptr_offset_of_batch_,
+                                              arg.block_2_ctile_map_);
+            };
+
+            const auto K0                    = arg.a_grid_desc_k0_m0_m1_k1_.GetLength(I0);
+            const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K0);
+            const bool has_double_tail_k_block_loop =
+                GridwiseGemm::CalculateHasDoubleTailKBlockLoop(K0);
+
+            using TrueC  = integral_constant<bool, true>;
+            using FalseC = integral_constant<bool, false>;
+
+            if(has_main_k_block_loop)
+            {
+                return has_double_tail_k_block_loop ? launch_kernel(TrueC{}, TrueC{})
+                                                    : launch_kernel(TrueC{}, FalseC{});
+            }
+
+            return has_double_tail_k_block_loop ? launch_kernel(FalseC{}, TrueC{})
+                                                : launch_kernel(FalseC{}, FalseC{});
+        }
+
+        // polymorphic
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
+        {
+            const auto* p_typed_arg = dynamic_cast<const Argument*>(p_arg);
+            if(p_typed_arg == nullptr)
+            {
+                throw std::runtime_error("wrong! p_arg is not an Argument of this operation");
+            }
+            return Run(*p_typed_arg, stream_config);
+        }
+    };
+
+    static constexpr bool IsValidCompilationParameter()
+    {
+        // TODO: properly implement this check; until then every configuration is accepted
+        return true;
+    }
+
+    // Checks the device target, K % K1 == 0 and GridwiseGemm::CheckValidity for arg.
+    static bool IsSupportedArgument(const Argument& arg)
+    {
+        // Query the device name once instead of once per comparison.
+        const auto device_name = ck::get_device_name();
+
+        // TODO: Enable for gfx90a after compiler fix
+        const bool is_supported_device =
+            device_name == "gfx906" || device_name == "gfx908" || device_name == "gfx1030" ||
+            device_name == "gfx940" || device_name == "gfx1100" || device_name == "gfx1101" ||
+            device_name == "gfx1102";
+
+        if(!is_supported_device)
+        {
+            return false;
+        }
+
+        return arg.K_ % K1 == 0 &&
+               GridwiseGemm::CheckValidity(
+                   arg.a_grid_desc_k0_m_k1_, arg.b_grid_desc_k0_n_k1_, arg.e_grid_desc_m_n_);
+    }
+
+    // polymorphic: an argument of a different type is reported as unsupported
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        const auto* p_typed_arg = dynamic_cast<const Argument*>(p_arg);
+        return p_typed_arg != nullptr && IsSupportedArgument(*p_typed_arg);
+    }
+
+    // Builds an Argument by value from raw pointers and problem sizes.
+    //
+    // p_a, p_b, p_e    untyped pointers to the A, B and E tensors
+    // p_ds             one untyped pointer per D tensor
+    // M, N, K, Batch   GEMM sizes and number of batches
+    // Stride*          strides used to build the A, B, D and E grid descriptors
+    // BatchStride*     strides used to compute each batch's pointer offsets
+    // *_element_op     element-wise operators stored in the Argument
+    // All parameters are forwarded unchanged to the Argument constructor.
+    static auto MakeArgument(const void* p_a,
+                             const void* p_b,
+                             std::array<const void*, NumDTensor> p_ds,
+                             void* p_e,
+                             index_t M,
+                             index_t N,
+                             index_t K,
+                             index_t Batch,
+                             index_t StrideA,
+                             index_t StrideB,
+                             std::array<ck::index_t, NumDTensor> StrideDs,
+                             index_t StrideE,
+                             index_t BatchStrideA,
+                             index_t BatchStrideB,
+                             const std::array<ck::index_t, NumDTensor>& BatchStrideDs,
+                             index_t BatchStrideE,
+                             AElementwiseOperation a_element_op,
+                             BElementwiseOperation b_element_op,
+                             CDEElementwiseOperation cde_element_op)
+    {
+        return Argument{// tensor pointers
+                        p_a, p_b, p_ds, p_e,
+                        // problem sizes and batch count
+                        M, N, K, Batch,
+                        // per-tensor strides
+                        StrideA, StrideB, StrideDs, StrideE,
+                        // per-batch strides
+                        BatchStrideA, BatchStrideB, BatchStrideDs, BatchStrideE,
+                        // element-wise operators
+                        a_element_op, b_element_op, cde_element_op};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; } // returns a value-initialized Invoker
+
+    // polymorphic
+    // Heap-allocating counterpart of MakeArgument: returns the Argument as BaseArgument.
+    //
+    // p_a, p_b, p_e    untyped pointers to the A, B and E tensors
+    // p_ds             one untyped pointer per D tensor
+    // M, N, K, Batch   GEMM sizes and number of batches
+    // Stride*          strides used to build the A, B, D and E grid descriptors
+    // BatchStride*     strides used to compute each batch's pointer offsets
+    // *_element_op     element-wise operators stored in the Argument
+    // All parameters are forwarded unchanged to the Argument constructor.
+    std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_a,
+                        const void* p_b,
+                        const std::array<const void*, NumDTensor>& p_ds,
+                        void* p_e,
+                        index_t M,
+                        index_t N,
+                        index_t K,
+                        index_t Batch,
+                        index_t StrideA,
+                        index_t StrideB,
+                        const std::array<ck::index_t, NumDTensor>& StrideDs,
+                        index_t StrideE,
+                        index_t BatchStrideA,
+                        index_t BatchStrideB,
+                        const std::array<ck::index_t, NumDTensor>& BatchStrideDs,
+                        index_t BatchStrideE,
+                        AElementwiseOperation a_element_op,
+                        BElementwiseOperation b_element_op,
+                        CDEElementwiseOperation cde_element_op) override
+    {
+        return std::make_unique<Argument>(// tensor pointers
+                                          p_a, p_b, p_ds, p_e,
+                                          // problem sizes and batch count
+                                          M, N, K, Batch,
+                                          // per-tensor strides
+                                          StrideA, StrideB, StrideDs, StrideE,
+                                          // per-batch strides
+                                          BatchStrideA, BatchStrideB, BatchStrideDs, BatchStrideE,
+                                          // element-wise operators
+                                          a_element_op, b_element_op, cde_element_op);
+    }
+
+    // polymorphic: heap-allocated Invoker, constructed in place without a temporary
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+    {
+        return std::make_unique<Invoker>();
+    }
+
+    // polymorphic: operation name followed by the block and thread tile template parameters
+    std::string GetTypeString() const override
+    {
+        std::stringstream type_name;
+
+        // clang-format off
+        type_name << "DeviceBatchedGemmMultipleD_Dl"
+                  << "<"
+                  << BlockSize << ", "
+                  << MPerBlock << ", "
+                  << NPerBlock << ", "
+                  << K0PerBlock << ", "
+                  << K1 << ", "
+                  << M1PerThread << ", "
+                  << N1PerThread << ", "
+                  << KPerThread
+                  << ">";
+        // clang-format on
+
+        return type_name.str();
+    }
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp
new file mode 100644
index 000000000..ae12f4c7a
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp
@@ -0,0 +1,337 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <cstdlib>
+#include <vector>
+#include <memory>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// F16 A/B/E, Empty_Tuple Ds. Name tags map to layouts: gmk/gkm = A Row/Col, gkn/gnk = B Row/Col.
+void add_device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Col,
+                                                        Row,
+                                                        Empty_Tuple,
+                                                        Row,
+                                                        F16,
+                                                        F16,
+                                                        Empty_Tuple,
+                                                        F16,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        PassThrough>>>& instances);
+
+void add_device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Col,
+                                                        Col,
+                                                        Empty_Tuple,
+                                                        Row,
+                                                        F16,
+                                                        F16,
+                                                        Empty_Tuple,
+                                                        F16,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        PassThrough>>>& instances);
+
+void add_device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Row,
+                                                        Row,
+                                                        Empty_Tuple,
+                                                        Row,
+                                                        F16,
+                                                        F16,
+                                                        Empty_Tuple,
+                                                        F16,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        PassThrough>>>& instances);
+
+void add_device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Row,
+                                                        Col,
+                                                        Empty_Tuple,
+                                                        Row,
+                                                        F16,
+                                                        F16,
+                                                        Empty_Tuple,
+                                                        F16,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        PassThrough>>>& instances);
+// F16 A/B/E "_irregular" variants of the same four A/B layout combinations; E stays Row.
+void add_device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_irregular_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Col,
+                                                        Row,
+                                                        Empty_Tuple,
+                                                        Row,
+                                                        F16,
+                                                        F16,
+                                                        Empty_Tuple,
+                                                        F16,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        PassThrough>>>& instances);
+
+void add_device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_irregular_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Col,
+                                                        Col,
+                                                        Empty_Tuple,
+                                                        Row,
+                                                        F16,
+                                                        F16,
+                                                        Empty_Tuple,
+                                                        F16,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        PassThrough>>>& instances);
+
+void add_device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_irregular_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Row,
+                                                        Row,
+                                                        Empty_Tuple,
+                                                        Row,
+                                                        F16,
+                                                        F16,
+                                                        Empty_Tuple,
+                                                        F16,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        PassThrough>>>& instances);
+
+void add_device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_irregular_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Row,
+                                                        Col,
+                                                        Empty_Tuple,
+                                                        Row,
+                                                        F16,
+                                                        F16,
+                                                        Empty_Tuple,
+                                                        F16,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        PassThrough>>>& instances);
+// int8_t A/B/E variants of the four A/B layout combinations; E is Row, Ds is Empty_Tuple.
+void add_device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Col,
+                                                        Row,
+                                                        Empty_Tuple,
+                                                        Row,
+                                                        int8_t,
+                                                        int8_t,
+                                                        Empty_Tuple,
+                                                        int8_t,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        PassThrough>>>& instances);
+
+void add_device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Col,
+                                                        Col,
+                                                        Empty_Tuple,
+                                                        Row,
+                                                        int8_t,
+                                                        int8_t,
+                                                        Empty_Tuple,
+                                                        int8_t,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        PassThrough>>>& instances);
+
+void add_device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Row,
+                                                        Row,
+                                                        Empty_Tuple,
+                                                        Row,
+                                                        int8_t,
+                                                        int8_t,
+                                                        Empty_Tuple,
+                                                        int8_t,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        PassThrough>>>& instances);
+
+void add_device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Row,
+                                                        Col,
+                                                        Empty_Tuple,
+                                                        Row,
+                                                        int8_t,
+                                                        int8_t,
+                                                        Empty_Tuple,
+                                                        int8_t,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        PassThrough>>>& instances);
+// int8_t A/B/E "_irregular" variants of the same four A/B layout combinations; E stays Row.
+void add_device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_irregular_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Col,
+                                                        Row,
+                                                        Empty_Tuple,
+                                                        Row,
+                                                        int8_t,
+                                                        int8_t,
+                                                        Empty_Tuple,
+                                                        int8_t,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        PassThrough>>>& instances);
+
+void add_device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_irregular_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Col,
+                                                        Col,
+                                                        Empty_Tuple,
+                                                        Row,
+                                                        int8_t,
+                                                        int8_t,
+                                                        Empty_Tuple,
+                                                        int8_t,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        PassThrough>>>& instances);
+
+void add_device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_irregular_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Row,
+                                                        Row,
+                                                        Empty_Tuple,
+                                                        Row,
+                                                        int8_t,
+                                                        int8_t,
+                                                        Empty_Tuple,
+                                                        int8_t,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        PassThrough>>>& instances);
+
+void add_device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_irregular_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Row,
+                                                        Col,
+                                                        Empty_Tuple,
+                                                        Row,
+                                                        int8_t,
+                                                        int8_t,
+                                                        Empty_Tuple,
+                                                        int8_t,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        PassThrough>>>& instances);
+// Instance factory for DeviceBatchedGemmMultiD with empty Ds and PassThrough element ops.
+template <typename ALayout,
+          typename BLayout,
+          typename ELayout,
+          typename ADataType,
+          typename BDataType,
+          typename EDataType>
+struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceBatchedGemmMultiD<
+    ALayout,
+    BLayout,
+    Empty_Tuple,
+    ELayout,
+    ADataType,
+    BDataType,
+    Empty_Tuple,
+    EDataType,
+    ck::tensor_operation::element_wise::PassThrough,
+    ck::tensor_operation::element_wise::PassThrough,
+    ck::tensor_operation::element_wise::PassThrough>>
+{
+    using DeviceOp = DeviceBatchedGemmMultiD<ALayout,
+                                             BLayout,
+                                             Empty_Tuple,
+                                             ELayout,
+                                             ADataType,
+                                             BDataType,
+                                             Empty_Tuple,
+                                             EDataType,
+                                             ck::tensor_operation::element_wise::PassThrough,
+                                             ck::tensor_operation::element_wise::PassThrough,
+                                             ck::tensor_operation::element_wise::PassThrough>;
+    // Passes a vector to the matching add_*_instances() pair and returns it (empty on no match).
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+
+        if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
+                     is_same_v<EDataType, half_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<ELayout, Row>)
+            {
+                add_device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_instances(op_ptrs);
+                add_device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_irregular_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                              is_same_v<ELayout, Row>)
+            {
+                add_device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_instances(op_ptrs);
+                add_device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_irregular_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
+                              is_same_v<ELayout, Row>)
+            {
+                add_device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_instances(op_ptrs);
+                add_device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_irregular_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
+                              is_same_v<ELayout, Row>)
+            {
+                add_device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_instances(op_ptrs);
+                add_device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_irregular_instances(
+                    op_ptrs);
+            }
+        }
+        else if constexpr(is_same_v<ADataType, int8_t> && is_same_v<BDataType, int8_t> &&
+                          is_same_v<EDataType, int8_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<ELayout, Row>)
+            {
+                add_device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_instances(op_ptrs);
+                add_device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_irregular_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                              is_same_v<ELayout, Row>)
+            {
+                add_device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_instances(op_ptrs);
+                add_device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_irregular_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
+                              is_same_v<ELayout, Row>)
+            {
+                add_device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_instances(op_ptrs);
+                add_device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_irregular_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
+                              is_same_v<ELayout, Row>)
+            {
+                add_device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_instances(op_ptrs);
+                add_device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_irregular_instances(
+                    op_ptrs);
+            }
+        }
+
+        return op_ptrs;
+    }
+};
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/CMakeLists.txt
new file mode 100644
index 000000000..fda55a930
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/CMakeLists.txt
@@ -0,0 +1,18 @@
+add_instance_library(device_batched_gemm_multi_d_instance
+   device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_instance.cpp
+   device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_instance.cpp
+   device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_instance.cpp
+   device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_instance.cpp
+   device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_irregular_instance.cpp
+   device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_irregular_instance.cpp
+   device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_irregular_instance.cpp
+   device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_irregular_instance.cpp
+   device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_instance.cpp
+   device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_instance.cpp
+   device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_instance.cpp
+   device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_instance.cpp
+   device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_irregular_instance.cpp
+   device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_irregular_instance.cpp
+   device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_irregular_instance.cpp
+   device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_irregular_instance.cpp
+)
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_instance.cpp
new file mode 100644
index 000000000..3fe9f78b2
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_instance.cpp
@@ -0,0 +1,95 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Element types used by the instance table below: F16 for A/B/C data, F32 for accumulation.
+using F16 = ck::half_t;
+using F32 = float;
+// Layout tags; the instances below use Col for A and Row for B and C.
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+// S<...> is shorthand for a ck::Sequence of compile-time indices.
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+// PassThrough fills the A/B/C elementwise-op slots; Empty_Tuple fills the Ds layout/type slots.
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Empty_Tuple = ck::Tuple<>;
+// GEMM specialization passed to the instances in the table below.
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+// Compilation parameters for a[k, m] * b[k, n] = c[m, n]
+using device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_instances = std::tuple<
+    // clang-format off
+        // ##########################| ALayout| BLayout|    DsLayout| CLayout| AData| BData| AccData|      DsData| CData|           A|           B|           C|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|       BBlockTransfer|     CThreadTransfer|  CThreadTransfer|    CThreadTransfer|
+        // ##########################|        |        |            |        |  Type|  Type|    Type|        Type|  Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
+        // ##########################|        |        |            |        |      |      |        |            |      |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|               Order|                 |                   |
+        // ##########################|        |        |            |        |      |      |        |            |      |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
+        // MPerBlock=128, NPerBlock=128
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<4, 4>,       S<4, 4>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=128, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,    64,    16,  2,          4,          4,      1,       S<8, 2>,       S<4, 2>,      S<2, 1, 8, 2>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 8, 2>,        S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,    64,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 4>,      S<2, 1, 8, 2>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 8, 2>,        S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=128
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    64,   128,    16,  2,          4,          4,      1,       S<4, 2>,       S<8, 2>,      S<2, 1, 8, 2>,        S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 8, 2>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    64,   128,    16,  2,          4,          4,      1,       S<2, 4>,       S<2, 8>,      S<2, 1, 8, 2>,        S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 8, 2>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=16, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  2,          1,          4,      1,       S<4, 2>,       S<4, 2>,      S<1, 1, 4, 2>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<4, 1, 4, 2>,       S<4, 1, 16, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=16
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  2,          4,          1,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,       S<16, 1, 4, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<2, 2>,       S<2, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<4, 1, 4, 2>,        S<4, 1, 4, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<1, 4>,       S<1, 4>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<4, 1, 4, 2>,        S<4, 1, 4, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<4, 1>,       S<8, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<8, 1, 4, 2>,       S<4, 1, 16, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<8, 1, 4, 2>,       S<4, 1, 16, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<4, 1>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,       S<32, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,       S<32, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
+    // clang-format on
+    >;
+// Forwards `instances` and a default-constructed instance tuple to add_device_operation_instances().
+void add_device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Col,
+                                                        Row,
+                                                        Empty_Tuple,
+                                                        Row,
+                                                        F16,
+                                                        F16,
+                                                        Empty_Tuple,
+                                                        F16,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances, device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_irregular_instance.cpp
new file mode 100644
index 000000000..4ab22bb03
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_irregular_instance.cpp
@@ -0,0 +1,84 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = ck::half_t; // AData / BData / CData type of every instance below
+using F32 = float; // AccData type of every instance below
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>; // shorthand for the integer-sequence template arguments
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Empty_Tuple = ck::Tuple<>; // passed for both DsLayout and DsData below
+
+static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+
+// Compilation parameters for a[k, m] * b[k, n] = c[m, n] (ALayout=Col, BLayout=Row, CLayout=Row)
+using device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_irregular_instances = std::tuple<
+    // clang-format off
+        // ##########################| ALayout| BLayout|    DsLayout| CLayout| AData| BData| AccData|      DsData| CData|           A|           B|           C|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|       BBlockTransfer|     CThreadTransfer|  CThreadTransfer|    CThreadTransfer|
+        // ##########################|        |        |            |        |  Type|  Type|    Type|        Type|  Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
+        // ##########################|        |        |            |        |      |      |        |            |      |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|               Order|                 |                   |
+        // ##########################|        |        |            |        |      |      |        |            |      |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
+        // Every instance in this tuple uses GemmMNPadding and CThreadTransfer DstScalarPerVector = 1
+        // MPerBlock=128, NPerBlock=128
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,   128,    16,  2,          4,          4,      1,       S<4, 4>,       S<4, 4>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,   128,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  2,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    16,    64,    16,  2,          1,          4,      1,       S<2, 4>,       S<2, 4>,      S<1, 1, 4, 2>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<4, 1, 4, 2>,       S<4, 1, 16, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=16
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    16,    16,  2,          4,          1,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,       S<16, 1, 4, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,     8,    64,    32,  2,          1,          2,      1,       S<4, 1>,       S<8, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<8, 1, 4, 2>,       S<4, 1, 16, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<8, 1, 4, 2>,       S<4, 1, 16, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<4, 1>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,       S<32, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,       S<32, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
+    // clang-format on
+    >;
+// Forwards `instances` and a default-constructed instance tuple to add_device_operation_instances().
+void add_device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_irregular_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Col,
+                                                        Row,
+                                                        Empty_Tuple,
+                                                        Row,
+                                                        F16,
+                                                        F16,
+                                                        Empty_Tuple,
+                                                        F16,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances, device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_irregular_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_instance.cpp
new file mode 100644
index 000000000..80c890cdb
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_instance.cpp
@@ -0,0 +1,95 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = ck::half_t; // AData / BData / CData type of the instances below
+using F32 = float; // AccData type of the instances below
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>; // shorthand for the integer-sequence template arguments
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Empty_Tuple = ck::Tuple<>; // passed for both DsLayout and DsData below
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+// Compilation parameters for a[k, m] * b[n, k] = c[m, n]
+using device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_instances = std::tuple<
+    // clang-format off
+        // ##########################| ALayout| BLayout|    DsLayout| CLayout| AData| BData| AccData|      DsData| CData|           A|           B|           C|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|       BBlockTransfer|     CThreadTransfer|  CThreadTransfer|    CThreadTransfer|
+        // ##########################|        |        |            |        |  Type|  Type|    Type|        Type|  Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
+        // ##########################|        |        |            |        |      |      |        |            |      |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|               Order|                 |                   |
+        // ##########################|        |        |            |        |      |      |        |            |      |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
+        // MPerBlock=128, NPerBlock=128
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<4, 4>,       S<4, 4>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=128, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,    64,    16,  2,          4,          4,      1,       S<8, 2>,       S<4, 2>,      S<2, 1, 8, 2>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,    64,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 4>,      S<2, 1, 8, 2>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=128
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    64,   128,    16,  2,          4,          4,      1,       S<4, 2>,       S<8, 2>,      S<2, 1, 8, 2>,        S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 2, 2>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    64,   128,    16,  2,          4,          4,      1,       S<2, 4>,       S<2, 8>,      S<2, 1, 8, 2>,        S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 2, 2>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=16, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  2,          1,          4,      1,       S<4, 2>,       S<4, 2>,      S<1, 1, 4, 2>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=16
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  2,          4,          1,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<2, 2>,       S<2, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<1, 4>,       S<1, 4>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<4, 1>,       S<8, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<4, 1>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
+    // clang-format on
+    >;
+
+void add_device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Col,
+                                                        Col,
+                                                        Empty_Tuple,
+                                                        Row,
+                                                        F16,
+                                                        F16,
+                                                        Empty_Tuple,
+                                                        F16,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        PassThrough>>>& instances)
+{ // Forwards the caller's list together with a default-constructed instance tuple to the helper.
+    add_device_operation_instances( // NOTE(review): presumably appends one op per tuple element - confirm
+        instances, device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_irregular_instance.cpp
new file mode 100644
index 000000000..647c58303
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_irregular_instance.cpp
@@ -0,0 +1,83 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = ck::half_t; // AData, BData and CData columns of the instance table below
+using F32 = float; // AccData column of the instance table below
+
+using Row = ck::tensor_layout::gemm::RowMajor; // CLayout of every instance below
+using Col = ck::tensor_layout::gemm::ColumnMajor; // ALayout and BLayout of every instance below
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>; // shorthand for the integer lists passed as template arguments below
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough; // fills all three elementwise-operation slots below
+using Empty_Tuple = ck::Tuple<>; // fills both the DsLayout and DsData slots below
+
+static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding; // GEMM specialization of every instance below
+
+// Compilation parameters for a[k, m] * b[n, k] = c[m, n] (A/B tagged Col, C tagged Row); every entry uses GemmMNPadding and CThreadTransfer DstScalarPerVector = 1
+using device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_irregular_instances = std::tuple<
+    // clang-format off
+        // ##########################| ALayout| BLayout|    DsLayout| CLayout| AData| BData| AccData|      DsData| CData|           A|           B|           C|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|       BBlockTransfer|     CThreadTransfer|  CThreadTransfer|    CThreadTransfer|
+        // ##########################|        |        |            |        |  Type|  Type|    Type|        Type|  Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
+        // ##########################|        |        |            |        |      |      |        |            |      |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|               Order|                 |                   |
+        // ##########################|        |        |            |        |      |      |        |            |      |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
+        // MPerBlock=128, NPerBlock=128
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,   128,    16,  2,          4,          4,      1,       S<4, 4>,       S<4, 4>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,   128,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  2,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    16,    64,    16,  2,          1,          4,      1,       S<2, 4>,       S<2, 4>,      S<1, 1, 4, 2>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=16
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    16,    16,  2,          4,          1,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,     8,    64,    32,  2,          1,          2,      1,       S<4, 1>,       S<8, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<4, 1>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
+    // clang-format on
+    >;
+
+void add_device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_irregular_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Col,
+                                                        Col,
+                                                        Empty_Tuple,
+                                                        Row,
+                                                        F16,
+                                                        F16,
+                                                        Empty_Tuple,
+                                                        F16,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        PassThrough>>>& instances)
+{ // Forwards the caller's list together with a default-constructed instance tuple to the helper.
+    add_device_operation_instances( // NOTE(review): presumably appends one op per tuple element - confirm
+        instances, device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_irregular_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_instance.cpp
new file mode 100644
index 000000000..3ce582f1f
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_instance.cpp
@@ -0,0 +1,95 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = ck::half_t; // AData, BData and CData columns of the instance table below
+using F32 = float; // AccData column of the instance table below
+
+using Row = ck::tensor_layout::gemm::RowMajor; // layout tag used for A, B and C in the instance table below
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>; // shorthand for the integer lists passed as template arguments below
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough; // fills the three elementwise-operation slots below
+using Empty_Tuple = ck::Tuple<>; // fills both the DsLayout and DsData slots below
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; // GEMM specialization used in the instance table below
+
+// Compilation parameters for a[m, k] * b[k, n] = c[m, n]
+using device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_instances = std::tuple<
+    // clang-format off
+        // ##########################| ALayout| BLayout|    DsLayout| CLayout| AData| BData| AccData|      DsData| CData|           A|           B|           C|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|       BBlockTransfer|     CThreadTransfer|  CThreadTransfer|    CThreadTransfer|
+        // ##########################|        |        |            |        |  Type|  Type|    Type|        Type|  Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
+        // ##########################|        |        |            |        |      |      |        |            |      |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|               Order|                 |                   |
+        // ##########################|        |        |            |        |      |      |        |            |      |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
+        // MPerBlock=128, NPerBlock=128
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<4, 4>,       S<4, 4>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=128, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,    64,    16,  2,          4,          4,      1,       S<8, 2>,       S<4, 2>,      S<8, 1, 2, 2>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 8, 2>,        S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,    64,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 4>,      S<8, 1, 2, 2>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 8, 2>,        S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=128
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    64,   128,    16,  2,          4,          4,      1,       S<4, 2>,       S<8, 2>,      S<8, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 8, 2>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    64,   128,    16,  2,          4,          4,      1,       S<2, 4>,       S<2, 8>,      S<8, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 8, 2>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=16, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  2,          1,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 1, 2>,      S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,        S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=16
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  2,          4,          1,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<2, 2>,       S<2, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<1, 4>,       S<1, 4>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<4, 1>,       S<8, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,        S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,        S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<4, 1>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
+    // clang-format on
+    >;
+
+void add_device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Row,
+                                                        Row,
+                                                        Empty_Tuple,
+                                                        Row,
+                                                        F16,
+                                                        F16,
+                                                        Empty_Tuple,
+                                                        F16,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        PassThrough>>>& instances)
+{
+    using OpTuple = device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_instances;
+    add_device_operation_instances(instances, OpTuple{}); // helper gets a value-initialized tuple
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_irregular_instance.cpp
new file mode 100644
index 000000000..34c29d038
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_irregular_instance.cpp
@@ -0,0 +1,83 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = ck::half_t; // AData / BData / CData column type in the table below
+using F32 = float;      // AccData column type in the table below
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor; // not referenced elsewhere in this file
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>; // shorthand for the integer lists in the table below
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Empty_Tuple = ck::Tuple<>; // fills the DsLayout and DsData slots
+
+static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+
+// Compilation parameters for a[m, k] * b[k, n] = c[m, n]; every instance below uses GemmMNPadding.
+using device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_irregular_instances = std::tuple<
+    // clang-format off
+        // ##########################| ALayout| BLayout|    DsLayout| CLayout| AData| BData| AccData|      DsData| CData|           A|           B|           C|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|       BBlockTransfer|     CThreadTransfer|  CThreadTransfer|    CThreadTransfer|
+        // ##########################|        |        |            |        |  Type|  Type|    Type|        Type|  Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
+        // ##########################|        |        |            |        |      |      |        |            |      |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder|  Lengths_K0_N0_N1_K1|               Order|                 |                   |
+        // ##########################|        |        |            |        |      |      |        |            |      |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
+        // MPerBlock=128, NPerBlock=128
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,   128,    16,  2,          4,          4,      1,       S<4, 4>,       S<4, 4>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,   128,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  2,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    16,    64,    16,  2,          1,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 1, 2>,      S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,       S<4, 1, 4, 2>,       S<4, 1, 16, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=16
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    16,    16,  2,          4,          1,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,       S<16, 1, 4, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,     8,    64,    32,  2,          1,          2,      1,       S<4, 1>,       S<8, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<4, 1>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
+    // clang-format on
+    >;
+
+void add_device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_irregular_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Row,
+                                                        Row,
+                                                        Empty_Tuple,
+                                                        Row,
+                                                        F16,
+                                                        F16,
+                                                        Empty_Tuple,
+                                                        F16,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        PassThrough>>>& instances)
+{
+    using OpTuple = device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_irregular_instances;
+    add_device_operation_instances(instances, OpTuple{}); // helper gets a value-initialized tuple
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_instance.cpp
new file mode 100644
index 000000000..e0da8e3d5
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_instance.cpp
@@ -0,0 +1,95 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Empty_Tuple = ck::Tuple<>;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+// Compilation parameters for batched a[g, m, k] * b[g, n, k] = c[g, m, n], GemmSpecialization::Default
+using device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_instances = std::tuple<
+    // clang-format off
+        // ##########################| ALayout| BLayout|    DsLayout| CLayout| AData| BData| AccData|      DsData| CData|           A|           B|           C|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|       BBlockTransfer|     CThreadTransfer|  CThreadTransfer|    CThreadTransfer|
+        // ##########################|        |        |            |        |  Type|  Type|    Type|        Type|  Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
+        // ##########################|        |        |            |        |      |      |        |            |      |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|               Order|                 |                   |
+        // ##########################|        |        |            |        |      |      |        |            |      |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
+        // MPerBlock=128, NPerBlock=128
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<4, 4>,       S<4, 4>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=128, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,    64,    16,  2,          4,          4,      1,       S<8, 2>,       S<4, 2>,      S<8, 1, 2, 2>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,    64,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 4>,      S<8, 1, 2, 2>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=128
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    64,   128,    16,  2,          4,          4,      1,       S<4, 2>,       S<8, 2>,      S<8, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 2, 2>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    64,   128,    16,  2,          4,          4,      1,       S<2, 4>,       S<2, 8>,      S<8, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 2, 2>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=16, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  2,          1,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 1, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=16
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  2,          4,          1,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<2, 2>,       S<2, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<1, 4>,       S<1, 4>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<4, 1>,       S<8, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<4, 1>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
+    // clang-format on
+    >;
+// Passes `instances` and the F16 gmk_gnk_gmn DL instance tuple to add_device_operation_instances.
+void add_device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Row,
+                                                        Col,
+                                                        Empty_Tuple,
+                                                        Row,
+                                                        F16,
+                                                        F16,
+                                                        Empty_Tuple,
+                                                        F16,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances, device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_irregular_instance.cpp
new file mode 100644
index 000000000..3edd3f78c
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_irregular_instance.cpp
@@ -0,0 +1,83 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Empty_Tuple = ck::Tuple<>;
+
+static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+
+// Compilation parameters for batched a[g, m, k] * b[g, n, k] = c[g, m, n], GemmSpecialization::MNPadding
+using device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_irregular_instances = std::tuple<
+    // clang-format off
+        // ##########################| ALayout| BLayout|    DsLayout| CLayout| AData| BData| AccData|      DsData| CData|           A|           B|           C|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|       BBlockTransfer|     CThreadTransfer|  CThreadTransfer|    CThreadTransfer|
+        // ##########################|        |        |            |        |  Type|  Type|    Type|        Type|  Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
+        // ##########################|        |        |            |        |      |      |        |            |      |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|               Order|                 |                   |
+        // ##########################|        |        |            |        |      |      |        |            |      |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
+        // MPerBlock=128, NPerBlock=128
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,   128,    16,  2,          4,          4,      1,       S<4, 4>,       S<4, 4>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,   128,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  2,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    16,    64,    16,  2,          1,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 1, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=16
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    16,    16,  2,          4,          1,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,     8,    64,    32,  2,          1,          2,      1,       S<4, 1>,       S<8, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<4, 1>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row,   F16,   F16,     F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
+    // clang-format on
+    >;
+// Passes `instances` and the F16 gmk_gnk_gmn irregular DL tuple to add_device_operation_instances.
+void add_device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_irregular_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Row,
+                                                        Col,
+                                                        Empty_Tuple,
+                                                        Row,
+                                                        F16,
+                                                        F16,
+                                                        Empty_Tuple,
+                                                        F16,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances, device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_irregular_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_instance.cpp
new file mode 100644
index 000000000..33234edc1
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_instance.cpp
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using Row = ck::tensor_layout::gemm::RowMajor;    // BLayout/CLayout tag in the table below
+using Col = ck::tensor_layout::gemm::ColumnMajor; // ALayout tag in the table below
+// S<...> abbreviates ck::Sequence<...>; the instance table uses it for every index list.
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+// Element-wise operation type and the empty tuple that fills the DsLayout/DsData slots.
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Empty_Tuple = ck::Tuple<>;
+// Value passed in the "GEMM Specialization" column of every instance in the table below.
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+// Compilation parameters for a[k, m] * b[k, n] = c[m, n] (int8_t A/B/C, int32_t accumulator)
+using device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_instances = std::tuple<
+    // clang-format off
+        // ##########################| ALayout| BLayout|    DsLayout| CLayout|  AData|  BData| AccData|      DsData| CData|           A|           B|           C|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|       BBlockTransfer|     CThreadTransfer|  CThreadTransfer|    CThreadTransfer|
+        // ##########################|        |        |            |        |   Type|   Type|    Type|        Type|  Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
+        // ##########################|        |        |            |        |       |       |        |            |      |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|               Order|                 |                   |
+        // ##########################|        |        |            |        |       |       |        |            |      |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
+        // MPerBlock=128, NPerBlock=128
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  4,          4,          4,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,   128,    16,  4,          4,          8,      1,       S<8, 2>,       S<4, 2>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,   128,    16,  4,          4,          8,      1,       S<2, 8>,       S<2, 4>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=128, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,    64,    16,  4,          4,          2,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<2, 1, 4, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=64, NPerBlock=128
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,    64,   128,    16,  4,          2,          4,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=32, NPerBlock=32
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<2, 4>,      S<2, 1, 4, 4>,        S<4, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<2, 1, 4, 4>,        S<4, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=16, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  4,          1,          4,      1,       S<4, 2>,       S<4, 2>,      S<1, 1, 4, 4>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<4, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  4,          1,          4,      1,       S<2, 4>,       S<2, 4>,      S<1, 1, 4, 4>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<4, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=16
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  4,          4,          1,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<1, 1, 4, 4>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  4,          4,          1,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<1, 1, 4, 4>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  4,          2,          2,      1,       S<4, 1>,       S<4, 1>,      S<4, 1, 4, 4>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<4, 1, 4, 4>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  4,          1,          2,      1,       S<4, 1>,       S<8, 2>,      S<1, 1, 4, 4>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<8, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  4,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<1, 1, 4, 4>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<8, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  4,          2,          1,      1,       S<8, 2>,       S<4, 1>,      S<8, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<1, 1, 4, 4>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  4,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<1, 1, 4, 4>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  4,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<1, 1, 4, 4>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<1, 1, 4, 4>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  4,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<1, 1, 4, 4>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<1, 1, 4, 4>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  4,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<1, 1, 4, 4>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<1, 1, 4, 4>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  4,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<1, 1, 4, 4>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<1, 1, 4, 4>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
+    // clang-format on
+    >;
+
+// Registers the int8 DL batched-GEMM kernel configurations for the gkm/gkn/gmn layout:
+// hands `instances` and a default-constructed
+// device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_instances tuple to
+// add_device_operation_instances() (presumably one entry is appended per tuple element;
+// confirm against add_device_operation_instance.hpp).
+void add_device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Col, Row, Empty_Tuple, Row,
+                                                        int8_t, int8_t, Empty_Tuple, int8_t,
+                                                        PassThrough, PassThrough,
+                                                        PassThrough>>>& instances)
+{
+    // Local alias so the registration call fits on one line.
+    using InstanceTuple = device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_instances;
+
+    add_device_operation_instances(instances, InstanceTuple{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_irregular_instance.cpp
new file mode 100644
index 000000000..16107e1bd
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_irregular_instance.cpp
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using Row = ck::tensor_layout::gemm::RowMajor;    // BLayout/CLayout tag in the table below
+using Col = ck::tensor_layout::gemm::ColumnMajor; // ALayout tag in the table below
+// S<...> abbreviates ck::Sequence<...>; the instance table uses it for every index list.
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+// Element-wise operation type and the empty tuple that fills the DsLayout/DsData slots.
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Empty_Tuple = ck::Tuple<>;
+// Value passed in the "GEMM Specialization" column of the instance table below.
+static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+
+// Compilation parameters for a[k, m] * b[k, n] = c[m, n]
+using device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_irregular_instances = std::tuple<
+    // clang-format off
+        // ##########################| ALayout| BLayout|    DsLayout| CLayout|  AData|  BData| AccData|      DsData| CData|           A|           B|           C|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|       BBlockTransfer|     CThreadTransfer|  CThreadTransfer|    CThreadTransfer|
+        // ##########################|        |        |            |        |   Type|   Type|    Type|        Type|  Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
+        // ##########################|        |        |            |        |       |       |        |            |      |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|               Order|                 |                   |
+        // ##########################|        |        |            |        |       |       |        |            |      |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
+        // MPerBlock=128, NPerBlock=128
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<8, 2>,       S<4, 2>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<4, 4>,       S<4, 2>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<2, 8>,       S<2, 4>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=128, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,   128,    64,    16,  4,          4,          2,      1,       S<4, 4>,       S<4, 4>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 4, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,   128,    64,    16,  4,          4,          2,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 4, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=128
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,    64,   128,    16,  4,          2,          4,      1,       S<4, 4>,       S<4, 4>,      S<2, 1, 4, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,    64,   128,    16,  4,          2,          4,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=32, NPerBlock=32
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, GemmMNPadding,    32,    32,    32,     8,  4,          2,          4,      1,       S<4, 2>,       S<2, 2>,      S<2, 1, 4, 4>,        S<4, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 4, 4>,       S<4, 1, 8, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, GemmMNPadding,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<4, 2>,      S<2, 1, 4, 4>,        S<4, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 4, 4>,       S<4, 1, 8, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, GemmMNPadding,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<2, 4>,      S<2, 1, 4, 4>,        S<4, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 4, 4>,       S<4, 1, 8, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, GemmMNPadding,    16,    16,    16,    16,  4,          2,          2,      1,       S<2, 2>,       S<2, 2>,      S<4, 1, 4, 4>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 4, 4>,       S<4, 1, 4, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, GemmMNPadding,    16,    16,    16,    16,  4,          2,          2,      1,       S<4, 1>,       S<4, 1>,      S<4, 1, 4, 4>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 4, 4>,       S<4, 1, 4, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, GemmMNPadding,    64,     8,    64,    32,  4,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<1, 1, 4, 4>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 4, 4>,      S<4, 1, 16, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, GemmMNPadding,    64,    64,     8,    32,  4,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<1, 1, 4, 4>,      S<32, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, GemmMNPadding,     8,     8,     8,     4,  4,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<1, 1, 4, 4>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<1, 1, 4, 4>,       S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, GemmMNPadding,     8,     8,     8,     4,  4,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<1, 1, 4, 4>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<1, 1, 4, 4>,       S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, GemmMNPadding,     8,     8,     8,     4,  4,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<1, 1, 4, 4>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<1, 1, 4, 4>,       S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, GemmMNPadding,     8,     8,     8,     4,  4,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<1, 1, 4, 4>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<1, 1, 4, 4>,       S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
+    // clang-format on
+    >;
+
+// Registration entry point for the DL int8 kernels declared with GemmMNPadding above.
+// Passes `instances` and a value-initialized instance tuple to
+// add_device_operation_instances(); presumably one object per tuple entry is appended.
+void add_device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_irregular_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Col, Row, Empty_Tuple, Row,
+                                                        int8_t, int8_t, Empty_Tuple, int8_t,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        PassThrough>>>& instances)
+{
+    // Local alias keeps the long tuple name off the call line.
+    using IrregularInstanceTuple =
+        device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_irregular_instances;
+
+    add_device_operation_instances(instances, IrregularInstanceTuple{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_instance.cpp
new file mode 100644
index 000000000..3e4bdb017
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_instance.cpp
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Empty_Tuple = ck::Tuple<>;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+// Compilation parameters for a[k, m] * b[n, k] = c[m, n]
+using device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_instances = std::tuple<
+    // clang-format off
+        // ##########################| ALayout| BLayout|    DsLayout| CLayout|  AData|  BData| AccData|      DsData| CData|           A|           B|           C|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|       BBlockTransfer|     CThreadTransfer|  CThreadTransfer|    CThreadTransfer|
+        // ##########################|        |        |            |        |   Type|   Type|    Type|        Type|  Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
+        // ##########################|        |        |            |        |       |       |        |            |      |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder|  Lengths_K0_N0_N1_K1|               Order|                 |                   |
+        // ##########################|        |        |            |        |       |       |        |            |      |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
+        // MPerBlock=128, NPerBlock=128
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  4,          4,          4,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,   128,    16,  4,          4,          8,      1,       S<8, 2>,       S<4, 2>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,   128,    16,  4,          4,          8,      1,       S<2, 8>,       S<2, 4>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=128, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,    64,    16,  4,          4,          2,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=64, NPerBlock=128
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,    64,   128,    16,  4,          2,          4,      1,       S<4, 4>,       S<4, 4>,      S<2, 1, 4, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=32, NPerBlock=32
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<2, 4>,      S<2, 1, 4, 4>,        S<4, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,      S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=16, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  2,          1,          4,      1,       S<4, 2>,       S<4, 2>,      S<1, 1, 4, 2>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  2,          1,          4,      1,       S<2, 4>,       S<2, 4>,      S<1, 1, 4, 2>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=16
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  2,          4,          1,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  2,          4,          1,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<4, 1>,       S<4, 1>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<4, 1>,       S<8, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<4, 1>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
+    // clang-format on
+    >;
+
+// Registration entry point for the DL int8 kernels declared with GemmDefault above.
+// Passes `instances` and a value-initialized instance tuple to
+// add_device_operation_instances(); presumably one object per tuple entry is appended.
+void add_device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Col, Col, Empty_Tuple, Row,
+                                                        int8_t, int8_t, Empty_Tuple, int8_t,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        PassThrough>>>& instances)
+{
+    // Local alias keeps the long tuple name off the call line.
+    using DefaultInstanceTuple =
+        device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_instances;
+
+    add_device_operation_instances(instances, DefaultInstanceTuple{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_irregular_instance.cpp
new file mode 100644
index 000000000..810771716
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_irregular_instance.cpp
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using Row = ck::tensor_layout::gemm::RowMajor;    // layout tag: row-major
+using Col = ck::tensor_layout::gemm::ColumnMajor; // layout tag: column-major
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>; // shorthand for the Sequence arguments in the instance table
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough; // A/B/C element-wise op
+using Empty_Tuple = ck::Tuple<>; // passed for both DsLayout and DsData in the table below
+// GEMM specialization passed to every instance in the table below.
+static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+
+// Compilation parameters for a[k, m] * b[n, k] = c[m, n]; every instance below uses GemmMNPadding.
+using device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_irregular_instances = std::tuple<
+    // clang-format off
+        // ##########################| ALayout| BLayout|    DsLayout| CLayout|  AData|  BData| AccData|      DsData| CData|           A|           B|           C|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|       BBlockTransfer|     CThreadTransfer|  CThreadTransfer|    CThreadTransfer|
+        // ##########################|        |        |            |        |   Type|   Type|    Type|        Type|  Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
+        // ##########################|        |        |            |        |       |       |        |            |      |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder|  Lengths_K0_N0_N1_K1|               Order|                 |                   |
+        // ##########################|        |        |            |        |       |       |        |            |      |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
+        // MPerBlock=128, NPerBlock=128
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<8, 2>,       S<4, 2>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<4, 4>,       S<4, 2>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<2, 8>,       S<2, 4>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=128, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,    64,    16,  4,          4,          2,      1,       S<4, 4>,       S<4, 4>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,    64,    16,  4,          4,          2,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=128
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,    64,   128,    16,  4,          2,          4,      1,       S<4, 4>,       S<4, 4>,      S<2, 1, 4, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,    64,   128,    16,  4,          2,          4,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=32, NPerBlock=32
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    32,    32,    32,     8,  4,          2,          4,      1,       S<4, 2>,       S<2, 2>,      S<2, 1, 4, 4>,        S<4, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,      S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<4, 2>,      S<2, 1, 4, 4>,        S<4, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,      S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<2, 4>,      S<2, 1, 4, 4>,        S<4, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,      S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    16,    16,    16,    16,  2,          2,          2,      1,       S<2, 2>,       S<2, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    16,    16,    16,    16,  2,          2,          2,      1,       S<4, 1>,       S<4, 1>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
+    // clang-format on
+    >;
+// Hands `instances` plus a default-constructed irregular-instance tuple to add_device_operation_instances (presumably one op appended per tuple entry).
+void add_device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_irregular_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Col,
+                                                        Col,
+                                                        Empty_Tuple,
+                                                        Row,
+                                                        int8_t,
+                                                        int8_t,
+                                                        Empty_Tuple,
+                                                        int8_t,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances, device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_irregular_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_instance.cpp
new file mode 100644
index 000000000..0e943c88c
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_instance.cpp
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using Row = ck::tensor_layout::gemm::RowMajor;    // layout tag: row-major
+using Col = ck::tensor_layout::gemm::ColumnMajor; // layout tag: column-major
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>; // shorthand for the Sequence arguments in the instance table
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough; // A/B/C element-wise op
+using Empty_Tuple = ck::Tuple<>; // passed for both DsLayout and DsData in the table below
+// GEMM specialization passed to the DeviceBatchedGemmMultipleD_Dl instances below.
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+// Compilation parameters for a[m, k] * b[k, n] = c[m, n]
+using device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_instances = std::tuple<
+    // clang-format off
+        // ##########################| ALayout| BLayout|    DsLayout| CLayout|  AData|  BData| AccData|      DsData| CData|           A|           B|           C|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|       BBlockTransfer|     CThreadTransfer|  CThreadTransfer|    CThreadTransfer|
+        // ##########################|        |        |            |        |   Type|   Type|    Type|        Type|  Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
+        // ##########################|        |        |            |        |       |       |        |            |      |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder|  Lengths_K0_N0_N1_K1|               Order|                 |                   |
+        // ##########################|        |        |            |        |       |       |        |            |      |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
+        // MPerBlock=128, NPerBlock=128
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  4,          4,          4,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,   128,    16,  4,          4,          8,      1,       S<8, 2>,       S<4, 2>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,   128,    16,  4,          4,          8,      1,       S<2, 8>,       S<2, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=128, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,    64,    16,  4,          4,          2,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=64, NPerBlock=128
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,    64,   128,    16,  4,          2,          4,      1,       S<4, 4>,       S<4, 4>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=32, NPerBlock=32
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<2, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 8, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=16, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  2,          1,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 1, 2>,      S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,        S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  2,          1,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 1, 2>,      S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,        S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=16
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  2,          4,          1,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  2,          4,          1,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<4, 1>,       S<4, 1>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<4, 1>,       S<8, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,        S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,        S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<4, 1>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
+    // clang-format on
+    >;
+// Hands `instances` and a value-initialized device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_instances tuple to add_device_operation_instances; presumably that helper appends one operation object per tuple element type (confirm in add_device_operation_instance.hpp).
+void add_device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Row,
+                                                        Row,
+                                                        Empty_Tuple,
+                                                        Row,
+                                                        int8_t,
+                                                        int8_t,
+                                                        Empty_Tuple,
+                                                        int8_t,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        PassThrough>>>& instances) // non-const reference: the caller's vector is the output
+{
+    add_device_operation_instances(instances,
+                                   device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_instances{}); // the tuple value only carries the list of instance types
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_irregular_instance.cpp
new file mode 100644
index 000000000..ea5e7c562
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_irregular_instance.cpp
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using Row = ck::tensor_layout::gemm::RowMajor; // layout tag used for A, B and C in every instance of this file
+using Col = ck::tensor_layout::gemm::ColumnMajor; // not referenced anywhere else in this file
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>; // shorthand that keeps the S<...> columns of the instance table narrow
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough; // passed as the A, B and C element-wise operation of every instance below
+using Empty_Tuple = ck::Tuple<>; // passed as both DsLayout and DsData below (presumably meaning "no D tensors" - confirm)
+
+static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding; // GEMM specialization selected by every instance below
+
+// Compilation parameters for a[m, k] * b[k, n] = c[m, n] (A, B and C all Row, int8 in/out, int32 accumulator, GemmMNPadding); one tuple element per DeviceBatchedGemmMultipleD_Dl configuration, grouped by block/tile size. Presumably "irregular" = M/N not multiples of the tile - TODO confirm.
+using device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_irregular_instances = std::tuple<
+    // clang-format off
+        // ##########################| ALayout| BLayout|    DsLayout| CLayout|  AData|  BData| AccData|      DsData| CData|           A|           B|           C|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|       BBlockTransfer|     CThreadTransfer|  CThreadTransfer|    CThreadTransfer|
+        // ##########################|        |        |            |        |   Type|   Type|    Type|        Type|  Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
+        // ##########################|        |        |            |        |       |       |        |            |      |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|               Order|                 |                   |
+        // ##########################|        |        |            |        |       |       |        |            |      |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
+        // BlockSize=128, MPerBlock=128, NPerBlock=128
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<8, 2>,       S<4, 2>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<4, 4>,       S<4, 2>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<2, 8>,       S<2, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // BlockSize=256, MPerBlock=128, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,    64,    16,  4,          4,          2,      1,       S<4, 4>,       S<4, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,    64,    16,  4,          4,          2,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // BlockSize=256, MPerBlock=64, NPerBlock=128
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,    64,   128,    16,  4,          2,          4,      1,       S<4, 4>,       S<4, 4>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,    64,   128,    16,  4,          2,          4,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // BlockSize=64, MPerBlock=64, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // BlockSize=32, MPerBlock=32, NPerBlock=32
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    32,    32,    32,     8,  4,          2,          4,      1,       S<4, 2>,       S<2, 2>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 8, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<4, 2>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 8, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<2, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 8, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // BlockSize=16, MPerBlock=16, NPerBlock=16
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    16,    16,    16,    16,  2,          2,          2,      1,       S<2, 2>,       S<2, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    16,    16,    16,    16,  2,          2,          2,      1,       S<4, 1>,       S<4, 1>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // BlockSize=64, MPerBlock=8, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // BlockSize=64, MPerBlock=64, NPerBlock=8
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // BlockSize=8, MPerBlock=8, NPerBlock=8
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
+    // clang-format on
+    >;
+// Hands `instances` and a value-initialized device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_irregular_instances tuple (the GemmMNPadding list above) to add_device_operation_instances; presumably that helper appends one operation object per tuple element type (confirm in add_device_operation_instance.hpp).
+void add_device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_irregular_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Row,
+                                                        Row,
+                                                        Empty_Tuple,
+                                                        Row,
+                                                        int8_t,
+                                                        int8_t,
+                                                        Empty_Tuple,
+                                                        int8_t,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        PassThrough>>>& instances) // non-const reference: the caller's vector is the output
+{
+    add_device_operation_instances(
+        instances, device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_irregular_instances{}); // the tuple value only carries the list of instance types
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_instance.cpp
new file mode 100644
index 000000000..000a4b013
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_instance.cpp
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Empty_Tuple = ck::Tuple<>;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+// Compilation parameters for a[m, k] * b[n, k] = c[m, n]
+using device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_instances = std::tuple<
+    // clang-format off
+        // ##########################| ALayout| BLayout|    DsLayout| CLayout|  AData|  BData| AccData|      DsData| CData|           A|           B|           C|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|       BBlockTransfer|     CThreadTransfer|  CThreadTransfer|    CThreadTransfer|
+        // ##########################|        |        |            |        |   Type|   Type|    Type|        Type|  Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
+        // ##########################|        |        |            |        |       |       |        |            |      |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder|  Lengths_K0_N0_N1_K1|               Order|                 |                   |
+        // ##########################|        |        |            |        |       |       |        |            |      |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
+        // MPerBlock=128, NPerBlock=128
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  4,          4,          4,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,   128,    16,  4,          4,          8,      1,       S<8, 2>,       S<4, 2>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,   128,    16,  4,          4,          8,      1,       S<2, 8>,       S<2, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=128, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,    64,    16,  4,          4,          2,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=64, NPerBlock=128
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,    64,   128,    16,  4,          2,          4,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=32, NPerBlock=32
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<2, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=16, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  2,          1,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 1, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  2,          1,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 1, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=16
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  2,          4,          1,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  2,          4,          1,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<4, 1>,       S<4, 1>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<4, 1>,       S<8, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<4, 1>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
+    // clang-format on
+    >;
+
+void add_device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Row,
+                                                        Col,
+                                                        Empty_Tuple,
+                                                        Row,
+                                                        int8_t,
+                                                        int8_t,
+                                                        Empty_Tuple,
+                                                        int8_t,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        PassThrough>>>& instances)
+{ // Delegate to add_device_operation_instances (presumably appends one op per tuple entry).
+    add_device_operation_instances(instances,
+                                   device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_irregular_instance.cpp
new file mode 100644
index 000000000..24fb67619
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_irregular_instance.cpp
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Empty_Tuple = ck::Tuple<>;
+
+static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+
+// Compilation parameters for a[m, k] * b[n, k] = c[m, n]
+using device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_irregular_instances = std::tuple<
+    // clang-format off
+        // ##########################| ALayout| BLayout|    DsLayout| CLayout|  AData|  BData| AccData|      DsData| CData|           A|           B|           C|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|       BBlockTransfer|     CThreadTransfer|  CThreadTransfer|    CThreadTransfer|
+        // ##########################|        |        |            |        |   Type|   Type|    Type|        Type|  Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
+        // ##########################|        |        |            |        |       |       |        |            |      |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder|  Lengths_K0_N0_N1_K1|               Order|                 |                   |
+        // ##########################|        |        |            |        |       |       |        |            |      |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
+        // MPerBlock=128, NPerBlock=128
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<8, 2>,       S<4, 2>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<4, 4>,       S<4, 2>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<2, 8>,       S<2, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=128, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,    64,    16,  4,          4,          2,      1,       S<4, 4>,       S<4, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,    64,    16,  4,          4,          2,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=128
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,    64,   128,    16,  4,          2,          4,      1,       S<4, 4>,       S<4, 4>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,    64,   128,    16,  4,          2,          4,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=32, NPerBlock=32
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    32,    32,    32,     8,  4,          2,          4,      1,       S<4, 2>,       S<2, 2>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<4, 2>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<2, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    16,    16,    16,    16,  2,          2,          2,      1,       S<2, 2>,       S<2, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    16,    16,    16,    16,  2,          2,          2,      1,       S<4, 1>,       S<4, 1>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
+    // clang-format on
+    >;
+
+void add_device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_irregular_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Row,
+                                                        Col,
+                                                        Empty_Tuple,
+                                                        Row,
+                                                        int8_t,
+                                                        int8_t,
+                                                        Empty_Tuple,
+                                                        int8_t,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances, device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_irregular_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/profiler/README.md b/profiler/README.md
index 400a64a39..7a1fb2911 100644
--- a/profiler/README.md
+++ b/profiler/README.md
@@ -76,3 +76,30 @@ e_m_n: dim 4, lengths {128, 128, 128, 128}, strides {2097152, 16384, 128, 1}
 ....
 Best Perf: 211.405 ms, 41.6077 TFlops, 15.2372 GB/s
 ```
+
+## Profile batched gemm multiple D kernels
+```bash
+#arg1: tensor operation (batched_gemm_multi_d=Batched GEMM multi D);
+#arg2: data type (0: fp16; 1: int8)
+#arg3: matrix layout (0: A[g, m, k] * B[g, k, n] = C[g, m, n];
+#                     1: A[g, m, k] * B[g, n, k] = C[g, m, n];
+#                     2: A[g, k, m] * B[g, k, n] = C[g, m, n];
+#                     3: A[g, k, m] * B[g, n, k] = C[g, m, n])
+#arg4: verification (0: no; 1: yes)
+#arg5: initialization (0: no init; 1: integer value; 2: decimal value)
+#arg6: print tensor value (0: no; 1: yes)
+#arg7: time kernel (0: no; 1: yes)
+#arg8 to 17: M, N, K, StrideA, StrideB, StrideC, BatchStrideA, BatchStrideB, BatchStrideC, BatchCount
+
+################                   op  datatype  layout  verify  init  log  time    M    N    K StrideA StrideB StrideC BatchStrideA BatchStrideB BatchStrideC BatchCount
+./bin/ckProfiler batched_gemm_multi_d         0       1       0     0    0     1 4096 4096 4096    4096    4096    4096     16777216     16777216     16777216         16
+```
+
+Result (Radeon RX 6800 XT)
+```bash
+arg.a_grid_desc_k0_m0_m1_k1_{2048, 4096, 2}
+arg.b_grid_desc_k0_n0_n1_k1_{2048, 4096, 2}
+arg.e_grid_desc_m_n_{ 4096, 4096}
+....
+Best Perf: 58.0306 ms, 37.8942 TFlops, 27.7545 GB/s
+```
diff --git a/profiler/include/profiler/profile_batched_gemm_impl.hpp b/profiler/include/profiler/profile_batched_gemm_impl.hpp
index cdc94aa9a..936c22f5d 100644
--- a/profiler/include/profiler/profile_batched_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_impl.hpp
@@ -8,9 +8,11 @@
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_batched_gemm.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
 #include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
+#include "ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp"
 
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
@@ -27,7 +29,11 @@ template <typename ADataType,
           typename CDataType,
           typename ALayout,
           typename BLayout,
-          typename CLayout>
+          typename CLayout,
+          typename AElementOp,
+          typename BElementOp,
+          typename CElementOp,
+          typename DeviceOp>
 bool profile_batched_gemm_impl(int do_verification,
                                int init_method,
                                bool do_log,
@@ -88,10 +94,6 @@ bool profile_batched_gemm_impl(int do_verification,
         b_g_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
     }
 
-    using AElementOp = ck::tensor_operation::element_wise::PassThrough;
-    using BElementOp = ck::tensor_operation::element_wise::PassThrough;
-    using CElementOp = ck::tensor_operation::element_wise::PassThrough;
-
     const auto a_element_op = AElementOp{};
     const auto b_element_op = BElementOp{};
     const auto c_element_op = CElementOp{};
@@ -124,16 +126,6 @@ bool profile_batched_gemm_impl(int do_verification,
     b_device_buf.ToDevice(b_g_k_n.mData.data());
     c_device_buf.ToDevice(c_g_m_n_device_result.mData.data());
 
-    using DeviceOp = ck::tensor_operation::device::DeviceBatchedGemm<ALayout,
-                                                                     BLayout,
-                                                                     CLayout,
-                                                                     ADataType,
-                                                                     BDataType,
-                                                                     CDataType,
-                                                                     AElementOp,
-                                                                     BElementOp,
-                                                                     CElementOp>;
-
     // get device op instances
     const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
         DeviceOp>::GetInstances();
@@ -148,23 +140,62 @@ bool profile_batched_gemm_impl(int do_verification,
     // profile device op instances
     for(auto& op_ptr : op_ptrs)
     {
-        auto argument_ptr =
-            op_ptr->MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
-                                        static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
-                                        static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
-                                        M,
-                                        N,
-                                        K,
-                                        StrideA,
-                                        StrideB,
-                                        StrideC,
-                                        BatchStrideA,
-                                        BatchStrideB,
-                                        BatchStrideC,
-                                        BatchCount,
-                                        ck::tensor_operation::element_wise::PassThrough{},
-                                        ck::tensor_operation::element_wise::PassThrough{},
-                                        ck::tensor_operation::element_wise::PassThrough{});
+        std::unique_ptr<tensor_operation::device::BaseArgument> argument_ptr;
+        // DeviceBatchedGemm takes this branch; the else branch is for the multi-D DL kernel
+        if constexpr(std::is_same<
+                         DeviceOp,
+                         ck::tensor_operation::device::DeviceBatchedGemm<ALayout,
+                                                                         BLayout,
+                                                                         CLayout,
+                                                                         ADataType,
+                                                                         BDataType,
+                                                                         CDataType,
+                                                                         AElementOp,
+                                                                         BElementOp,
+                                                                         CElementOp>>::value)
+        {
+
+            argument_ptr =
+                op_ptr->MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
+                                            static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
+                                            static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
+                                            M,
+                                            N,
+                                            K,
+                                            StrideA,
+                                            StrideB,
+                                            StrideC,
+                                            BatchStrideA,
+                                            BatchStrideB,
+                                            BatchStrideC,
+                                            BatchCount,
+                                            ck::tensor_operation::element_wise::PassThrough{},
+                                            ck::tensor_operation::element_wise::PassThrough{},
+                                            ck::tensor_operation::element_wise::PassThrough{});
+        }
+        else
+        {
+            argument_ptr =
+                op_ptr->MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
+                                            static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
+                                            {},
+                                            static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
+                                            M,
+                                            N,
+                                            K,
+                                            BatchCount,
+                                            StrideA,
+                                            StrideB,
+                                            {},
+                                            StrideC,
+                                            BatchStrideA,
+                                            BatchStrideB,
+                                            {},
+                                            BatchStrideC,
+                                            ck::tensor_operation::element_wise::PassThrough{},
+                                            ck::tensor_operation::element_wise::PassThrough{},
+                                            ck::tensor_operation::element_wise::PassThrough{});
+        }
 
         auto invoker_ptr = op_ptr->MakeInvokerPointer();
 
diff --git a/profiler/src/CMakeLists.txt b/profiler/src/CMakeLists.txt
index c9fccc258..6f768e0ae 100644
--- a/profiler/src/CMakeLists.txt
+++ b/profiler/src/CMakeLists.txt
@@ -34,6 +34,7 @@ set(PROFILER_SOURCES
     profile_grouped_gemm_fastgelu.cpp
     profile_contraction_bilinear.cpp
     profile_contraction_scale.cpp
+    profile_batched_gemm_multi_d.cpp
 )
 
 set(PROFILER_EXECUTABLE ckProfiler)
@@ -77,5 +78,5 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fastgel
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_scale_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool_fwd_instance)
-
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_multi_d_instance)
 rocm_install(TARGETS ${PROFILER_EXECUTABLE} COMPONENT profiler)
diff --git a/profiler/src/profile_batched_gemm.cpp b/profiler/src/profile_batched_gemm.cpp
index dc83e25b4..222532b7b 100644
--- a/profiler/src/profile_batched_gemm.cpp
+++ b/profiler/src/profile_batched_gemm.cpp
@@ -10,6 +10,8 @@
 #include "profiler/profile_batched_gemm_impl.hpp"
 #include "profiler_operation_registry.hpp"
 
+#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
+
 enum struct GemmMatrixLayout
 {
     MK_KN_MN, // 0
@@ -78,55 +80,72 @@ int profile_batched_gemm(int argc, char* argv[])
     using Row = ck::tensor_layout::gemm::RowMajor;
     using Col = ck::tensor_layout::gemm::ColumnMajor;
 
-    auto profile = [&](auto a_type,
-                       auto b_type,
-                       auto c_type,
-                       auto a_layout,
-                       auto b_layout,
-                       auto c_layout) {
-        using ADataType = decltype(a_type);
-        using BDataType = decltype(b_type);
-        using CDataType = decltype(c_type);
-
-        using ALayout = decltype(a_layout);
-        using BLayout = decltype(b_layout);
-        using CLayout = decltype(c_layout);
-
-        const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M;
-        const int DefaultStrideB = ck::is_same_v<BLayout, Row> ? N : K;
-        const int DefaultStrideC = ck::is_same_v<CLayout, Row> ? N : M;
-
-        const int StrideA_ = (StrideA < 0) ? DefaultStrideA : StrideA;
-        const int StrideB_ = (StrideB < 0) ? DefaultStrideB : StrideB;
-        const int StrideC_ = (StrideC < 0) ? DefaultStrideC : StrideC;
-
-        const int DefaultBatchStrideA = (ck::is_same_v<ALayout, Row> ? M : K) * StrideA_;
-        const int DefaultBatchStrideB = (ck::is_same_v<BLayout, Row> ? K : N) * StrideB_;
-        const int DefaultBatchStrideC = (ck::is_same_v<CLayout, Row> ? M : N) * StrideC_;
-
-        const int BatchStrideA_ = (BatchStrideA < 0) ? DefaultBatchStrideA : BatchStrideA;
-        const int BatchStrideB_ = (BatchStrideB < 0) ? DefaultBatchStrideB : BatchStrideB;
-        const int BatchStrideC_ = (BatchStrideC < 0) ? DefaultBatchStrideC : BatchStrideC;
-
-        bool pass = ck::profiler::
-            profile_batched_gemm_impl<ADataType, BDataType, CDataType, ALayout, BLayout, CLayout>(
-                do_verification,
-                init_method,
-                do_log,
-                time_kernel,
-                M,
-                N,
-                K,
-                BatchStrideA_,
-                BatchStrideB_,
-                BatchStrideC_,
-                StrideA_,
-                StrideB_,
-                StrideC_,
-                BatchCount);
-
-        return pass ? 0 : 1;
-    };
+    auto profile =
+        [&](auto a_type, auto b_type, auto c_type, auto a_layout, auto b_layout, auto c_layout) {
+            using ADataType = decltype(a_type);
+            using BDataType = decltype(b_type);
+            using CDataType = decltype(c_type);
+
+            using ALayout = decltype(a_layout);
+            using BLayout = decltype(b_layout);
+            using CLayout = decltype(c_layout);
+
+            const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M;
+            const int DefaultStrideB = ck::is_same_v<BLayout, Row> ? N : K;
+            const int DefaultStrideC = ck::is_same_v<CLayout, Row> ? N : M;
+
+            const int StrideA_ = (StrideA < 0) ? DefaultStrideA : StrideA;
+            const int StrideB_ = (StrideB < 0) ? DefaultStrideB : StrideB;
+            const int StrideC_ = (StrideC < 0) ? DefaultStrideC : StrideC;
+
+            const int DefaultBatchStrideA = (ck::is_same_v<ALayout, Row> ? M : K) * StrideA_;
+            const int DefaultBatchStrideB = (ck::is_same_v<BLayout, Row> ? K : N) * StrideB_;
+            const int DefaultBatchStrideC = (ck::is_same_v<CLayout, Row> ? M : N) * StrideC_;
+
+            const int BatchStrideA_ = (BatchStrideA < 0) ? DefaultBatchStrideA : BatchStrideA;
+            const int BatchStrideB_ = (BatchStrideB < 0) ? DefaultBatchStrideB : BatchStrideB;
+            const int BatchStrideC_ = (BatchStrideC < 0) ? DefaultBatchStrideC : BatchStrideC;
+
+            using AElementOp = ck::tensor_operation::element_wise::PassThrough;
+            using BElementOp = ck::tensor_operation::element_wise::PassThrough;
+            using CElementOp = ck::tensor_operation::element_wise::PassThrough;
+
+            using DeviceOp = ck::tensor_operation::device::DeviceBatchedGemm<ALayout,
+                                                                             BLayout,
+                                                                             CLayout,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             AElementOp,
+                                                                             BElementOp,
+                                                                             CElementOp>;
+
+            bool pass = ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                                BDataType,
+                                                                CDataType,
+                                                                ALayout,
+                                                                BLayout,
+                                                                CLayout,
+                                                                AElementOp,
+                                                                BElementOp,
+                                                                CElementOp,
+                                                                DeviceOp>(do_verification,
+                                                                          init_method,
+                                                                          do_log,
+                                                                          time_kernel,
+                                                                          M,
+                                                                          N,
+                                                                          K,
+                                                                          BatchStrideA_,
+                                                                          BatchStrideB_,
+                                                                          BatchStrideC_,
+                                                                          StrideA_,
+                                                                          StrideB_,
+                                                                          StrideC_,
+                                                                          BatchCount);
+
+            return pass ? 0 : 1;
+        };
 
     if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN)
     {
diff --git a/profiler/src/profile_batched_gemm_multi_d.cpp b/profiler/src/profile_batched_gemm_multi_d.cpp
new file mode 100644
index 000000000..98b462d95
--- /dev/null
+++ b/profiler/src/profile_batched_gemm_multi_d.cpp
@@ -0,0 +1,214 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdint>
+#include <cstdio>
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <string>
+
+#include "profiler/profile_batched_gemm_impl.hpp"
+#include "profiler_operation_registry.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp"
+
+// Layouts of A, B and C selectable through command-line arg3; the trailing number on each
+// enumerator is the integer accepted on the command line.
+enum struct GemmMatrixLayout
+{
+    MK_KN_MN, // 0
+    MK_NK_MN, // 1
+    KM_KN_MN, // 2
+    KM_NK_MN, // 3
+};
+
+// Element types of A, B and C selectable through command-line arg2.
+enum struct GemmDataType
+{
+    F16_F16_F16,    // 0
+    INT8_INT8_INT8, // 1
+};
+
+#define OP_NAME "batched_gemm_multi_d"
+#define OP_DESC "Batched GEMM multi D"
+
+// Command-line entry point that profiles batched GEMM through the DeviceBatchedGemmMultiD
+// interface with an empty D-tensor tuple.
+//
+// Requires argc == 18 (see the usage text below); otherwise prints the usage text and exits
+// with status 1. A negative stride or batch stride requests a default derived from M, N, K and
+// the selected layout.
+//
+// Returns 0 if profile_batched_gemm_impl reports success, and 1 if it reports failure or the
+// requested data type / layout combination is not handled.
+int profile_batched_gemm_multi_d(int argc, char* argv[])
+{
+    if(argc != 18)
+    {
+        // clang-format off
+        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
+        printf("arg2: data type (0: fp16; 1: int8)\n");
+        printf("arg3: matrix layout (0: A[g, m, k] * B[g, k, n] = C[g, m, n];\n");
+        printf("                     1: A[g, m, k] * B[g, n, k] = C[g, m, n];\n");
+        printf("                     2: A[g, k, m] * B[g, k, n] = C[g, m, n];\n");
+        printf("                     3: A[g, k, m] * B[g, n, k] = C[g, m, n])\n");
+        printf("arg4: verification (0: no; 1: yes)\n");
+        printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
+        printf("arg6: print tensor value (0: no; 1: yes)\n");
+        printf("arg7: time kernel (0: no; 1: yes)\n");
+        printf("arg8 to 17: M, N, K, StrideA, StrideB, StrideC, BatchStrideA, BatchStrideB, BatchStrideC, BatchCount\n");
+        // clang-format on
+        exit(1);
+    }
+
+    const auto data_type       = static_cast<GemmDataType>(std::stoi(argv[2]));
+    const auto layout          = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
+    const bool do_verification = std::stoi(argv[4]);
+    const int init_method      = std::stoi(argv[5]);
+    const bool do_log          = std::stoi(argv[6]);
+    const bool time_kernel     = std::stoi(argv[7]);
+
+    const int M = std::stoi(argv[8]);
+    const int N = std::stoi(argv[9]);
+    const int K = std::stoi(argv[10]);
+
+    const int StrideA = std::stoi(argv[11]);
+    const int StrideB = std::stoi(argv[12]);
+    const int StrideC = std::stoi(argv[13]);
+
+    const int BatchStrideA = std::stoi(argv[14]);
+    const int BatchStrideB = std::stoi(argv[15]);
+    const int BatchStrideC = std::stoi(argv[16]);
+
+    const int BatchCount = std::stoi(argv[17]);
+
+    using F16  = ck::half_t;
+    using INT8 = int8_t;
+
+    using Row = ck::tensor_layout::gemm::RowMajor;
+    using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+    // The arguments are tag objects: only their types are used, to pick the template arguments
+    // of profile_batched_gemm_impl. Returns the process exit code (0 on pass, 1 on failure).
+    auto profile =
+        [&](auto a_type, auto b_type, auto c_type, auto a_layout, auto b_layout, auto c_layout) {
+            using ADataType  = decltype(a_type);
+            using BDataType  = decltype(b_type);
+            using CDataType  = decltype(c_type);
+            using DsDataType = ck::Tuple<>;
+
+            using ALayout  = decltype(a_layout);
+            using BLayout  = decltype(b_layout);
+            using CLayout  = decltype(c_layout);
+            using DsLayout = ck::Tuple<>;
+
+            // Default stride: column count for a row-major matrix, row count for a
+            // column-major one (A is M x K, B is K x N, C is M x N).
+            const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M;
+            const int DefaultStrideB = ck::is_same_v<BLayout, Row> ? N : K;
+            const int DefaultStrideC = ck::is_same_v<CLayout, Row> ? N : M;
+
+            // A negative command-line stride selects the default.
+            const int StrideA_ = (StrideA < 0) ? DefaultStrideA : StrideA;
+            const int StrideB_ = (StrideB < 0) ? DefaultStrideB : StrideB;
+            const int StrideC_ = (StrideC < 0) ? DefaultStrideC : StrideC;
+
+            // Default batch stride: the stride times the number of rows (row-major) or
+            // columns (column-major) of one matrix.
+            const int DefaultBatchStrideA = (ck::is_same_v<ALayout, Row> ? M : K) * StrideA_;
+            const int DefaultBatchStrideB = (ck::is_same_v<BLayout, Row> ? K : N) * StrideB_;
+            const int DefaultBatchStrideC = (ck::is_same_v<CLayout, Row> ? M : N) * StrideC_;
+
+            const int BatchStrideA_ = (BatchStrideA < 0) ? DefaultBatchStrideA : BatchStrideA;
+            const int BatchStrideB_ = (BatchStrideB < 0) ? DefaultBatchStrideB : BatchStrideB;
+            const int BatchStrideC_ = (BatchStrideC < 0) ? DefaultBatchStrideC : BatchStrideC;
+
+            using AElementOp = ck::tensor_operation::element_wise::PassThrough;
+            using BElementOp = ck::tensor_operation::element_wise::PassThrough;
+            using CElementOp = ck::tensor_operation::element_wise::PassThrough;
+
+            using DeviceOp = ck::tensor_operation::device::DeviceBatchedGemmMultiD<ALayout,
+                                                                                   BLayout,
+                                                                                   DsLayout,
+                                                                                   CLayout,
+                                                                                   ADataType,
+                                                                                   BDataType,
+                                                                                   DsDataType,
+                                                                                   CDataType,
+                                                                                   AElementOp,
+                                                                                   BElementOp,
+                                                                                   CElementOp>;
+
+            // NOTE(review): batch strides are passed ahead of the per-matrix strides, matching
+            // the call in profile_batched_gemm.cpp; confirm against the parameter order of
+            // profile_batched_gemm_impl.
+            bool pass = ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                                BDataType,
+                                                                CDataType,
+                                                                ALayout,
+                                                                BLayout,
+                                                                CLayout,
+                                                                AElementOp,
+                                                                BElementOp,
+                                                                CElementOp,
+                                                                DeviceOp>(do_verification,
+                                                                          init_method,
+                                                                          do_log,
+                                                                          time_kernel,
+                                                                          M,
+                                                                          N,
+                                                                          K,
+                                                                          BatchStrideA_,
+                                                                          BatchStrideB_,
+                                                                          BatchStrideC_,
+                                                                          StrideA_,
+                                                                          StrideB_,
+                                                                          StrideC_,
+                                                                          BatchCount);
+
+            return pass ? 0 : 1;
+        };
+
+    if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
+    {
+        return profile(F16{}, F16{}, F16{}, Row{}, Row{}, Row{});
+    }
+    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
+    {
+        return profile(F16{}, F16{}, F16{}, Row{}, Col{}, Row{});
+    }
+    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN)
+    {
+        return profile(F16{}, F16{}, F16{}, Col{}, Row{}, Row{});
+    }
+    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN)
+    {
+        return profile(F16{}, F16{}, F16{}, Col{}, Col{}, Row{});
+    }
+    else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::MK_KN_MN)
+    {
+        return profile(INT8{}, INT8{}, INT8{}, Row{}, Row{}, Row{});
+    }
+    else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::MK_NK_MN)
+    {
+        return profile(INT8{}, INT8{}, INT8{}, Row{}, Col{}, Row{});
+    }
+    else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::KM_KN_MN)
+    {
+        return profile(INT8{}, INT8{}, INT8{}, Col{}, Row{}, Row{});
+    }
+    else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::KM_NK_MN)
+    {
+        return profile(INT8{}, INT8{}, INT8{}, Col{}, Col{}, Row{});
+    }
+    else
+    {
+        std::cout << "this data_type & layout is not implemented" << std::endl;
+
+        return 1;
+    }
+}
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_batched_gemm_multi_d);
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index dad9b53ce..e3385b9dd 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -58,6 +58,7 @@ add_subdirectory(elementwise_normalization)
 add_subdirectory(batchnorm)
 add_subdirectory(contraction)
 add_subdirectory(pool_fwd)
+add_subdirectory(batched_gemm_multi_d)
 if(GPU_TARGETS MATCHES "gfx1100")
     add_subdirectory(wmma_op)
 endif()
diff --git a/test/batched_gemm/batched_gemm_bf16.cpp b/test/batched_gemm/batched_gemm_bf16.cpp
index fa1652e99..5d12a1e95 100644
--- a/test/batched_gemm/batched_gemm_bf16.cpp
+++ b/test/batched_gemm/batched_gemm_bf16.cpp
@@ -5,6 +5,8 @@
 
 #include "profiler/profile_batched_gemm_impl.hpp"
 
+#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
+
 namespace {
 using ADataType = ck::bhalf_t;
 using BDataType = ck::bhalf_t;
@@ -12,6 +14,8 @@ using CDataType = ck::bhalf_t;
 
 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 } // namespace
 
 int main()
@@ -23,21 +27,87 @@ int main()
 
     bool pass = true;
 
-    pass = pass &&
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Row, Row>(
-               true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
+    using namespace ck::tensor_operation::device;
+
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                           BDataType,
+                                                           CDataType,
+                                                           Row,
+                                                           Row,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Row,
+                                                                             Row,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
 
-    pass = pass &&
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Col, Row>(
-               true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                           BDataType,
+                                                           CDataType,
+                                                           Row,
+                                                           Col,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Row,
+                                                                             Col,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
 
-    pass = pass &&
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Row, Row>(
-               true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                           BDataType,
+                                                           CDataType,
+                                                           Col,
+                                                           Row,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Col,
+                                                                             Row,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
 
-    pass = pass &&
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Col, Row>(
-               true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                           BDataType,
+                                                           CDataType,
+                                                           Col,
+                                                           Col,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Col,
+                                                                             Col,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
 
     std::cout << "test BatchedGEMM bf16: " << (pass ? "Pass" : "Fail") << std::endl;
     return pass ? 0 : 1;
diff --git a/test/batched_gemm/batched_gemm_fp16.cpp b/test/batched_gemm/batched_gemm_fp16.cpp
index 3df4912aa..a2b61d951 100644
--- a/test/batched_gemm/batched_gemm_fp16.cpp
+++ b/test/batched_gemm/batched_gemm_fp16.cpp
@@ -5,6 +5,8 @@
 
 #include "profiler/profile_batched_gemm_impl.hpp"
 
+#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
+
 namespace {
 using ADataType = ck::half_t;
 using BDataType = ck::half_t;
@@ -12,6 +14,8 @@ using CDataType = ck::half_t;
 
 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 } // namespace
 
 int main()
@@ -23,21 +27,87 @@ int main()
 
     bool pass = true;
 
-    pass = pass &&
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Row, Row>(
-               true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
+    using namespace ck::tensor_operation::device;
+
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                           BDataType,
+                                                           CDataType,
+                                                           Row,
+                                                           Row,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Row,
+                                                                             Row,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
 
-    pass = pass &&
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Col, Row>(
-               true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                           BDataType,
+                                                           CDataType,
+                                                           Row,
+                                                           Col,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Row,
+                                                                             Col,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
 
-    pass = pass &&
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Row, Row>(
-               true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                           BDataType,
+                                                           CDataType,
+                                                           Col,
+                                                           Row,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Col,
+                                                                             Row,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
 
-    pass = pass &&
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Col, Row>(
-               true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                           BDataType,
+                                                           CDataType,
+                                                           Col,
+                                                           Col,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Col,
+                                                                             Col,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
 
     std::cout << "test BatchedGEMM fp16: " << (pass ? "Pass" : "Fail") << std::endl;
     return pass ? 0 : 1;
diff --git a/test/batched_gemm/batched_gemm_fp32.cpp b/test/batched_gemm/batched_gemm_fp32.cpp
index 2d808441b..2b18d166e 100644
--- a/test/batched_gemm/batched_gemm_fp32.cpp
+++ b/test/batched_gemm/batched_gemm_fp32.cpp
@@ -5,6 +5,8 @@
 
 #include "profiler/profile_batched_gemm_impl.hpp"
 
+#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
+
 namespace {
 using ADataType = float;
 using BDataType = float;
@@ -12,6 +14,8 @@ using CDataType = float;
 
 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 } // namespace
 
 int main()
@@ -23,21 +27,87 @@ int main()
 
     bool pass = true;
 
-    pass = pass &&
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Row, Row>(
-               true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
+    using namespace ck::tensor_operation::device;
+
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                           BDataType,
+                                                           CDataType,
+                                                           Row,
+                                                           Row,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Row,
+                                                                             Row,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
 
-    pass = pass &&
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Col, Row>(
-               true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                           BDataType,
+                                                           CDataType,
+                                                           Row,
+                                                           Col,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Row,
+                                                                             Col,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
 
-    pass = pass &&
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Row, Row>(
-               true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                           BDataType,
+                                                           CDataType,
+                                                           Col,
+                                                           Row,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Col,
+                                                                             Row,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
 
-    pass = pass &&
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Col, Row>(
-               true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                           BDataType,
+                                                           CDataType,
+                                                           Col,
+                                                           Col,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Col,
+                                                                             Col,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
 
     std::cout << "test BatchedGEMM fp32: " << (pass ? "Pass" : "Fail") << std::endl;
     return pass ? 0 : 1;
diff --git a/test/batched_gemm/batched_gemm_int8.cpp b/test/batched_gemm/batched_gemm_int8.cpp
index ed233a5ad..f607eaa84 100644
--- a/test/batched_gemm/batched_gemm_int8.cpp
+++ b/test/batched_gemm/batched_gemm_int8.cpp
@@ -5,6 +5,8 @@
 
 #include "profiler/profile_batched_gemm_impl.hpp"
 
+#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
+
 namespace {
 using ADataType = int8_t;
 using BDataType = int8_t;
@@ -12,6 +14,8 @@ using CDataType = int8_t;
 
 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 } // namespace
 
 int main()
@@ -23,21 +27,87 @@ int main()
 
     bool pass = true;
 
-    pass = pass &&
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Row, Row>(
-               true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
+    using namespace ck::tensor_operation::device;
+
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                           BDataType,
+                                                           CDataType,
+                                                           Row,
+                                                           Row,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Row,
+                                                                             Row,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
 
-    pass = pass &&
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Col, Row>(
-               true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                           BDataType,
+                                                           CDataType,
+                                                           Row,
+                                                           Col,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Row,
+                                                                             Col,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
 
-    pass = pass &&
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Row, Row>(
-               true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                           BDataType,
+                                                           CDataType,
+                                                           Col,
+                                                           Row,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Col,
+                                                                             Row,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
 
-    pass = pass &&
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Col, Row>(
-               true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                           BDataType,
+                                                           CDataType,
+                                                           Col,
+                                                           Col,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Col,
+                                                                             Col,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
 
     std::cout << "test BatchedGEMM int8: " << (pass ? "Pass" : "Fail") << std::endl;
     return pass ? 0 : 1;
diff --git a/test/batched_gemm_multi_d/CMakeLists.txt b/test/batched_gemm_multi_d/CMakeLists.txt
new file mode 100644
index 000000000..45a306551
--- /dev/null
+++ b/test/batched_gemm_multi_d/CMakeLists.txt
@@ -0,0 +1,5 @@
+# TODO: Enable for gfx90a after compiler fix
+if(NOT GPU_TARGETS MATCHES "gfx90a")
+    add_gtest_executable(test_batched_gemm_multi_d test_batched_gemm_multi_d.cpp)
+    target_link_libraries(test_batched_gemm_multi_d PRIVATE utility device_batched_gemm_multi_d_instance)
+endif()
diff --git a/test/batched_gemm_multi_d/test_batched_gemm_multi_d.cpp b/test/batched_gemm_multi_d/test_batched_gemm_multi_d.cpp
new file mode 100644
index 000000000..4a8265403
--- /dev/null
+++ b/test/batched_gemm_multi_d/test_batched_gemm_multi_d.cpp
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <gtest/gtest.h>
+
+#include "profiler/profile_batched_gemm_impl.hpp"
+#include "ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp"
+
+namespace {
+using F16 = ck::half_t;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using Empty_Tuple = ck::Tuple<>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+template <typename Tuple> // Tuple = std::tuple<ALayout, BLayout, CLayout>
+class TestBatchedGemmMultiD : public ::testing::Test
+{
+    protected:
+    using ALayout = std::tuple_element_t<0, Tuple>;
+    using BLayout = std::tuple_element_t<1, Tuple>;
+    using CLayout = std::tuple_element_t<2, Tuple>;
+    static constexpr int M          = 512;
+    static constexpr int N          = 256;
+    static constexpr int K          = 128;
+    static constexpr int BatchCount = 3;
+    static constexpr int StrideA = std::is_same_v<ALayout, Row> ? K : M; // A is M x K
+    static constexpr int StrideB = std::is_same_v<BLayout, Row> ? N : K; // B is K x N
+    template <typename DataType>
+    void Run()
+    {
+        using namespace ck::tensor_operation::device;
+        // Strides follow each operand's layout; C stays N (row-major in every KernelTypes entry).
+        const bool pass =
+            ck::profiler::profile_batched_gemm_impl<DataType,
+                                                    DataType,
+                                                    DataType,
+                                                    ALayout,
+                                                    BLayout,
+                                                    CLayout,
+                                                    PassThrough,
+                                                    PassThrough,
+                                                    PassThrough,
+                                                    DeviceBatchedGemmMultiD<ALayout,
+                                                                            BLayout,
+                                                                            Empty_Tuple,
+                                                                            CLayout,
+                                                                            DataType,
+                                                                            DataType,
+                                                                            Empty_Tuple,
+                                                                            DataType,
+                                                                            PassThrough,
+                                                                            PassThrough,
+                                                                            PassThrough>>(
+                true, 1, false, 1, M, N, K, StrideA, StrideB, N, M * K, K * N, M * N, BatchCount);
+        EXPECT_TRUE(pass);
+    }
+};
+
+using KernelTypes = ::testing::Types<std::tuple<Row, Row, Row>,
+                                     std::tuple<Row, Col, Row>,
+                                     std::tuple<Col, Row, Row>,
+                                     std::tuple<Col, Col, Row>>;
+} // namespace
+
+TYPED_TEST_SUITE(TestBatchedGemmMultiD, KernelTypes);
+
+TYPED_TEST(TestBatchedGemmMultiD, f16) { this->template Run<F16>(); }
+
+TYPED_TEST(TestBatchedGemmMultiD, int8) { this->template Run<int8_t>(); }
-- 
GitLab


From a35456a3f4ab8565c6a263e9383b1da84c33a3ed Mon Sep 17 00:00:00 2001
From: Rostyslav Geyyer <46627076+geyyer@users.noreply.github.com>
Date: Mon, 12 Jun 2023 08:38:46 -0500
Subject: [PATCH 57/71] Fix arg order (#751)

---
 client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu.cpp | 2 +-
 client_example/02_gemm_add_add_fastgelu/gemm_fastgelu.cpp     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu.cpp b/client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu.cpp
index 5965e9d1d..8d2a8c234 100644
--- a/client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu.cpp
+++ b/client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu.cpp
@@ -76,7 +76,7 @@ int main(int argc, char* argv[])
         StrideA  = std::stoi(argv[4]);
         StrideB  = std::stoi(argv[5]);
         StrideD0 = std::stoi(argv[6]);
-        StrideE  = std::stoi(argv[8]);
+        StrideE  = std::stoi(argv[7]);
     }
     else
     {
diff --git a/client_example/02_gemm_add_add_fastgelu/gemm_fastgelu.cpp b/client_example/02_gemm_add_add_fastgelu/gemm_fastgelu.cpp
index 319fdb0b0..c02df018f 100644
--- a/client_example/02_gemm_add_add_fastgelu/gemm_fastgelu.cpp
+++ b/client_example/02_gemm_add_add_fastgelu/gemm_fastgelu.cpp
@@ -72,7 +72,7 @@ int main(int argc, char* argv[])
 
         StrideA = std::stoi(argv[4]);
         StrideB = std::stoi(argv[5]);
-        StrideE = std::stoi(argv[8]);
+        StrideE = std::stoi(argv[6]);
     }
     else
     {
-- 
GitLab


From 54b68eb343e9b29af024f4a5c4ac57e4e225c8fc Mon Sep 17 00:00:00 2001
From: Rostyslav Geyyer <46627076+geyyer@users.noreply.github.com>
Date: Wed, 14 Jun 2023 16:06:56 -0500
Subject: [PATCH 58/71] Add generic kernel instances for
 ck::tensor_operation::device::DeviceGemmMultipleD (#741)

* Add generic instance gemm_add_add_fastgelu

* Add a client example for generic gemm_add_add_fastgelu

* Update CMakeLists

* Format

* Format

* Add generic instance gemm_add_fastgelu

* Format

* Add a gemm_add_fastgelu client example

* Format

* Add generic instance gemm_fastgelu

* Format

* Fix argument order

* Add gemm_fastgelu client example

* Add exceptions if argument is not supported
---
 .../02_gemm_add_add_fastgelu/CMakeLists.txt   |  14 ++
 .../gemm_add_add_fastgelu_generic.cpp         | 176 ++++++++++++++++++
 .../gemm_add_fastgelu_generic.cpp             | 169 +++++++++++++++++
 .../gemm_fastgelu_generic.cpp                 | 162 ++++++++++++++++
 ...16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp |  14 ++
 ...16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp |  14 ++
 ...16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp |  14 ++
 ...16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp |  14 ++
 ...e_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp |  14 ++
 ...e_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp |  14 ++
 ...e_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp |  14 ++
 ...e_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp |  14 ++
 ..._shuffle_f16_f16_f16_km_kn_mn_instance.cpp |  12 ++
 ..._shuffle_f16_f16_f16_km_nk_mn_instance.cpp |  12 ++
 ..._shuffle_f16_f16_f16_mk_kn_mn_instance.cpp |  12 ++
 ..._shuffle_f16_f16_f16_mk_nk_mn_instance.cpp |  12 ++
 16 files changed, 681 insertions(+)
 create mode 100644 client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu_generic.cpp
 create mode 100644 client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu_generic.cpp
 create mode 100644 client_example/02_gemm_add_add_fastgelu/gemm_fastgelu_generic.cpp

diff --git a/client_example/02_gemm_add_add_fastgelu/CMakeLists.txt b/client_example/02_gemm_add_add_fastgelu/CMakeLists.txt
index b7b724ccc..ba2952022 100644
--- a/client_example/02_gemm_add_add_fastgelu/CMakeLists.txt
+++ b/client_example/02_gemm_add_add_fastgelu/CMakeLists.txt
@@ -11,3 +11,17 @@ target_link_libraries(client_gemm_fastgelu PRIVATE composable_kernel::device_ope
 
 add_dependencies(client_gemm_fastgelu_examples client_gemm_add_add_fastgelu client_gemm_add_fastgelu
                  client_gemm_fastgelu)
+
+add_custom_target(client_gemm_fastgelu_generic_examples)
+
+add_executable(client_gemm_add_add_fastgelu_generic gemm_add_add_fastgelu_generic.cpp)
+target_link_libraries(client_gemm_add_add_fastgelu_generic PRIVATE composable_kernel::device_operations)
+
+add_executable(client_gemm_add_fastgelu_generic gemm_add_fastgelu_generic.cpp)
+target_link_libraries(client_gemm_add_fastgelu_generic PRIVATE composable_kernel::device_operations)
+
+add_executable(client_gemm_fastgelu_generic gemm_fastgelu_generic.cpp)
+target_link_libraries(client_gemm_fastgelu_generic PRIVATE composable_kernel::device_operations)
+
+add_dependencies(client_gemm_fastgelu_generic_examples client_gemm_add_add_fastgelu_generic 
+                 client_gemm_add_fastgelu_generic client_gemm_fastgelu_generic)
diff --git a/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu_generic.cpp b/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu_generic.cpp
new file mode 100644
index 000000000..2ed942f0a
--- /dev/null
+++ b/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu_generic.cpp
@@ -0,0 +1,176 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iomanip>
+#include <vector>
+#include <iostream>
+#include <stdexcept>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp"
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough    = ck::tensor_operation::element_wise::PassThrough;
+using AddAddFastGelu = ck::tensor_operation::element_wise::AddAddFastGelu;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = AddAddFastGelu;
+
+using ADataType  = F16;
+using BDataType  = F16;
+using D0DataType = F16;
+using D1DataType = F16;
+using EDataType  = F16;
+
+using ALayout  = Row;
+using BLayout  = Col;
+using D0Layout = Row;
+using D1Layout = Row;
+using ELayout  = Row;
+
+struct SimpleDeviceMem // owns one hipMalloc'ed buffer and releases it with hipFree on destruction
+{
+    SimpleDeviceMem() = delete;
+    SimpleDeviceMem(const SimpleDeviceMem&) = delete; // a copy would hipFree the same pointer twice
+    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;
+    // Allocates mem_size bytes of device memory; throws instead of silently keeping a null pointer.
+    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        if(hipMalloc(static_cast<void**>(&p_mem_), mem_size) != hipSuccess)
+            throw std::runtime_error("hipMalloc failed");
+    }
+    void* GetDeviceBuffer() { return p_mem_; }
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+    void* p_mem_; // pointer filled in by hipMalloc; value-initialized to nullptr beforehand
+};
+
+int main(int argc, char* argv[])
+{
+    // Default GEMM shape; replaced below when all 8 positional arguments are supplied.
+    ck::index_t M = 3840;
+    ck::index_t N = 4096;
+    ck::index_t K = 4096;
+    // Strides; with StrideD0 = 0 the size formula below allocates only N elements for D0.
+    ck::index_t StrideA  = 4096;
+    ck::index_t StrideB  = 4096;
+    ck::index_t StrideD0 = 0;
+    ck::index_t StrideD1 = 4096;
+    ck::index_t StrideE  = 4096;
+
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 9)
+    {
+        M = std::stoi(argv[1]);
+        N = std::stoi(argv[2]);
+        K = std::stoi(argv[3]);
+
+        StrideA  = std::stoi(argv[4]);
+        StrideB  = std::stoi(argv[5]);
+        StrideD0 = std::stoi(argv[6]);
+        StrideD1 = std::stoi(argv[7]);
+        StrideE  = std::stoi(argv[8]);
+    }
+    else
+    {
+        printf("arg1 to 8: M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE\n");
+        return 1; // a wrong argument count is a usage error, so report failure
+    }
+    // Elements spanned by a strided matrix: (outer extent - 1) * stride + inner extent.
+    auto f_matrix_space_size =
+        [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
+            using Layout = decltype(layout);
+
+            if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
+            {
+                return (nRow - 1) * stride + nCol;
+            }
+            else
+            {
+                return (nCol - 1) * stride + nRow;
+            }
+        };
+
+    SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
+    SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
+    SimpleDeviceMem d0_m_n_device_buf(sizeof(D0DataType) *
+                                      f_matrix_space_size(M, N, StrideD0, D0Layout{}));
+    SimpleDeviceMem d1_m_n_device_buf(sizeof(D1DataType) *
+                                      f_matrix_space_size(M, N, StrideD1, D1Layout{}));
+    SimpleDeviceMem e_device_buf(sizeof(EDataType) * f_matrix_space_size(M, N, StrideE, ELayout{}));
+
+    using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<
+        ALayout,
+        BLayout,
+        ck::Tuple<D0Layout, D1Layout>,
+        ELayout,
+        ADataType,
+        BDataType,
+        ck::Tuple<D0DataType, D1DataType>,
+        EDataType,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::AddAddFastGelu>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    const auto a_element_op   = AElementOp{};
+    const auto b_element_op   = BElementOp{};
+    const auto cde_element_op = CDEElementOp{};
+
+    // get generic instance (expected first); check first, operator[] on an empty vector is UB
+    if(op_ptrs.empty()) throw std::runtime_error("No device operation instance found");
+    auto& op_ptr = op_ptrs[0];
+    std::cout << "Run the generic instance without timing: " << op_ptr->GetTypeString()
+              << std::endl;
+
+    // run the generic instance
+    auto argument_ptr =
+        op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+                                    b_device_buf.GetDeviceBuffer(),
+                                    std::array<const void*, 2>{d0_m_n_device_buf.GetDeviceBuffer(),
+                                                               d1_m_n_device_buf.GetDeviceBuffer()},
+                                    e_device_buf.GetDeviceBuffer(),
+                                    M,
+                                    N,
+                                    K,
+                                    StrideA,
+                                    StrideB,
+                                    std::array<ck::index_t, 2>{StrideD0, StrideD1},
+                                    StrideE,
+                                    a_element_op,
+                                    b_element_op,
+                                    cde_element_op);
+
+    auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+    if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+    {
+        invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+    }
+    else
+    {
+        throw std::runtime_error(
+            "Generic instance should be suitable for various input lengths/strides");
+    }
+
+    std::cout << "Done" << std::endl;
+
+    return 0;
+}
diff --git a/client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu_generic.cpp b/client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu_generic.cpp
new file mode 100644
index 000000000..644b428fc
--- /dev/null
+++ b/client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu_generic.cpp
@@ -0,0 +1,169 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iomanip>
+#include <vector>
+#include <iostream>
+#include <stdexcept>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/gemm_add_fastgelu.hpp"
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = AddFastGelu;
+
+using ADataType  = F16;
+using BDataType  = F16;
+using D0DataType = F16;
+using EDataType  = F16;
+
+using ALayout  = Row;
+using BLayout  = Col;
+using D0Layout = Row;
+using ELayout  = Row;
+
+struct SimpleDeviceMem // owns one hipMalloc'ed buffer and releases it with hipFree on destruction
+{
+    SimpleDeviceMem() = delete;
+    SimpleDeviceMem(const SimpleDeviceMem&) = delete; // a copy would hipFree the same pointer twice
+    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;
+    // Allocates mem_size bytes of device memory; throws instead of silently keeping a null pointer.
+    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        if(hipMalloc(static_cast<void**>(&p_mem_), mem_size) != hipSuccess)
+            throw std::runtime_error("hipMalloc failed");
+    }
+    void* GetDeviceBuffer() { return p_mem_; }
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+    void* p_mem_; // pointer filled in by hipMalloc; value-initialized to nullptr beforehand
+};
+
+int main(int argc, char* argv[])
+{
+    // Default GEMM shape; replaced below when all 7 positional arguments are supplied.
+    ck::index_t M = 3840;
+    ck::index_t N = 4096;
+    ck::index_t K = 4096;
+    // Strides; with StrideD0 = 0 the size formula below allocates only N elements for D0.
+    ck::index_t StrideA  = 4096;
+    ck::index_t StrideB  = 4096;
+    ck::index_t StrideD0 = 0;
+    ck::index_t StrideE  = 4096;
+
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 8)
+    {
+        M = std::stoi(argv[1]);
+        N = std::stoi(argv[2]);
+        K = std::stoi(argv[3]);
+
+        StrideA  = std::stoi(argv[4]);
+        StrideB  = std::stoi(argv[5]);
+        StrideD0 = std::stoi(argv[6]);
+        StrideE  = std::stoi(argv[7]);
+    }
+    else
+    {
+        printf("arg1 to 7: M, N, K, StrideA, StrideB, StrideD0, StrideE\n");
+        return 1; // a wrong argument count is a usage error, so report failure
+    }
+    // Elements spanned by a strided matrix: (outer extent - 1) * stride + inner extent.
+    auto f_matrix_space_size =
+        [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
+            using Layout = decltype(layout);
+
+            if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
+            {
+                return (nRow - 1) * stride + nCol;
+            }
+            else
+            {
+                return (nCol - 1) * stride + nRow;
+            }
+        };
+
+    SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
+    SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
+    SimpleDeviceMem d0_m_n_device_buf(sizeof(D0DataType) *
+                                      f_matrix_space_size(M, N, StrideD0, D0Layout{}));
+    SimpleDeviceMem e_device_buf(sizeof(EDataType) * f_matrix_space_size(M, N, StrideE, ELayout{}));
+
+    using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<
+        ALayout,
+        BLayout,
+        ck::Tuple<D0Layout>,
+        ELayout,
+        ADataType,
+        BDataType,
+        ck::Tuple<D0DataType>,
+        EDataType,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::AddFastGelu>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    const auto a_element_op   = AElementOp{};
+    const auto b_element_op   = BElementOp{};
+    const auto cde_element_op = CDEElementOp{};
+
+    // get generic instance (expected first); check first, operator[] on an empty vector is UB
+    if(op_ptrs.empty()) throw std::runtime_error("No device operation instance found");
+    auto& op_ptr = op_ptrs[0];
+    std::cout << "Run the generic instance without timing: " << op_ptr->GetTypeString()
+              << std::endl;
+
+    // run the generic instance
+    auto argument_ptr =
+        op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+                                    b_device_buf.GetDeviceBuffer(),
+                                    std::array<const void*, 1>{d0_m_n_device_buf.GetDeviceBuffer()},
+                                    e_device_buf.GetDeviceBuffer(),
+                                    M,
+                                    N,
+                                    K,
+                                    StrideA,
+                                    StrideB,
+                                    std::array<ck::index_t, 1>{StrideD0},
+                                    StrideE,
+                                    a_element_op,
+                                    b_element_op,
+                                    cde_element_op);
+
+    auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+    if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+    {
+        invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+    }
+    else
+    {
+        throw std::runtime_error(
+            "Generic instance should be suitable for various input lengths/strides");
+    }
+
+    std::cout << "Done" << std::endl;
+
+    return 0;
+}
diff --git a/client_example/02_gemm_add_add_fastgelu/gemm_fastgelu_generic.cpp b/client_example/02_gemm_add_add_fastgelu/gemm_fastgelu_generic.cpp
new file mode 100644
index 000000000..482e93b42
--- /dev/null
+++ b/client_example/02_gemm_add_add_fastgelu/gemm_fastgelu_generic.cpp
@@ -0,0 +1,162 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iomanip>
+#include <vector>
+#include <iostream>
+#include <stdexcept>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/gemm_fastgelu.hpp"
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using FastGelu    = ck::tensor_operation::element_wise::FastGelu;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = FastGelu;
+
+using ADataType = F16;
+using BDataType = F16;
+using EDataType = F16;
+
+using ALayout = Row;
+using BLayout = Col;
+using ELayout = Row;
+
+struct SimpleDeviceMem // owns one hipMalloc'ed buffer and releases it with hipFree on destruction
+{
+    SimpleDeviceMem() = delete;
+    SimpleDeviceMem(const SimpleDeviceMem&) = delete; // a copy would hipFree the same pointer twice
+    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;
+    // Allocates mem_size bytes of device memory; throws instead of silently keeping a null pointer.
+    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        if(hipMalloc(static_cast<void**>(&p_mem_), mem_size) != hipSuccess)
+            throw std::runtime_error("hipMalloc failed");
+    }
+    void* GetDeviceBuffer() { return p_mem_; }
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+    void* p_mem_; // pointer filled in by hipMalloc; value-initialized to nullptr beforehand
+};
+
+int main(int argc, char* argv[])
+{
+    // Default GEMM shape; replaced below when all 6 positional arguments are supplied.
+    ck::index_t M = 3840;
+    ck::index_t N = 4096;
+    ck::index_t K = 4096;
+    // Strides of A, B and E, consumed by the size formula and the argument builder below.
+    ck::index_t StrideA = 4096;
+    ck::index_t StrideB = 4096;
+    ck::index_t StrideE = 4096;
+
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 7)
+    {
+        M = std::stoi(argv[1]);
+        N = std::stoi(argv[2]);
+        K = std::stoi(argv[3]);
+
+        StrideA = std::stoi(argv[4]);
+        StrideB = std::stoi(argv[5]);
+        StrideE = std::stoi(argv[6]);
+    }
+    else
+    {
+        printf("arg1 to 6: M, N, K, StrideA, StrideB, StrideE\n");
+        return 1; // a wrong argument count is a usage error, so report failure
+    }
+    // Elements spanned by a strided matrix: (outer extent - 1) * stride + inner extent.
+    auto f_matrix_space_size =
+        [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
+            using Layout = decltype(layout);
+
+            if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
+            {
+                return (nRow - 1) * stride + nCol;
+            }
+            else
+            {
+                return (nCol - 1) * stride + nRow;
+            }
+        };
+
+    SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
+    SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
+    SimpleDeviceMem e_device_buf(sizeof(EDataType) * f_matrix_space_size(M, N, StrideE, ELayout{}));
+
+    using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<
+        ALayout,
+        BLayout,
+        ck::Tuple<>,
+        ELayout,
+        ADataType,
+        BDataType,
+        ck::Tuple<>,
+        EDataType,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::FastGelu>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    const auto a_element_op   = AElementOp{};
+    const auto b_element_op   = BElementOp{};
+    const auto cde_element_op = CDEElementOp{};
+
+    // get generic instance (expected first); check first, operator[] on an empty vector is UB
+    if(op_ptrs.empty()) throw std::runtime_error("No device operation instance found");
+    auto& op_ptr = op_ptrs[0];
+    std::cout << "Run the generic instance without timing: " << op_ptr->GetTypeString()
+              << std::endl;
+
+    // run the generic instance
+    auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+                                                    b_device_buf.GetDeviceBuffer(),
+                                                    {},
+                                                    e_device_buf.GetDeviceBuffer(),
+                                                    M,
+                                                    N,
+                                                    K,
+                                                    StrideA,
+                                                    StrideB,
+                                                    {},
+                                                    StrideE,
+                                                    a_element_op,
+                                                    b_element_op,
+                                                    cde_element_op);
+
+    auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+    if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+    {
+        invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+    }
+    else
+    {
+        throw std::runtime_error(
+            "Generic instance should be suitable for various input lengths/strides");
+    }
+
+    std::cout << "Done" << std::endl;
+
+    return 0;
+}
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp
index 7b65f8737..125fbc21a 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp
@@ -36,6 +36,17 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // e = elementwise((a * b), d0, d1)
 // outout: e[m, n]
 // input: a[k, m], b[k, n], d0[m, n], d1[m, n]
+using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_generic_instance =
+    std::tuple<
+        // clang-format off
+        //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>
+        // clang-format on
+        >;
 using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instances =
     std::tuple<
         // clang-format off
@@ -139,6 +150,9 @@ void add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn
                                                     PassThrough,
                                                     AddAddFastGelu>>>& instances)
 {
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_generic_instance{});
     add_device_operation_instances(
         instances,
         device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instances{});
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp
index a9fef5c60..cc33692d7 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp
@@ -36,6 +36,17 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // e = elementwise((a * b), d0, d1)
 // outout: e[m, n]
 // input: a[k, m], b[n, k], d0[m, n], d1[m, n]
+using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_generic_instance =
+    std::tuple<
+        // clang-format off
+        //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave. Sole entry of this tuple; it is passed to add_device_operation_instances() before the *_km_nk_mn_mn_mn_instances tuple. NOTE(review): Src/CBlockTransfer ScalarPerVector here are 4 (A), 8 (B), 8 (C), whereas the gemm_add_fastgelu generic instances added in this same patch use 1 - confirm intended.
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>
+        // clang-format on
+        >;
 using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instances =
     std::tuple<
         // clang-format off
@@ -139,6 +150,9 @@ void add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn
                                                     PassThrough,
                                                     AddAddFastGelu>>>& instances)
 {
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_generic_instance{});
     add_device_operation_instances(
         instances,
         device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instances{});
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp
index c3b1dfcca..704787a08 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp
@@ -36,6 +36,17 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // e = elementwise((a * b), d0, d1)
 // outout: e[m, n]
 // input: a[m, k], b[k, n], d0[m, n], d1[m, n]
+using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_generic_instance =
+    std::tuple<
+        // clang-format off
+        //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave. Sole entry of this tuple; it is passed to add_device_operation_instances() before the *_mk_kn_mn_mn_mn_instances tuple. NOTE(review): Src/CBlockTransfer ScalarPerVector here are 8 (A), 4 (B), 8 (C), whereas the gemm_add_fastgelu generic instances added in this same patch use 1 - confirm intended.
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   2,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,               S<1, 16, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>
+        // clang-format on
+        >;
 using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instances =
     std::tuple<
         // clang-format off
@@ -139,6 +150,9 @@ void add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn
                                                     PassThrough,
                                                     AddAddFastGelu>>>& instances)
 {
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_generic_instance{});
     add_device_operation_instances(
         instances,
         device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instances{});
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp
index 8338b34a4..d64c9ec5e 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp
@@ -36,6 +36,17 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // e = elementwise((a * b), d0, d1)
 // outout: e[m, n]
 // input: a[m, k], b[n, k], d0[m, n], d1[m ,n]
+using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_generic_instance =
+    std::tuple<
+        // clang-format off
+        //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave. Sole entry of this tuple; it is passed to add_device_operation_instances() before the *_mk_nk_mn_mn_mn_instances tuple. NOTE(review): Src/CBlockTransfer ScalarPerVector here are 8 (A), 8 (B), 8 (C), whereas the gemm_add_fastgelu generic instances added in this same patch use 1 - confirm intended.
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>
+        // clang-format on
+        >;
 using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instances =
     std::tuple<
         // clang-format off
@@ -130,6 +141,9 @@ void add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn
                                                     PassThrough,
                                                     AddAddFastGelu>>>& instances)
 {
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_generic_instance{});
     add_device_operation_instances(
         instances,
         device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instances{});
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp
index 357dc91aa..e68bd8e7e 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp
@@ -21,6 +21,17 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // e = elementwise((a * b), d0)
 // outout: e[m, n]
 // input: a[k, m], b[k, n], d0[m, n]
+using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_generic_instance =
+    std::tuple<
+        // clang-format off
+        //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave. Sole entry of this tuple; it is passed to add_device_operation_instances() before the *_km_kn_mn_mn_instances tuple. A/B SrcScalarPerVector and CBlockTransfer ScalarPerVector are all 1 here.
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
+        // clang-format on
+        >;
 using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instances = std::tuple<
     // clang-format off
         //##############################|      A|      B|        Ds|      E| AData| BData| AccData| CShuffle|    DsData| EData|           A|           B|         CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
@@ -123,6 +134,9 @@ void add_device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_inst
                                                     PassThrough,
                                                     AddFastGelu>>>& instances)
 {
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_generic_instance{});
     add_device_operation_instances(
         instances, device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instances{});
     add_device_operation_instances(
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp
index 65b94d087..5aaa2e8fe 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp
@@ -21,6 +21,17 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // e = elementwise((a * b), d0, d1)
 // outout: e[m, n]
 // input: a[k, m], b[n, k], d0[m, n], d1[m, n]
+using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_generic_instance =
+    std::tuple<
+        // clang-format off
+        //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave. Sole entry of this tuple; it is passed to add_device_operation_instances() before the *_km_nk_mn_mn_instances tuple. A/B SrcScalarPerVector and CBlockTransfer ScalarPerVector are all 1 here.
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
+        // clang-format on
+        >;
 using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instances = std::tuple<
     // clang-format off
         //##############################|      A|      B|        Ds|      E| AData| BData| AccData| CShuffle|    DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
@@ -123,6 +134,9 @@ void add_device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_inst
                                                     PassThrough,
                                                     AddFastGelu>>>& instances)
 {
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_generic_instance{});
     add_device_operation_instances(
         instances, device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instances{});
     add_device_operation_instances(
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
index 00cd07bab..7a2a3dbaf 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
@@ -21,6 +21,17 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // e = elementwise((a * b), d0, d1)
 // outout: e[m, n]
 // input: a[m, k], b[k, n], d0[m, n], d1[m, n]
+using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_generic_instance =
+    std::tuple<
+        // clang-format off
+        //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave. Sole entry of this tuple; it is passed to add_device_operation_instances() before the *_mk_kn_mn_mn_instances tuple. A/B SrcScalarPerVector and CBlockTransfer ScalarPerVector are all 1 here.
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
+        // clang-format on
+        >;
 using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances = std::tuple<
     // clang-format off
         //##############################|      A|      B|        Ds|      E| AData| BData| AccData| CShuffle|    DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
@@ -123,6 +134,9 @@ void add_device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_inst
                                                     PassThrough,
                                                     AddFastGelu>>>& instances)
 {
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_generic_instance{});
     add_device_operation_instances(
         instances, device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances{});
     add_device_operation_instances(
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
index 9955a206b..fa3360997 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
@@ -21,6 +21,17 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // e = elementwise((a * b), d0, d1)
 // outout: e[m, n]
 // input: a[m, k], b[n, k], d0[m, n], d1[m ,n]
+using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_generic_instance =
+    std::tuple<
+        // clang-format off
+        //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // generic instance (pipeline v1, 1 wave): BlockSize 64, 16x16x32 block tile, GemmMNKPadding, SrcScalarPerVector 1 for A and B, CBlockTransfer ScalarPerVector 1; presumably the fallback for shapes the tuned instances reject (TODO confirm)
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
+        // clang-format on
+        >;
 using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instances = std::tuple<
     // clang-format off
         //##############################|      A|      B|        Ds|      E| AData| BData| AccData| CShuffle|    DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
@@ -114,6 +125,9 @@ void add_device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_inst
                                                     PassThrough,
                                                     AddFastGelu>>>& instances)
 {
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_generic_instance{});
     add_device_operation_instances(
         instances, device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instances{});
     add_device_operation_instances(
diff --git a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp
index c5aa59f91..803c44c7f 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp
@@ -21,6 +21,16 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // e = elementwise((a * b))
 // outout: e[m, n]
 // input: a[k, m], b[k, n]
+using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_generic_instance = std::tuple<
+    // clang-format off
+        //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // generic instance (pipeline v1, 1 wave): BlockSize 64, 16x16x32 block tile, GemmMNKPadding, SrcScalarPerVector 1 for A and B, CBlockTransfer ScalarPerVector 1; presumably the fallback for shapes the tuned instances reject (TODO confirm)
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
+    // clang-format on
+    >;
 using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances = std::tuple<
     // clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
@@ -122,6 +132,8 @@ void add_device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances(
                                                     PassThrough,
                                                     FastGelu>>>& instances)
 {
+    add_device_operation_instances(
+        instances, device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_generic_instance{});
     add_device_operation_instances(
         instances, device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances{});
     add_device_operation_instances(
diff --git a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp
index e71b269b8..9b9ef3db2 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp
@@ -21,6 +21,16 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // e = elementwise((a * b))
 // outout: e[m, n]
 // input: a[k, m], b[k, n]
+using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_generic_instance = std::tuple<
+    // clang-format off
+        //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // generic instance (pipeline v1, 1 wave): BlockSize 64, 16x16x32 block tile, GemmMNKPadding, SrcScalarPerVector 1 for A and B, CBlockTransfer ScalarPerVector 1; presumably the fallback for shapes the tuned instances reject (TODO confirm)
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
+    // clang-format on
+    >;
 using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances = std::tuple<
     // clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
@@ -122,6 +132,8 @@ void add_device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances(
                                                     PassThrough,
                                                     FastGelu>>>& instances)
 {
+    add_device_operation_instances(
+        instances, device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_generic_instance{});
     add_device_operation_instances(
         instances, device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances{});
     add_device_operation_instances(
diff --git a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp
index fdf63f811..1a0b6c9d1 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp
@@ -21,6 +21,16 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // e = elementwise((a * b))
 // outout: e[m, n]
 // input: a[m, k], b[k, n]
+using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_generic_instance = std::tuple<
+    // clang-format off
+        //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // generic instance (pipeline v1, 1 wave): BlockSize 64, 16x16x32 block tile, GemmMNKPadding, SrcScalarPerVector 1 for A and B, CBlockTransfer ScalarPerVector 1; presumably the fallback for shapes the tuned instances reject (TODO confirm)
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
+    // clang-format on
+    >;
 using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances = std::tuple<
     // clang-format off
         //##############################|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|      DsData| EData|           A|           B|         CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
@@ -122,6 +132,8 @@ void add_device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances(
                                                     PassThrough,
                                                     FastGelu>>>& instances)
 {
+    add_device_operation_instances(
+        instances, device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_generic_instance{});
     add_device_operation_instances(
         instances, device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances{});
     add_device_operation_instances(
diff --git a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp
index 4c98a9d5e..18b1c0e99 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp
@@ -21,6 +21,16 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // e = elementwise((a * b))
 // outout: e[m, n]
 // input: a[m, k], b[n, k]
+using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_generic_instance = std::tuple<
+    // clang-format off
+        //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // generic instance (pipeline v1, 1 wave): BlockSize 64, 16x16x32 block tile, GemmMNKPadding, SrcScalarPerVector 1 for A and B, CBlockTransfer ScalarPerVector 1; presumably the fallback for shapes the tuned instances reject (TODO confirm)
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
+    // clang-format on
+    >;
 using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances = std::tuple<
     // clang-format off
         //##############################|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|      DsData| EData|           A|           B|         CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
@@ -113,6 +123,8 @@ void add_device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances(
                                                     PassThrough,
                                                     FastGelu>>>& instances)
 {
+    add_device_operation_instances(
+        instances, device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_generic_instance{});
     add_device_operation_instances(
         instances, device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances{});
     add_device_operation_instances(
-- 
GitLab


From d1838d328c2bf7119af5dddc522ccf37dba33c3f Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Wed, 14 Jun 2023 16:44:13 -0700
Subject: [PATCH 59/71] Fix the daily CI job with latest staging compiler.
 (#753)

* fix CI builds with latest staging compiler

* remove mount flags from dockerfile
---
 Dockerfile | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 710db05c0..bc4776806 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -12,20 +12,20 @@ RUN useradd -rm -d /home/jenkins -s /bin/bash -u 1004 jenkins
 RUN chmod 1777 /tmp
 RUN apt-get update
 RUN apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl
-RUN --mount=type=ssh if [ "$ROCMVERSION" != "5.6" ]; then \
+RUN if [ "$ROCMVERSION" != "5.6" ]; then \
         wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
         sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list"; \
     elif [ "$ROCMVERSION" = "5.6" ] && [ "$compiler_version" = "" ]; then \
          sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amd-nonfree-radeon_20.04-1_all.deb" && \
          apt update && apt-get install -y ./amd-nonfree-radeon_20.04-1_all.deb && \
-         amdgpu-repo --amdgpu-build=1567752 --rocm-build=compute-rocm-dkms-no-npi-hipclang/11914; \
-    elif [ "$ROCMVERSION" = "5.6" ] && [ "$compiler_version" = "rc3" ]; then \
+         amdgpu-repo --amdgpu-build=1567752 --rocm-build=compute-rocm-dkms-no-npi-hipclang/11914 && \
+         amdgpu-install -y --usecase=rocm --no-dkms; \
+    elif [ "$ROCMVERSION" = "5.6" ] && [ "$compiler_version" = "rc3" ] || [ "$compiler_version" = "amd-stg-open" ]; then \
          sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_5.6-20.04-1_all.deb" && \
          apt update && apt-get install -y ./amdgpu-install-internal_5.6-20.04-1_all.deb && \
          sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 5.6 rel-45  > /etc/apt/sources.list.d/rocm-build.list' && \
-         amdgpu-repo --amdgpu-build=1602498; \
+         amdgpu-repo --amdgpu-build=1602498 && amdgpu-install -y --usecase=rocm --no-dkms; \
     fi
-RUN amdgpu-install -y --usecase=rocm --no-dkms
 
 RUN wget --no-check-certificate -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add -
 RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
@@ -105,7 +105,7 @@ ENV compiler_commit=$compiler_commit
 RUN sh -c "echo compiler version = '$compiler_version'"
 RUN sh -c "echo compiler commit = '$compiler_commit'"
 
-RUN --mount=type=ssh if [ "$compiler_version" = "amd-stg-open" ] && [ "$compiler_commit" = "" ]; then \
+RUN if [ "$compiler_version" = "amd-stg-open" ] && [ "$compiler_commit" = "" ]; then \
         git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \
         cd llvm-project && mkdir build && cd build && \
         cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" ../llvm && \
@@ -113,7 +113,7 @@ RUN --mount=type=ssh if [ "$compiler_version" = "amd-stg-open" ] && [ "$compiler
     else echo "using the release compiler"; \
     fi
 
-RUN --mount=type=ssh if [ "$compiler_version" = "amd-stg-open" ] && [ "$compiler_commit" != "" ]; then \
+RUN if [ "$compiler_version" = "amd-stg-open" ] && [ "$compiler_commit" != "" ]; then \
         git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \
         cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \
         cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" ../llvm && \
-- 
GitLab


From c5f6ec842c83a386b78dc0098fcdf081586df309 Mon Sep 17 00:00:00 2001
From: Qianfeng <qianfeng.zhang@amd.com>
Date: Thu, 15 Jun 2023 23:13:59 +0800
Subject: [PATCH 60/71] Using number of compute units to set  gridSize (#754)

* Add getAvailableComputeUnitCount() interface

* Use available number of compute units to set kernel grid size
---
 include/ck/host_utility/stream_utility.hpp    | 43 ++++++++++++
 .../impl/device_elementwise_2d_impl.hpp       | 69 +++++++++----------
 .../device/impl/device_elementwise_impl.hpp   | 43 ++++++------
 3 files changed, 97 insertions(+), 58 deletions(-)
 create mode 100644 include/ck/host_utility/stream_utility.hpp

diff --git a/include/ck/host_utility/stream_utility.hpp b/include/ck/host_utility/stream_utility.hpp
new file mode 100644
index 000000000..ef05f2e26
--- /dev/null
+++ b/include/ck/host_utility/stream_utility.hpp
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <hip/hip_runtime.h>
+
+#include "ck/stream_config.hpp"
+#include "ck/host_utility/hip_check_error.hpp"
+
+/// Counts the compute units enabled in the CU mask of the stream in `stream_config`.
+///
+/// The mask is read with hipExtStreamGetCUMask() into MAX_MASK_DWORDS 32-bit words and the
+/// set bits are summed; the result is meant for sizing a kernel grid from the number of
+/// compute units instead of a hard-coded constant.
+///
+/// @param stream_config  Stream configuration; only `stream_id_` is read.
+/// @return Total number of bits set across all mask words.
+///
+/// Declared `inline` as well as `static` because it is defined in a header: a plain `static`
+/// function is reported as unused in every translation unit that includes but never calls it.
+static inline int getAvailableComputeUnitCount(const StreamConfig& stream_config)
+{
+    // assume at most 64*32 = 2048 CUs
+    constexpr int MAX_MASK_DWORDS = 64;
+
+    // Zero-initialised so that words the runtime does not write contribute no bits.
+    uint32_t cuMask[MAX_MASK_DWORDS] = {};
+
+    // hip_check_error receives the HIP status (presumably raising on failure -- confirm).
+    hip_check_error(hipExtStreamGetCUMask(stream_config.stream_id_, MAX_MASK_DWORDS, &cuMask[0]));
+
+    int ret = 0;
+
+    for(int i = 0; i < MAX_MASK_DWORDS; i++)
+    {
+        // `dword &= dword - 1` clears the lowest set bit, so this runs once per set bit.
+        for(uint32_t dword = cuMask[i]; dword != 0; dword &= dword - 1)
+            ret++;
+    }
+
+    return ret;
+}
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_2d_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_elementwise_2d_impl.hpp
index c5f90e40f..02ef29e32 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_2d_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_elementwise_2d_impl.hpp
@@ -13,6 +13,7 @@
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 
 #include "ck/host_utility/kernel_launch.hpp"
+#include "ck/host_utility/stream_utility.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -171,10 +172,7 @@ struct DeviceElementwise2dImpl : public DeviceElementwise<InDataTypeTuple,
               inStridesArray_(inStridesArray),
               outStridesArray_(outStridesArray),
               elementwise_op_(elementwise_op),
-              blockSize_(256),
-              gridSize_(120), // FIXME - Calculate the grid size by number of CU in the future
-              num_threads_m_((gridSize_ * blockSize_) / 16),
-              num_threads_n_(16)
+              blockSize_(256)
         {
             static_assert(NumDim_m > 0, "");
             static_assert(NumDim_n > 0, "");
@@ -192,34 +190,10 @@ struct DeviceElementwise2dImpl : public DeviceElementwise<InDataTypeTuple,
                     return static_cast<DataType*>(out_dev_buffers[I.value]);
                 },
                 Number<NumOutput>{});
-
-            in_grid_2d_desc_tuple_ = generate_tuple(
-                [&](auto I) {
-                    return MakeDescriptor_MN(lengths,
-                                             inStridesArray[I.value],
-                                             gridSize_,
-                                             blockSize_,
-                                             num_threads_m_,
-                                             num_threads_n_);
-                },
-                Number<NumInput>{});
-
-            out_grid_2d_desc_tuple_ = generate_tuple(
-                [&](auto I) {
-                    return MakeDescriptor_MN(lengths,
-                                             outStridesArray[I.value],
-                                             gridSize_,
-                                             blockSize_,
-                                             num_threads_m_,
-                                             num_threads_n_);
-                },
-                Number<NumOutput>{});
         }
 
         InDataTypePointerTuple in_dev_buffers_;
         OutDataTypePointerTuple out_dev_buffers_;
-        InGrid2dDescTuple in_grid_2d_desc_tuple_;
-        OutGrid2dDescTuple out_grid_2d_desc_tuple_;
 
         std::array<index_t, NumDim> lengths_;
         std::array<std::array<index_t, NumDim>, NumInput> inStridesArray_;
@@ -227,15 +201,38 @@ struct DeviceElementwise2dImpl : public DeviceElementwise<InDataTypeTuple,
 
         ElementwiseOperation elementwise_op_;
         index_t blockSize_;
-        index_t gridSize_;
-        index_t num_threads_m_;
-        index_t num_threads_n_;
     };
 
     struct Invoker : public BaseInvoker
     {
         float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
+            index_t gridSize      = getAvailableComputeUnitCount(stream_config);
+            index_t num_threads_m = (gridSize * arg.blockSize_) / 16;
+            index_t num_threads_n = 16;
+
+            auto in_grid_2d_desc_tuple = generate_tuple(
+                [&](auto I) {
+                    return MakeDescriptor_MN(arg.lengths_,
+                                             arg.inStridesArray_[I.value],
+                                             gridSize,
+                                             arg.blockSize_,
+                                             num_threads_m,
+                                             num_threads_n);
+                },
+                Number<NumInput>{});
+
+            auto out_grid_2d_desc_tuple = generate_tuple(
+                [&](auto I) {
+                    return MakeDescriptor_MN(arg.lengths_,
+                                             arg.outStridesArray_[I.value],
+                                             gridSize,
+                                             arg.blockSize_,
+                                             num_threads_m,
+                                             num_threads_n);
+                },
+                Number<NumOutput>{});
+
             const auto kernel = kernel_elementwise_2d<GridwiseElementwise,
                                                       InGrid2dDescTuple,
                                                       OutGrid2dDescTuple,
@@ -245,16 +242,16 @@ struct DeviceElementwise2dImpl : public DeviceElementwise<InDataTypeTuple,
 
             float elapsed_time = launch_and_time_kernel(stream_config,
                                                         kernel,
-                                                        dim3(arg.gridSize_),
+                                                        dim3(gridSize),
                                                         dim3(arg.blockSize_),
                                                         0,
-                                                        arg.in_grid_2d_desc_tuple_,
-                                                        arg.out_grid_2d_desc_tuple_,
+                                                        in_grid_2d_desc_tuple,
+                                                        out_grid_2d_desc_tuple,
                                                         arg.in_dev_buffers_,
                                                         arg.out_dev_buffers_,
                                                         arg.elementwise_op_,
-                                                        arg.num_threads_m_,
-                                                        arg.num_threads_n_);
+                                                        num_threads_m,
+                                                        num_threads_n);
             return elapsed_time;
         }
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp
index 5618fba51..e203468c6 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp
@@ -13,6 +13,7 @@
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 
 #include "ck/host_utility/kernel_launch.hpp"
+#include "ck/host_utility/stream_utility.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -144,8 +145,7 @@ struct DeviceElementwiseImpl
               inStridesArray_(inStridesArray),
               outStridesArray_(outStridesArray),
               elementwise_op_(elementwise_op),
-              blockSize_(256),
-              gridSize_(120) // FIXME - Calculate the grid size by number of CU in the future
+              blockSize_(256)
         {
             in_dev_buffers_ = generate_tuple(
                 [&](auto I) {
@@ -160,26 +160,10 @@ struct DeviceElementwiseImpl
                     return static_cast<DataType*>(out_dev_buffers[I.value]);
                 },
                 Number<NumOutput>{});
-
-            in_grid_1d_desc_tuple_ = generate_tuple(
-                [&](auto I) {
-                    return MakeDescriptor_M(
-                        lengths, inStridesArray[I.value], gridSize_, blockSize_);
-                },
-                Number<NumInput>{});
-
-            out_grid_1d_desc_tuple_ = generate_tuple(
-                [&](auto I) {
-                    return MakeDescriptor_M(
-                        lengths, outStridesArray[I.value], gridSize_, blockSize_);
-                },
-                Number<NumOutput>{});
         }
 
         InDataTypePointerTuple in_dev_buffers_;
         OutDataTypePointerTuple out_dev_buffers_;
-        InGrid1dDescTuple in_grid_1d_desc_tuple_;
-        OutGrid1dDescTuple out_grid_1d_desc_tuple_;
 
         std::array<index_t, NumDim> lengths_;
         std::array<std::array<index_t, NumDim>, NumInput> inStridesArray_;
@@ -187,13 +171,28 @@ struct DeviceElementwiseImpl
 
         ElementwiseOperation elementwise_op_;
         index_t blockSize_;
-        index_t gridSize_;
     };
 
     struct Invoker : public BaseInvoker
     {
         float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
+            index_t gridSize = getAvailableComputeUnitCount(stream_config);
+
+            auto in_grid_1d_desc_tuple = generate_tuple(
+                [&](auto I) {
+                    return MakeDescriptor_M(
+                        arg.lengths_, arg.inStridesArray_[I.value], gridSize, arg.blockSize_);
+                },
+                Number<NumInput>{});
+
+            auto out_grid_1d_desc_tuple = generate_tuple(
+                [&](auto I) {
+                    return MakeDescriptor_M(
+                        arg.lengths_, arg.outStridesArray_[I.value], gridSize, arg.blockSize_);
+                },
+                Number<NumOutput>{});
+
             const auto kernel = kernel_elementwise_1d<GridwiseElementwise,
                                                       InGrid1dDescTuple,
                                                       OutGrid1dDescTuple,
@@ -203,11 +202,11 @@ struct DeviceElementwiseImpl
 
             float elapsed_time = launch_and_time_kernel(stream_config,
                                                         kernel,
-                                                        dim3(arg.gridSize_),
+                                                        dim3(gridSize),
                                                         dim3(arg.blockSize_),
                                                         0,
-                                                        arg.in_grid_1d_desc_tuple_,
-                                                        arg.out_grid_1d_desc_tuple_,
+                                                        in_grid_1d_desc_tuple,
+                                                        out_grid_1d_desc_tuple,
                                                         arg.in_dev_buffers_,
                                                         arg.out_dev_buffers_,
                                                         arg.elementwise_op_);
-- 
GitLab


From 309b1c64618c714b7f47ceb038bba68b61fa4e4e Mon Sep 17 00:00:00 2001
From: zjing14 <zhangjing14@gmail.com>
Date: Thu, 15 Jun 2023 08:19:33 -0700
Subject: [PATCH 61/71] Fixed Weight layout of grouped_conv 3d fwd (#743)

* Changed wei layout

* changed layout for examples

* fixed client example

---------

Co-authored-by: root <root@ctr-ubbsmc15.amd.com>
---
 client_example/16_convnd_fwd/common.hpp       |  4 -
 .../16_convnd_fwd/conv3d_fwd_fp16.cpp         |  4 +-
 .../16_convnd_fwd/conv3d_fwd_fp32.cpp         |  4 +-
 .../gpu/grouped_convolution_forward.hpp       | 28 +++---
 .../gpu/grouped_conv3d_fwd/CMakeLists.txt     |  8 +-
 ...dl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp} | 88 +++++++++----------
 ...xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp} | 88 +++++++++----------
 ...xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp} | 88 +++++++++----------
 ...dl_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp} | 88 +++++++++----------
 9 files changed, 198 insertions(+), 202 deletions(-)
 rename library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/{device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_bf16_instance.cpp => device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp} (91%)
 rename library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/{device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f16_instance.cpp => device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp} (91%)
 rename library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/{device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f32_instance.cpp => device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp} (91%)
 rename library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/{device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_int8_instance.cpp => device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp} (91%)

diff --git a/client_example/16_convnd_fwd/common.hpp b/client_example/16_convnd_fwd/common.hpp
index a6bb5aa65..449c9466e 100644
--- a/client_example/16_convnd_fwd/common.hpp
+++ b/client_example/16_convnd_fwd/common.hpp
@@ -141,14 +141,10 @@ bool run_grouped_conv_fwd(std::array<ck::index_t, NumDimSpatial + NumNonSpatialD
                 std::next(rbegin(in_strides)),
                 std::next(rbegin(in_strides), NumDimSpatial + 1));
 
-    std::rotate(
-        std::next(rbegin(wei_lengths)), std::next(rbegin(wei_lengths), 2), rend(wei_lengths));
     std::rotate(rbegin(wei_lengths),
                 std::next(rbegin(wei_lengths)),
                 std::next(rbegin(wei_lengths), NumDimSpatial + 1));
 
-    std::rotate(
-        std::next(rbegin(wei_strides)), std::next(rbegin(wei_strides), 2), rend(wei_strides));
     std::rotate(rbegin(wei_strides),
                 std::next(rbegin(wei_strides)),
                 std::next(rbegin(wei_strides), NumDimSpatial + 1));
diff --git a/client_example/16_convnd_fwd/conv3d_fwd_fp16.cpp b/client_example/16_convnd_fwd/conv3d_fwd_fp16.cpp
index 10f914bbe..d4455df62 100644
--- a/client_example/16_convnd_fwd/conv3d_fwd_fp16.cpp
+++ b/client_example/16_convnd_fwd/conv3d_fwd_fp16.cpp
@@ -11,7 +11,7 @@ using WeiDataType = ck::half_t;
 using OutDataType = ck::half_t;
 
 using InLayout  = ck::tensor_layout::convolution::NDHWGC;
-using WeiLayout = ck::tensor_layout::convolution::KZYXGC;
+using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
 using OutLayout = ck::tensor_layout::convolution::NDHWGK;
 
 static constexpr ck::index_t NumDimSpatial = 3;
@@ -38,7 +38,7 @@ int main()
                                 InLayout,
                                 WeiLayout,
                                 OutLayout>(
-               {N, Di, Hi, Wi, G, C}, {K, Z, Y, X, G, C}, {N, Do, Ho, Wo, G, K})
+               {N, Di, Hi, Wi, G, C}, {G, K, Z, Y, X, C}, {N, Do, Ho, Wo, G, K})
                ? EXIT_SUCCESS
                : EXIT_FAILURE;
 }
diff --git a/client_example/16_convnd_fwd/conv3d_fwd_fp32.cpp b/client_example/16_convnd_fwd/conv3d_fwd_fp32.cpp
index 43c98f1e9..7e8c98b60 100644
--- a/client_example/16_convnd_fwd/conv3d_fwd_fp32.cpp
+++ b/client_example/16_convnd_fwd/conv3d_fwd_fp32.cpp
@@ -11,7 +11,7 @@ using WeiDataType = float;
 using OutDataType = float;
 
 using InLayout  = ck::tensor_layout::convolution::NDHWGC;
-using WeiLayout = ck::tensor_layout::convolution::KZYXGC;
+using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
 using OutLayout = ck::tensor_layout::convolution::NDHWGK;
 
 static constexpr ck::index_t NumDimSpatial = 3;
@@ -38,7 +38,7 @@ int main()
                                 InLayout,
                                 WeiLayout,
                                 OutLayout>(
-               {N, Di, Hi, Wi, G, C}, {K, Z, Y, X, G, C}, {N, Do, Ho, Wo, G, K})
+               {N, Di, Hi, Wi, G, C}, {G, K, Z, Y, X, C}, {N, Do, Ho, Wo, G, K})
                ? EXIT_SUCCESS
                : EXIT_FAILURE;
 }
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
index a82ec543c..627a5ae2a 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
@@ -245,11 +245,11 @@ void add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instances(
                                                               PassThrough,
                                                               PassThrough>>>& instances);
 
-// grouped conv3d forward, NDHWGC/KZYXGC/NDHWGK
-void add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_bf16_instances(
+// grouped conv3d forward, NDHWGC/GKZYXC/NDHWGK
+void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
                                                               NDHWGC,
-                                                              KZYXGC,
+                                                              GKZYXC,
                                                               Empty_Tuple,
                                                               NDHWGK,
                                                               BF16,
@@ -260,10 +260,10 @@ void add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_bf16_instances(
                                                               PassThrough,
                                                               PassThrough>>>& instances);
 
-void add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f16_instances(
+void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
                                                               NDHWGC,
-                                                              KZYXGC,
+                                                              GKZYXC,
                                                               Empty_Tuple,
                                                               NDHWGK,
                                                               F16,
@@ -274,10 +274,10 @@ void add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f16_instances(
                                                               PassThrough,
                                                               PassThrough>>>& instances);
 
-void add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f32_instances(
+void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
                                                               NDHWGC,
-                                                              KZYXGC,
+                                                              GKZYXC,
                                                               Empty_Tuple,
                                                               NDHWGK,
                                                               F32,
@@ -288,10 +288,10 @@ void add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f32_instances(
                                                               PassThrough,
                                                               PassThrough>>>& instances);
 
-void add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_int8_instances(
+void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
                                                               NDHWGC,
-                                                              KZYXGC,
+                                                              GKZYXC,
                                                               Empty_Tuple,
                                                               NDHWGK,
                                                               int8_t,
@@ -433,28 +433,28 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
             }
         }
         else if constexpr(NumDimSpatial == 3 && is_same_v<InLayout, NDHWGC> &&
-                          is_same_v<WeiLayout, KZYXGC> && is_same_v<OutLayout, NDHWGK>)
+                          is_same_v<WeiLayout, GKZYXC> && is_same_v<OutLayout, NDHWGK>)
         {
             if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
                          is_same_v<OutDataType, float>)
             {
-                add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f32_instances(op_ptrs);
+                add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances(op_ptrs);
             }
             else if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
                               is_same_v<OutDataType, half_t>)
             {
-                add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f16_instances(op_ptrs);
+                add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances(op_ptrs);
             }
             else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
                               is_same_v<WeiDataType, ck::bhalf_t> &&
                               is_same_v<OutDataType, ck::bhalf_t>)
             {
-                add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_bf16_instances(op_ptrs);
+                add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances(op_ptrs);
             }
             else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
                               is_same_v<OutDataType, int8_t>)
             {
-                add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_int8_instances(op_ptrs);
+                add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_int8_instances(op_ptrs);
             }
         }
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt
index 90efc09ee..cd209dbf9 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt
@@ -4,8 +4,8 @@ add_instance_library(device_grouped_conv3d_fwd_instance
    device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
    device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instance.cpp
 
-   device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_bf16_instance.cpp
-   device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f16_instance.cpp
-   device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f32_instance.cpp
-   device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_int8_instance.cpp
+   device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
+   device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
+   device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
+   device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp
 )
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
similarity index 91%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_bf16_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
index 5e5dbc53c..d2cacf453 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
@@ -26,7 +26,7 @@ template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
 
 using NDHWGC = ck::tensor_layout::convolution::NDHWGC;
-using KZYXGC = ck::tensor_layout::convolution::KZYXGC;
+using GKZYXC = ck::tensor_layout::convolution::GKZYXC;
 using NDHWGK = ck::tensor_layout::convolution::NDHWGK;
 
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
@@ -43,7 +43,7 @@ static constexpr auto ConvFwd1x1S1P0 =
 static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
 
 // in[g, n, di, hi, wi, c] * wei[g, k, z, y, x, c] = out[g, n, do, ho, wo, k]
-using device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_bf16_instances =
+using device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances =
     std::tuple<
         // clang-format off
         // Default
@@ -51,64 +51,64 @@ using device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_bf16_instances =
         //########################################| Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
         //########################################|        |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
         //########################################|        |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
 
         // Filter1x1Pad0
         //########################################|  NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
         //########################################| Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
         //########################################|        |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
         //########################################|        |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
 
         // Filter1x1Stride1Pad0
         //########################################|  NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
         //########################################| Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
         //########################################|        |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
         //########################################|        |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>
         // clang-format on
         >;
 
-void add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_bf16_instances(
+void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
                                                               NDHWGC,
-                                                              KZYXGC,
+                                                              GKZYXC,
                                                               Empty_Tuple,
                                                               NDHWGK,
                                                               BF16,
@@ -120,7 +120,7 @@ void add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_bf16_instances(
                                                               PassThrough>>>& instances)
 {
     add_device_operation_instances(
-        instances, device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_bf16_instances{});
+        instances, device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
similarity index 91%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f16_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
index a88fe4af6..6354f0111 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
@@ -26,7 +26,7 @@ template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
 
 using NDHWGC = ck::tensor_layout::convolution::NDHWGC;
-using KZYXGC = ck::tensor_layout::convolution::KZYXGC;
+using GKZYXC = ck::tensor_layout::convolution::GKZYXC;
 using NDHWGK = ck::tensor_layout::convolution::NDHWGK;
 
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
@@ -43,7 +43,7 @@ static constexpr auto ConvFwd1x1S1P0 =
 static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
 
 // in[g, n, di, hi, wi, c] * wei[g, k, z, y, x, c] = out[g, n, do, ho, wo, k]
-using device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f16_instances =
+using device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances =
     std::tuple<
         // clang-format off
         // Default
@@ -51,64 +51,64 @@ using device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f16_instances =
         //########################################| Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
         //########################################|        |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
         //########################################|        |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
 
         // Filter1x1Pad0
         //########################################|  NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
         //########################################| Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
         //########################################|        |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
         //########################################|        |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
 
         // Filter1x1Stride1Pad0
         //########################################|  NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
         //########################################| Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
         //########################################|        |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
         //########################################|        |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>
         // clang-format on
         >;
 
-void add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f16_instances(
+void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
                                                               NDHWGC,
-                                                              KZYXGC,
+                                                              GKZYXC,
                                                               Empty_Tuple,
                                                               NDHWGK,
                                                               F16,
@@ -120,7 +120,7 @@ void add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f16_instances(
                                                               PassThrough>>>& instances)
 {
     add_device_operation_instances(
-        instances, device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f16_instances{});
+        instances, device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
similarity index 91%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f32_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
index f6e7e5b28..381ac5ef0 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
@@ -25,7 +25,7 @@ template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
 
 using NDHWGC = ck::tensor_layout::convolution::NDHWGC;
-using KZYXGC = ck::tensor_layout::convolution::KZYXGC;
+using GKZYXC = ck::tensor_layout::convolution::GKZYXC;
 using NDHWGK = ck::tensor_layout::convolution::NDHWGK;
 
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
@@ -42,7 +42,7 @@ static constexpr auto ConvFwd1x1S1P0 =
 static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
 
 // in[g, n, di, hi, wi, c] * wei[g, k, z, y, x, c] = out[g, n, do, ho, wo, k]
-using device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f32_instances =
+using device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances =
     std::tuple<
         // clang-format off
         // Default
@@ -50,64 +50,64 @@ using device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f32_instances =
         //########################################| Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
         //########################################|        |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
         //########################################|        |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,   256,    16,   4,   4,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,    64,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    64,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,    64,    16,   4,   4,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,    32,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    64,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    32,    64,    16,   4,   4,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,   256,    16,   4,   4,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,    64,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    64,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,    64,    16,   4,   4,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,    32,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    64,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    32,    64,    16,   4,   4,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
 
         // Filter1x1Pad0
         //########################################|  NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
         //########################################| Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
         //########################################|        |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
         //########################################|        |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,   256,    16,   4,   4,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,    64,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    64,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,    64,    16,   4,   4,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,    32,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    64,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    32,    64,    16,   4,   4,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,   256,    16,   4,   4,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,    64,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    64,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,    64,    16,   4,   4,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,    32,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    64,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    32,    64,    16,   4,   4,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
 
         // Filter1x1Stride1Pad0
         //########################################|  NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
         //########################################| Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
         //########################################|        |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
         //########################################|        |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,   256,    16,   4,   4,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,    64,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    64,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,    64,    16,   4,   4,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,    32,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    64,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    32,    64,    16,   4,   4,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,   256,    16,   4,   4,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,    64,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    64,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,    64,    16,   4,   4,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,    32,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    64,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    32,    64,    16,   4,   4,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>
         // clang-format on
         >;
 
-void add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f32_instances(
+void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
                                                               NDHWGC,
-                                                              KZYXGC,
+                                                              GKZYXC,
                                                               Empty_Tuple,
                                                               NDHWGK,
                                                               F32,
@@ -119,7 +119,7 @@ void add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f32_instances(
                                                               PassThrough>>>& instances)
 {
     add_device_operation_instances(
-        instances, device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f32_instances{});
+        instances, device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp
similarity index 91%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_int8_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp
index 3d303a3fa..6bd53d869 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp
@@ -23,7 +23,7 @@ template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
 
 using NDHWGC = ck::tensor_layout::convolution::NDHWGC;
-using KZYXGC = ck::tensor_layout::convolution::KZYXGC;
+using GKZYXC = ck::tensor_layout::convolution::GKZYXC;
 using NDHWGK = ck::tensor_layout::convolution::NDHWGK;
 
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
@@ -40,71 +40,71 @@ static constexpr auto ConvFwd1x1S1P0 =
 static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
 
 // in[g, n, di, hi, wi, c] * wei[g, k, z, y, x, c] = out[g, n, do, ho, wo, k]
-using device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_int8_instances = std::tuple<
+using device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_int8_instances = std::tuple<
     // clang-format off
         // Default
         //########################################|  NumDim|      A|      B|          Ds|      E|  AData|  BData| AccData| CShuffle|          Ds|  EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
         //########################################| Spatial| Layout| Layout|      Layout| Layout|   Type|   Type|    Type| DataType|    DataType|   Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
         //########################################|        |       |       |            |       |       |       |        |         |            |       |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
         //########################################|        |       |       |            |       |       |       |        |         |            |       |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
 
         // Filter1x1Pad0
         //########################################|  NumDim|      A|      B|          Ds|      E|  AData|  BData| AccData| CShuffle|          Ds|  EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
         //########################################| Spatial| Layout| Layout|      Layout| Layout|   Type|   Type|    Type| DataType|    DataType|   Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
         //########################################|        |       |       |            |       |       |       |        |         |            |       |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
         //########################################|        |       |       |            |       |       |       |        |         |            |       |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
                                                                                                                                                                                                                 
         // Filter1x1Stride1Pad0                                                                                                                                                                                 
         //########################################|  NumDim|      A|      B|          Ds|      E|  AData|  BData| AccData| CShuffle|          Ds|  EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
         //########################################| Spatial| Layout| Layout|      Layout| Layout|   Type|   Type|    Type| DataType|    DataType|   Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
         //########################################|        |       |       |            |       |       |       |        |         |            |       |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
         //########################################|        |       |       |            |       |       |       |        |         |            |       |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>
     // clang-format on
     >;
 
-void add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_int8_instances(
+void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
                                                               NDHWGC,
-                                                              KZYXGC,
+                                                              GKZYXC,
                                                               Empty_Tuple,
                                                               NDHWGK,
                                                               int8_t,
@@ -116,7 +116,7 @@ void add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_int8_instances(
                                                               PassThrough>>>& instances)
 {
     add_device_operation_instances(
-        instances, device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_int8_instances{});
+        instances, device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_int8_instances{});
 }
 
 } // namespace instance
-- 
GitLab


From 027e46ee82cdb8a02313985361333c78f281b887 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Thu, 15 Jun 2023 08:20:59 -0700
Subject: [PATCH 62/71] Enable gfx941 and gfx942 architectures. (#752)

* enable gfx941/942 targets

* fix clang format

* fix the cmake logic for multiple targets

* fix cmake syntax for looping over targets

* add gfx941/942 support for gemm_xdl instances
---
 Jenkinsfile                                   |  4 ++--
 example/02_gemm_bilinear/CMakeLists.txt       | 19 +++++++++++----
 example/03_gemm_bias_relu/CMakeLists.txt      |  9 +++++--
 .../04_gemm_add_add_fastgelu/CMakeLists.txt   |  9 +++++--
 example/09_convnd_fwd/CMakeLists.txt          | 11 ++++++---
 .../CMakeLists.txt                            |  9 +++++--
 example/14_gemm_quantization/CMakeLists.txt   |  9 +++++--
 .../CMakeLists.txt                            |  9 +++++--
 example/17_convnd_bwd_data/CMakeLists.txt     |  9 +++++--
 example/18_batched_gemm_reduce/CMakeLists.txt | 10 +++++---
 .../20_grouped_conv_bwd_weight/CMakeLists.txt |  9 +++++--
 example/21_gemm_layernorm/CMakeLists.txt      |  9 +++++--
 .../CMakeLists.txt                            | 24 ++++++++++++++-----
 example/31_batched_gemm_gemm/CMakeLists.txt   | 21 ++++++++++++----
 example/35_splitK_gemm/CMakeLists.txt         |  9 +++++--
 .../CMakeLists.txt                            |  9 +++++--
 .../40_conv2d_fwd_quantization/CMakeLists.txt |  9 +++++--
 .../41_grouped_conv_conv_fwd/CMakeLists.txt   | 21 ++++++++++++----
 .../CMakeLists.txt                            |  9 +++++--
 include/ck/ck.hpp                             | 18 ++++++++------
 .../device_gemm_xdl_waveletmodel_cshuffle.hpp |  5 ++--
 ..._conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp |  5 ++--
 ...gemm_softmax_gemm_permute_xdl_cshuffle.hpp |  5 ++--
 ...tk_contraction_multiple_d_xdl_cshuffle.hpp |  5 ++--
 ...ed_contraction_multiple_d_xdl_cshuffle.hpp |  5 ++--
 .../device_batched_gemm_e_permute_xdl.hpp     |  2 +-
 .../device_batched_gemm_gemm_xdl_cshuffle.hpp |  5 ++--
 .../impl/device_batched_gemm_multi_d_xdl.hpp  |  5 ++--
 ...ultiple_d_gemm_multiple_d_xdl_cshuffle.hpp |  5 ++--
 ...evice_batched_gemm_reduce_xdl_cshuffle.hpp |  2 +-
 ...gemm_softmax_gemm_permute_xdl_cshuffle.hpp |  5 ++--
 ...batched_gemm_softmax_gemm_xdl_cshuffle.hpp |  5 ++--
 .../device/impl/device_batched_gemm_xdl.hpp   |  2 +-
 ...ce_contraction_multiple_d_xdl_cshuffle.hpp |  5 ++--
 ...evice_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp |  2 +-
 .../device/impl/device_gemm_multiple_d_dl.hpp |  5 ++--
 ...gemm_multiple_d_layernorm_xdl_cshuffle.hpp |  5 ++--
 ...emm_multiple_d_multiple_r_xdl_cshuffle.hpp |  5 ++--
 .../device_gemm_multiple_d_xdl_cshuffle.hpp   |  5 ++--
 .../device/impl/device_gemm_xdl_cshuffle.hpp  |  3 ++-
 .../device_gemm_xdl_layernorm_cshuffle.hpp    |  3 ++-
 ...ed_contraction_multiple_d_xdl_cshuffle.hpp |  5 ++--
 ...nv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp |  2 +-
 ...bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp |  2 +-
 ...fwd_multiple_d_multiple_r_xdl_cshuffle.hpp |  5 ++--
 ...ouped_conv_fwd_multiple_d_xdl_cshuffle.hpp |  5 ++--
 .../device_grouped_gemm_multiple_d_dl.hpp     |  2 +-
 .../device/impl/device_grouped_gemm_xdl.hpp   |  2 +-
 ...evice_grouped_gemm_xdl_splitk_cshuffle.hpp |  2 +-
 ...e_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp |  2 +-
 .../gridwise_gemm_reduce_xdl_cshuffle_v1.hpp  |  2 +-
 .../grid/gridwise_gemm_xdl_cshuffle_v1.hpp    |  4 ++--
 ...ridwise_gemm_xdl_layernorm_cshuffle_v1.hpp |  2 +-
 .../grid/gridwise_gemm_xdlops_bwd_weight.hpp  |  2 +-
 .../gridwise_gemm_xdlops_skip_b_lds_v1.hpp    |  2 +-
 .../gpu/grid/gridwise_gemm_xdlops_v2r3.hpp    |  4 ++--
 .../gpu/grid/gridwise_gemm_xdlops_v2r4.hpp    |  2 +-
 .../gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp  |  2 +-
 .../gpu/grid/gridwise_gemm_xdlops_v3r1.hpp    |  2 +-
 .../gpu/grid/gridwise_gemm_xdlops_v3r2.hpp    |  2 +-
 .../gpu/grid/gridwise_gemm_xdlops_v3r3.hpp    |  2 +-
 include/ck/utility/amd_xdlops.hpp             |  2 +-
 script/cmake-ck-dev.sh                        |  2 +-
 script/cmake-ck-release.sh                    |  2 +-
 test/batched_gemm/CMakeLists.txt              |  9 +++++--
 test/batched_gemm_gemm/CMakeLists.txt         | 10 +++++---
 test/batched_gemm_reduce/CMakeLists.txt       |  9 +++++--
 test/batched_gemm_softmax_gemm/CMakeLists.txt | 10 +++++---
 .../CMakeLists.txt                            |  9 +++++--
 test/contraction/CMakeLists.txt               |  9 +++++--
 test/convnd_bwd_data/CMakeLists.txt           |  9 +++++--
 test/convnd_fwd/CMakeLists.txt                |  9 +++++--
 test/gemm_layernorm/CMakeLists.txt            |  9 +++++--
 test/gemm_split_k/CMakeLists.txt              |  9 +++++--
 test/grouped_convnd_bwd_weight/CMakeLists.txt | 11 ++++++---
 test/grouped_gemm/CMakeLists.txt              |  9 +++++--
 76 files changed, 346 insertions(+), 150 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index fbff349fc..8cfc5f804 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -695,8 +695,8 @@ pipeline {
                     }
                     agent{ label rocmnode("gfx908 || gfx90a") }
                     environment{
-                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a;gfx940" """
-                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a;gfx940" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ 
+                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" """
+                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ 
                     }
                     steps{
                         Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
diff --git a/example/02_gemm_bilinear/CMakeLists.txt b/example/02_gemm_bilinear/CMakeLists.txt
index eecec2437..7ea4564d1 100644
--- a/example/02_gemm_bilinear/CMakeLists.txt
+++ b/example/02_gemm_bilinear/CMakeLists.txt
@@ -1,6 +1,17 @@
-if(GPU_TARGETS MATCHES "gfx1100" OR GPU_TARGETS MATCHES "gfx1101" OR GPU_TARGETS MATCHES "gfx1102")
+list(APPEND gpu_list1 gfx1100 gfx1101 gfx1102)
+list(APPEND gpu_list2 gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list1 AND target EQUAL 0)
     add_example_executable(example_gemm_bilinear_wmma_fp16 gemm_bilinear_wmma_fp16.cpp)
-endif()
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+   set(target 1)
+ endif()
+endforeach()
+
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list2 AND target EQUAL 0)
     add_example_executable(example_gemm_bilinear_xdl_fp16 gemm_bilinear_xdl_fp16.cpp)
-endif()
+   set(target 1)
+ endif()
+endforeach()
diff --git a/example/03_gemm_bias_relu/CMakeLists.txt b/example/03_gemm_bias_relu/CMakeLists.txt
index 8834a910f..2f5cba924 100644
--- a/example/03_gemm_bias_relu/CMakeLists.txt
+++ b/example/03_gemm_bias_relu/CMakeLists.txt
@@ -1,3 +1,8 @@
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
     add_example_executable(example_gemm_bias_relu_xdl_fp16 gemm_bias_relu_xdl_fp16.cpp)
-endif()
\ No newline at end of file
+   set(target 1)
+ endif()
+endforeach()
diff --git a/example/04_gemm_add_add_fastgelu/CMakeLists.txt b/example/04_gemm_add_add_fastgelu/CMakeLists.txt
index a706830b6..447fa9871 100644
--- a/example/04_gemm_add_add_fastgelu/CMakeLists.txt
+++ b/example/04_gemm_add_add_fastgelu/CMakeLists.txt
@@ -1,4 +1,7 @@
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
     add_custom_target(example_gemm_add_add_fastgelu_xdl)
 
     add_example_executable(example_gemm_add_add_fastgelu_xdl_bf16 gemm_add_add_fastgelu_xdl_bf16.cpp)
@@ -16,4 +19,6 @@ if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS M
         add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_int4)
     endif(USE_BITINT_EXTENSION_INT4)
     add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_int8)
-endif()
\ No newline at end of file
+   set(target 1)
+ endif()
+endforeach()
\ No newline at end of file
diff --git a/example/09_convnd_fwd/CMakeLists.txt b/example/09_convnd_fwd/CMakeLists.txt
index 1bcf2d148..90104c163 100644
--- a/example/09_convnd_fwd/CMakeLists.txt
+++ b/example/09_convnd_fwd/CMakeLists.txt
@@ -1,12 +1,17 @@
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
     add_example_executable(example_convnd_fwd_xdl_fp32 convnd_fwd_xdl_fp32.cpp)
     add_example_executable(example_convnd_fwd_xdl_fp16 convnd_fwd_xdl_fp16.cpp)
     add_example_executable(example_convnd_fwd_xdl_bf16 convnd_fwd_xdl_bf16.cpp)
     add_example_executable(example_convnd_fwd_xdl_int8 convnd_fwd_xdl_int8.cpp)
     # FIXME: re-enable this exampe as test when SWDEV-335738 is fixed
     add_example_executable_no_testing(example_convnd_fwd_xdl_fp64 convnd_fwd_xdl_fp64.cpp)
-endif()
+   set(target 1)
+ endif()
+endforeach()
+
 add_example_executable(example_convnd_fwd_dl_fp16 convnd_fwd_dl_fp16.cpp)
 add_example_executable(example_convnd_fwd_dl_fp32 convnd_fwd_dl_fp32.cpp)
 add_example_executable(example_convnd_fwd_dl_int8 convnd_fwd_dl_int8.cpp)
-
diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/CMakeLists.txt b/example/10_convnd_fwd_multiple_d_multiple_reduce/CMakeLists.txt
index de26462f6..9577b4569 100644
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/CMakeLists.txt
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/CMakeLists.txt
@@ -1,4 +1,7 @@
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
    add_custom_target(example_convnd_fwd_reduce_xdl)
    add_example_executable(example_convnd_fwd_max_xdl_int8 convnd_fwd_max_xdl_int8.cpp)
    add_example_executable_no_testing(example_convnd_fwd_max_xdl_bf16 convnd_fwd_max_xdl_bf16.cpp)
@@ -12,4 +15,6 @@ if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS M
       add_example_executable(example_convnd_fwd_max_xdl_int4 convnd_fwd_max_xdl_int4.cpp)
       add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_int4)
    endif(USE_BITINT_EXTENSION_INT4)
-endif()
\ No newline at end of file
+   set(target 1)
+ endif()
+endforeach()
\ No newline at end of file
diff --git a/example/14_gemm_quantization/CMakeLists.txt b/example/14_gemm_quantization/CMakeLists.txt
index 584333e7b..72bdff5ab 100644
--- a/example/14_gemm_quantization/CMakeLists.txt
+++ b/example/14_gemm_quantization/CMakeLists.txt
@@ -2,7 +2,12 @@
 add_example_executable(example_gemm_dl_quantization_int8 gemm_dl_quantization_int8.cpp)
 
 # xdlops
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
    add_example_executable(example_gemm_xdl_bias_relu_quantization_int8 gemm_xdl_bias_relu_quantization_int8.cpp)
    add_example_executable(example_gemm_xdl_quantization_int8 gemm_xdl_quantization_int8.cpp)
-endif()
\ No newline at end of file
+   set(target 1)
+ endif()
+endforeach()
\ No newline at end of file
diff --git a/example/16_gemm_multi_d_multi_reduces/CMakeLists.txt b/example/16_gemm_multi_d_multi_reduces/CMakeLists.txt
index 0f1ca777c..a42b427c6 100644
--- a/example/16_gemm_multi_d_multi_reduces/CMakeLists.txt
+++ b/example/16_gemm_multi_d_multi_reduces/CMakeLists.txt
@@ -1,4 +1,7 @@
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
    add_custom_target(example_gemm_reduce_xdl)
    add_custom_target(example_gemm_reduce_xdl_max)
    add_custom_target(example_gemm_reduce_xdl_mean_meansquare)
@@ -39,4 +42,6 @@ if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS M
       add_example_executable(example_gemm_max_xdl_int4 gemm_max_xdl_int4.cpp)
       add_dependencies(example_gemm_reduce_xdl_max example_gemm_max_xdl_int4)
    endif()
-endif()
+   set(target 1)
+ endif()
+endforeach()
diff --git a/example/17_convnd_bwd_data/CMakeLists.txt b/example/17_convnd_bwd_data/CMakeLists.txt
index ed95946c4..8ab9f37bb 100644
--- a/example/17_convnd_bwd_data/CMakeLists.txt
+++ b/example/17_convnd_bwd_data/CMakeLists.txt
@@ -1,6 +1,11 @@
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
    add_example_executable(example_convnd_bwd_data_xdl_fp16 convnd_bwd_data_xdl_fp16.cpp)
    target_link_libraries(example_convnd_bwd_data_xdl_fp16 PRIVATE utility)
-endif()
+   set(target 1)
+ endif()
+endforeach()
 add_example_executable(example_convnd_bwd_data_dl_fp16 convnd_bwd_data_dl_fp16.cpp)
 target_link_libraries(example_convnd_bwd_data_dl_fp16 PRIVATE utility)
diff --git a/example/18_batched_gemm_reduce/CMakeLists.txt b/example/18_batched_gemm_reduce/CMakeLists.txt
index 0c3648dbf..94ed129dc 100644
--- a/example/18_batched_gemm_reduce/CMakeLists.txt
+++ b/example/18_batched_gemm_reduce/CMakeLists.txt
@@ -1,4 +1,8 @@
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
    add_example_executable(example_batched_gemm_reduce_xdl_fp16 batched_gemm_reduce_xdl_fp16.cpp)
-endif()
-
+   set(target 1)
+ endif()
+endforeach()
diff --git a/example/20_grouped_conv_bwd_weight/CMakeLists.txt b/example/20_grouped_conv_bwd_weight/CMakeLists.txt
index 0ee39ac84..db170decc 100644
--- a/example/20_grouped_conv_bwd_weight/CMakeLists.txt
+++ b/example/20_grouped_conv_bwd_weight/CMakeLists.txt
@@ -1,4 +1,7 @@
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
    add_custom_target(example_grouped_conv_bwd_weight)
 
    add_example_executable(example_grouped_conv_bwd_weight_xdl_fp16 grouped_conv_bwd_weight_xdl_fp16.cpp)
@@ -6,7 +9,9 @@ if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS M
 
    add_dependencies(example_grouped_conv_bwd_weight example_grouped_conv_bwd_weight_xdl_fp16
                                                  example_grouped_conv_bwd_weight_xdl_bf16)
-endif()
+   set(target 1)
+ endif()
+endforeach()
 
 add_custom_target(example_grouped_conv_bwd_weight_dl)
 
diff --git a/example/21_gemm_layernorm/CMakeLists.txt b/example/21_gemm_layernorm/CMakeLists.txt
index 7f974221b..ff870e7c6 100644
--- a/example/21_gemm_layernorm/CMakeLists.txt
+++ b/example/21_gemm_layernorm/CMakeLists.txt
@@ -1,6 +1,11 @@
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
    add_example_executable(example_gemm_bias_relu_add_layernorm_xdl_welford_fp16 gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp)
    add_example_executable(example_gemm_bias_relu_add_layernorm_xdl_naive_fp16 gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp)
    add_example_executable(example_gemm_layernorm_xdl_naive_fp16 gemm_layernorm_xdl_naive_fp16.cpp)
    add_example_executable(example_gemm_xdl_layernorm_naive_single_kernel_fp16 gemm_xdl_layernorm_naive_single_kernel_fp16.cpp)
-endif()
+   set(target 1)
+ endif()
+endforeach()
diff --git a/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt b/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
index 9780a64cc..1c07538b0 100644
--- a/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
+++ b/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
@@ -1,4 +1,9 @@
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list1 gfx908 gfx90a gfx940 gfx941 gfx942)
+list(APPEND gpu_list2 gfx1100 gfx1101 gfx1102)
+
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list1 AND target EQUAL 0)
    add_custom_target(example_grouped_conv_fwd_multiple_d)
 
    add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_fp16 grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp)
@@ -17,8 +22,15 @@ if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS M
    endif() # USE_BITINT_EXTENSION_INT4
    add_example_executable(example_grouped_conv_fwd_xdl_fp16 grouped_conv_fwd_xdl_fp16.cpp)
    add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_xdl_fp16)
-endif()
-if(GPU_TARGETS MATCHES "gfx1100" OR GPU_TARGETS MATCHES "gfx1101" OR GPU_TARGETS MATCHES "gfx1102")
-  add_example_executable(example_grouped_conv_fwd_bias_relu_add_wmma_fp16 grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp)
-  add_example_executable(example_grouped_conv_fwd_bias_relu_add_wmma_int8 grouped_conv_fwd_bias_relu_add_wmma_int8.cpp)
-endif()
+   set(target 1)
+ endif()
+endforeach()
+
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list2 AND target EQUAL 0)
+   add_example_executable(example_grouped_conv_fwd_bias_relu_add_wmma_fp16 grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp)
+   add_example_executable(example_grouped_conv_fwd_bias_relu_add_wmma_int8 grouped_conv_fwd_bias_relu_add_wmma_int8.cpp)
+   set(target 1)
+ endif()
+endforeach()
diff --git a/example/31_batched_gemm_gemm/CMakeLists.txt b/example/31_batched_gemm_gemm/CMakeLists.txt
index dd9aef94a..f8d139275 100644
--- a/example/31_batched_gemm_gemm/CMakeLists.txt
+++ b/example/31_batched_gemm_gemm/CMakeLists.txt
@@ -1,12 +1,23 @@
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list1 gfx908 gfx90a gfx940 gfx941 gfx942)
+list(APPEND gpu_list2 gfx908 gfx90a)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list1 AND target EQUAL 0)
    add_example_executable(example_batched_gemm_gemm_xdl_fp32 batched_gemm_gemm_xdl_fp32.cpp)
    add_example_executable(example_batched_gemm_gemm_xdl_fp16 batched_gemm_gemm_xdl_fp16.cpp)
    add_example_executable(example_batched_gemm_gemm_xdl_bf16 batched_gemm_gemm_xdl_bf16.cpp)
-   if(NOT GPU_TARGETS MATCHES "gfx940")
-      add_example_executable(example_batched_gemm_gemm_xdl_int8 batched_gemm_gemm_xdl_int8.cpp)
-   endif()
 
    if(USE_BITINT_EXTENSION_INT4)
       add_example_executable(example_batched_gemm_gemm_xdl_int4 batched_gemm_gemm_xdl_int4.cpp)
    endif(USE_BITINT_EXTENSION_INT4)
-endif()
\ No newline at end of file
+   set(target 1)
+ endif()
+endforeach()
+
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list2 AND target EQUAL 0)
+   add_example_executable(example_batched_gemm_gemm_xdl_int8 batched_gemm_gemm_xdl_int8.cpp)
+   set(target 1)
+ endif()
+endforeach()
\ No newline at end of file
diff --git a/example/35_splitK_gemm/CMakeLists.txt b/example/35_splitK_gemm/CMakeLists.txt
index f5a6ccb24..57ac33fc9 100644
--- a/example/35_splitK_gemm/CMakeLists.txt
+++ b/example/35_splitK_gemm/CMakeLists.txt
@@ -1,4 +1,7 @@
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
    add_custom_target(example_splitK_gemm_xdl)
    add_example_executable(example_splitK_gemm_xdl_fp32 splitK_gemm_xdl_fp32.cpp)
    add_example_executable(example_splitK_gemm_xdl_fp16 splitK_gemm_xdl_fp16.cpp)
@@ -15,4 +18,6 @@ if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS M
       add_example_executable(example_splitK_gemm_xdl_int4 splitK_gemm_xdl_int4.cpp)
       add_dependencies(example_splitK_gemm_xdl example_splitK_gemm_xdl_int4)
    endif()
-endif()
+   set(target 1)
+ endif()
+endforeach()
diff --git a/example/38_grouped_conv_bwd_data_multiple_d/CMakeLists.txt b/example/38_grouped_conv_bwd_data_multiple_d/CMakeLists.txt
index 472d59c77..de48093ac 100644
--- a/example/38_grouped_conv_bwd_data_multiple_d/CMakeLists.txt
+++ b/example/38_grouped_conv_bwd_data_multiple_d/CMakeLists.txt
@@ -1,8 +1,13 @@
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
    add_custom_target(example_grouped_conv_bwd_data)
    add_example_executable(example_grouped_conv_bwd_data_fp16 grouped_conv_bwd_data_fp16.cpp)
    add_example_executable(example_grouped_conv_bwd_data_bias_relu_fp16 grouped_conv_bwd_data_bias_relu_fp16.cpp)
 
    add_dependencies(example_grouped_conv_bwd_data example_grouped_conv_bwd_data_fp16)
    add_dependencies(example_grouped_conv_bwd_data example_grouped_conv_bwd_data_bias_relu_fp16)
-endif()
\ No newline at end of file
+   set(target 1)
+ endif()
+endforeach()
\ No newline at end of file
diff --git a/example/40_conv2d_fwd_quantization/CMakeLists.txt b/example/40_conv2d_fwd_quantization/CMakeLists.txt
index c12ab7a34..b82a013b5 100644
--- a/example/40_conv2d_fwd_quantization/CMakeLists.txt
+++ b/example/40_conv2d_fwd_quantization/CMakeLists.txt
@@ -1,9 +1,14 @@
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
    add_example_executable(example_conv2d_fwd_xdl_perlayer_quantization_int8 conv2d_fwd_xdl_perlayer_quantization_int8.cpp)
    add_example_executable(example_conv2d_fwd_xdl_perchannel_quantization_int8 conv2d_fwd_xdl_perchannel_quantization_int8.cpp)
    add_example_executable(example_conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8 conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp)
    add_example_executable(example_conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8 conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp)
-endif()
+   set(target 1)
+ endif()
+endforeach()
 # Conv perlayer quantization
 add_example_executable(example_conv2d_fwd_dl_perlayer_quantization_int8 conv2d_fwd_dl_perlayer_quantization_int8.cpp)
 
diff --git a/example/41_grouped_conv_conv_fwd/CMakeLists.txt b/example/41_grouped_conv_conv_fwd/CMakeLists.txt
index 14f5c284a..0c9df707b 100644
--- a/example/41_grouped_conv_conv_fwd/CMakeLists.txt
+++ b/example/41_grouped_conv_conv_fwd/CMakeLists.txt
@@ -1,11 +1,22 @@
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list1 gfx908 gfx90a gfx940 gfx941 gfx942)
+list(APPEND gpu_list2 gfx908 gfx90a)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list1 AND target EQUAL 0)
    add_example_executable(example_grouped_conv_conv_fwd_xdl_fp32 grouped_conv_conv_fwd_xdl_fp32.cpp)
    add_example_executable(example_grouped_conv_conv_fwd_xdl_fp16 grouped_conv_conv_fwd_xdl_fp16.cpp)
    add_example_executable(example_grouped_conv_conv_fwd_xdl_bf16 grouped_conv_conv_fwd_xdl_bf16.cpp)
-   if(NOT GPU_TARGETS MATCHES "gfx940")
-      add_example_executable(example_grouped_conv_conv_fwd_xdl_int8 grouped_conv_conv_fwd_xdl_int8.cpp)
-   endif()
    if(USE_BITINT_EXTENSION_INT4)
       add_example_executable(example_grouped_conv_conv_fwd_xdl_int4 grouped_conv_conv_fwd_xdl_int4.cpp)
    endif(USE_BITINT_EXTENSION_INT4)
-endif()
+   set(target 1)
+ endif()
+endforeach()
+
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list2 AND target EQUAL 0)
+   add_example_executable(example_grouped_conv_conv_fwd_xdl_int8 grouped_conv_conv_fwd_xdl_int8.cpp)
+   set(target 1)
+ endif()
+endforeach()
diff --git a/example/47_gemm_bias_softmax_gemm_permute/CMakeLists.txt b/example/47_gemm_bias_softmax_gemm_permute/CMakeLists.txt
index b60789ea3..14432f6e2 100644
--- a/example/47_gemm_bias_softmax_gemm_permute/CMakeLists.txt
+++ b/example/47_gemm_bias_softmax_gemm_permute/CMakeLists.txt
@@ -1,3 +1,8 @@
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
    add_example_executable(example_gemm_bias_softmax_gemm_permute gemm_bias_softmax_gemm_permute.cpp)
-endif()
+   set(target 1)
+ endif()
+endforeach()
diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp
index 314e6a813..d7ce449bb 100644
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -31,7 +31,8 @@
 #ifndef __HIP_DEVICE_COMPILE__ // for host code
 #define CK_BUFFER_RESOURCE_3RD_DWORD -1
 #elif defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) || defined(__gfx908__) || \
-    defined(__gfx90a__) || defined(__gfx940__) // for GPU code
+    defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) ||                          \
+    defined(__gfx942__) // for GPU code
 #define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000
 #elif defined(__gfx1030__) // for GPU code
 #define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
@@ -44,7 +45,7 @@
 #elif defined(__gfx803__) || defined(__gfx900__) // for GPU code
 #define CK_USE_AMD_V_MAC_F32
 #elif defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__) || \
-    defined(__gfx940__) // for GPU code
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) // for GPU code
 #define CK_USE_AMD_V_FMAC_F32
 #define CK_USE_AMD_V_DOT2_F32_F16
 #define CK_USE_AMD_V_DOT4_I32_I8
@@ -53,15 +54,16 @@
 // MFMA instruction
 #ifndef __HIP_DEVICE_COMPILE__ // for host code
 #define CK_USE_AMD_MFMA
-#elif defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) // for GPU code
+#elif defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \
+    defined(__gfx942__) // for GPU code
 #define CK_USE_AMD_MFMA
 #endif
 
-#if(defined(__gfx90a__) || defined(__gfx940__))
+#if(defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
 #define CK_USE_AMD_MFMA_BF16_1K_OP
 #endif
 
-#if defined(__gfx940__)
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
 #define CK_USE_AMD_MFMA_GFX940
 #endif
 
@@ -84,13 +86,15 @@
 // buffer atomic add: floating point
 #ifndef __HIP_DEVICE_COMPILE__ // for host code
 #define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1
-#elif defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) // for GPU code
+#elif defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \
+    defined(__gfx942__) // for GPU code
 #define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1
 #else // for GPU code
 #define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 0
 #endif
 
-#if(defined(__gfx90a__) || defined(__gfx940__)) // for GPU code
+#if(defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \
+    defined(__gfx942__)) // for GPU code
 #define CK_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 1
 #else
 #define CK_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 0
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_waveletmodel_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_waveletmodel_cshuffle.hpp
index d00e19b44..d0de101c4 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_waveletmodel_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_waveletmodel_cshuffle.hpp
@@ -48,7 +48,7 @@ __global__ void
             const Block2ETileMap block_2_etile_map)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__))
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
@@ -418,7 +418,8 @@ struct DeviceGemm_Xdl_WaveletModel_CShuffle : public DeviceGemm<ALayout,
     static bool IsSupportedArgument(const Argument& arg)
     {
         if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
-             ck::get_device_name() == "gfx940"))
+             ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx941" ||
+             ck::get_device_name() == "gfx942"))
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
index f26974ccb..ea958a4eb 100644
--- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
@@ -136,7 +136,7 @@ __global__ void
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx1030__) ||           \
     defined(__gfx90a__) || defined(__gfx908__) || defined(__gfx940__) || defined(__gfx1100__) || \
-    defined(__gfx1101__) || defined(__gfx1102__))
+    defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx941__) || defined(__gfx942__))
     // offset base pointer for each work-group
     const index_t num_blocks_per_batch =
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
@@ -713,7 +713,8 @@ struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
         if(!(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx1030" ||
              ck::get_device_name() == "gfx90a" || ck::get_device_name() == "gfx908" ||
              ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx1100" ||
-             ck::get_device_name() == "gfx1101" || ck::get_device_name() == "gfx1102"))
+             ck::get_device_name() == "gfx1101" || ck::get_device_name() == "gfx1102" ||
+             ck::get_device_name() == "gfx941" || ck::get_device_name() == "gfx942"))
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
index 30e29cc8e..ab6d2716c 100644
--- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
@@ -44,7 +44,7 @@ __global__ void
             const CElementwiseOperation c_element_op)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__))
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     const index_t block_id = get_block_1d_id();
@@ -682,7 +682,8 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
     static bool IsSupportedArgument(const Argument& arg)
     {
         if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
-             ck::get_device_name() == "gfx940"))
+             ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx941" ||
+             ck::get_device_name() == "gfx942"))
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d_xdl_cshuffle.hpp
index 95517b107..f849ac799 100644
--- a/include/ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d_xdl_cshuffle.hpp
@@ -57,7 +57,7 @@ __global__ void
             const Block2ETileMap block_2_etile_map)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__))
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     const index_t num_blocks_per_batch =
@@ -940,7 +940,8 @@ struct DeviceSplitKContractionMultipleD_Xdl_CShuffle
     static bool IsSupportedArgument(const Argument& arg)
     {
         if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
-             ck::get_device_name() == "gfx940"))
+             ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx941" ||
+             ck::get_device_name() == "gfx942"))
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp
index bc86e78b6..46e71240c 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp
@@ -57,7 +57,7 @@ __global__ void
             const Block2ETileMap block_2_etile_map)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__))
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     const index_t num_blocks_per_batch =
@@ -841,7 +841,8 @@ struct DeviceBatchedContractionMultipleD_Xdl_CShuffle
     static bool IsSupportedArgument(const Argument& arg)
     {
         if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
-             ck::get_device_name() == "gfx940"))
+             ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx941" ||
+             ck::get_device_name() == "gfx942"))
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_e_permute_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_e_permute_xdl.hpp
index 201844586..fc080df5f 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_e_permute_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_e_permute_xdl.hpp
@@ -75,7 +75,7 @@ __global__ void
                                           const Block2ETileMap block_2_etile_map)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__))
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
     const index_t num_blocks_per_batch =
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
     const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp
index 09220813b..7012584aa 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp
@@ -61,7 +61,7 @@ __global__ void
             const ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__))
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
     const index_t num_blocks_per_batch =
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
@@ -590,7 +590,8 @@ struct DeviceBatchedGemmGemm_Xdl_CShuffle : public DeviceBatchedGemmGemm<ALayout
     static bool IsSupportedArgument(const Argument& arg)
     {
         if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
-             ck::get_device_name() == "gfx940"))
+             ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx941" ||
+             ck::get_device_name() == "gfx942"))
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp
index 2d91c620c..e00deaf00 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp
@@ -84,7 +84,7 @@ __global__ void
 {
 
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__))
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
     const index_t num_blocks_per_batch =
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
     const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
@@ -581,7 +581,8 @@ struct DeviceBatchedGemmMultiD_Xdl : public DeviceBatchedGemmMultiD<ALayout,
     static bool IsSupportedArgument(const Argument& arg)
     {
         if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
-             ck::get_device_name() == "gfx940"))
+             ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx941" ||
+             ck::get_device_name() == "gfx942"))
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp
index e39d8f069..d259ebffa 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp
@@ -69,7 +69,7 @@ __global__ void
             const ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__))
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
     const index_t num_blocks_per_batch =
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
@@ -806,7 +806,8 @@ struct DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle
     static bool IsSupportedArgument(const Argument& arg)
     {
         if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
-             ck::get_device_name() == "gfx940"))
+             ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx941" ||
+             ck::get_device_name() == "gfx942"))
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp
index eff503f65..1d52416d4 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp
@@ -60,7 +60,7 @@ __global__ void
             const Block2CTileMap block_2_ctile_map)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__))
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
     const index_t num_blocks_per_batch =
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
     const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
index 3fad319e9..ca72bcdd2 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
@@ -68,7 +68,7 @@ __global__ void
             const C0MatrixMask c0_matrix_mask)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__))
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
     const index_t num_blocks_per_batch =
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
@@ -724,7 +724,8 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
 #endif
 
         if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
-             ck::get_device_name() == "gfx940"))
+             ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx941" ||
+             ck::get_device_name() == "gfx942"))
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp
index 9310d0752..2405640ca 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp
@@ -63,7 +63,7 @@ __global__ void
             const C0MatrixMask c0_matrix_mask)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__))
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
     const index_t num_blocks_per_batch =
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
@@ -614,7 +614,8 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
     static bool IsSupportedArgument(const Argument& arg)
     {
         if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
-             ck::get_device_name() == "gfx940"))
+             ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx941" ||
+             ck::get_device_name() == "gfx942"))
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp
index 87c58f371..b6a0567fe 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp
@@ -53,7 +53,7 @@ __global__ void
         kernel_batched_gemm_xdlops_v2r3(const typename DeviceOp::Argument karg)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__))
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
     const index_t num_blocks_per_batch =
         __builtin_amdgcn_readfirstlane(get_grid_size() / karg.Batch);
     const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp
index b6f38698c..dd57c9089 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp
@@ -53,7 +53,7 @@ __global__ void
             const Block2ETileMap block_2_etile_map)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__))
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
@@ -583,7 +583,8 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
     static bool IsSupportedArgument(const Argument& arg)
     {
         if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
-             ck::get_device_name() == "gfx940"))
+             ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx941" ||
+             ck::get_device_name() == "gfx942"))
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp
index fd8c88da9..bc6582835 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp
@@ -56,7 +56,7 @@ __global__ void
             const Block2CTileMap block_2_ctile_map)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__))
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
     const index_t num_blocks_per_batch =
         __builtin_amdgcn_readfirstlane(get_grid_size() / num_batches);
     const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp
index 22f66931f..ad51096db 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp
@@ -52,7 +52,7 @@ __global__ void
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) ||             \
     defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx1030__) || defined(__gfx1100__) || \
-    defined(__gfx1101__) || defined(__gfx1102__))
+    defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx941__) || defined(__gfx942__))
 
     constexpr index_t shared_block_size =
         GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(ABDataType);
@@ -555,7 +555,8 @@ struct DeviceGemmMultipleD_Dl : public DeviceGemmMultipleD<ALayout,
         if(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx908" ||
            ck::get_device_name() == "gfx90a" || ck::get_device_name() == "gfx1030" ||
            ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx1100" ||
-           ck::get_device_name() == "gfx1101" || ck::get_device_name() == "gfx1102")
+           ck::get_device_name() == "gfx1101" || ck::get_device_name() == "gfx1102" ||
+           ck::get_device_name() == "gfx941" || ck::get_device_name() == "gfx942")
         {
             return GridwiseGemm::CheckValidity(
                 arg.a_grid_desc_k0_m_k1_, arg.b_grid_desc_k0_n_k1_, arg.e_grid_desc_m_n_);
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_layernorm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_layernorm_xdl_cshuffle.hpp
index 1ab836247..5afeaf98c 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_layernorm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_layernorm_xdl_cshuffle.hpp
@@ -64,7 +64,7 @@ __global__ void
             index_t NRaw)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__))
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
     __shared__ char p_shared[GridwiseGemmWelford::GetSharedMemoryNumberOfByte()];
 
     GridwiseGemmWelford::template Run<HasMainKBlockLoop>(
@@ -856,7 +856,8 @@ struct DeviceGemmMultipleDLayernorm_Xdl_CShuffle
     static bool IsSupportedArgument(const Argument& arg)
     {
         if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
-             ck::get_device_name() == "gfx940"))
+             ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx941" ||
+             ck::get_device_name() == "gfx942"))
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp
index be174e599..3a0be837b 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp
@@ -61,7 +61,7 @@ __global__ void
             const Block2ETileMap block_2_etile_map)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__))
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
@@ -556,7 +556,8 @@ struct DeviceGemmMultipleDMultipleR_Xdl_CShuffle
     static bool IsSupportedArgument(const Argument& arg)
     {
         if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
-             ck::get_device_name() == "gfx940"))
+             ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx941" ||
+             ck::get_device_name() == "gfx942"))
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp
index bd5be99f8..d4ecea7bb 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp
@@ -52,7 +52,7 @@ __global__ void
                                             const Block2ETileMap block_2_etile_map)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__))
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
@@ -492,7 +492,8 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
     static bool IsSupportedArgument(const Argument& arg)
     {
         if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
-             ck::get_device_name() == "gfx940"))
+             ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx941" ||
+             ck::get_device_name() == "gfx942"))
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp
index a3dda82aa..b68ecfe7b 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp
@@ -189,7 +189,8 @@ struct DeviceGemm_Xdl_CShuffle : public DeviceGemm<ALayout,
     static bool IsSupportedArgument(const Argument& arg)
     {
         if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
-             ck::get_device_name() == "gfx940"))
+             ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx941" ||
+             ck::get_device_name() == "gfx942"))
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp
index 14ac5420a..a2403f109 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp
@@ -649,7 +649,8 @@ struct DeviceGemmLayerNorm_Xdl_CShuffle : public BaseOperator
     static bool IsSupportedArgument(const Argument& arg)
     {
         if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
-             ck::get_device_name() == "gfx940"))
+             ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx941" ||
+             ck::get_device_name() == "gfx942"))
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp
index 5775ff397..4b1cc65ba 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp
@@ -38,7 +38,7 @@ __global__ void
             const CDEElementwiseOperation cde_element_op)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__))
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     const index_t block_id = get_block_1d_id();
@@ -706,7 +706,8 @@ struct DeviceGroupedContractionMultipleD_Xdl_CShuffle
     static bool IsSupportedArgument(const Argument& arg)
     {
         if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
-             ck::get_device_name() == "gfx940"))
+             ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx941" ||
+             ck::get_device_name() == "gfx942"))
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
index 3f6238d21..81a4d6927 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
@@ -131,7 +131,7 @@ __global__ void
             const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__))
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
     // offset base pointer for each work-group
     const index_t num_blocks_per_batch =
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp
index 3bcc7bd64..4f4e0d576 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp
@@ -79,7 +79,7 @@ __global__ void
             const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__))
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
     const index_t num_blocks_per_batch =
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
     const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp
index 71e4e28bf..af8e0e027 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp
@@ -156,7 +156,7 @@ __global__ void
             const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__))
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
     const index_t num_blocks_per_batch =
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
     const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
@@ -811,7 +811,8 @@ struct DeviceGroupedConvFwdMultipleDMultipleR_Xdl_CShuffle
                 return false;
             }
         }
-        else if(get_device_name() == "gfx90a" || get_device_name() == "gfx940")
+        else if(get_device_name() == "gfx90a" || get_device_name() == "gfx940" ||
+                get_device_name() == "gfx941" || get_device_name() == "gfx942")
         {
             if constexpr(!(is_same_v<AccDataType, float> || is_same_v<AccDataType, float> ||
                            is_same_v<AccDataType, int32_t> || is_same_v<AccDataType, double>))
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp
index 02458bf02..a9294f034 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp
@@ -136,7 +136,7 @@ __global__ void
             const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__))
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
     // offset base pointer for each work-group
     const index_t num_blocks_per_batch =
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
@@ -685,7 +685,8 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
                 return false;
             }
         }
-        else if(get_device_name() == "gfx90a" || get_device_name() == "gfx940")
+        else if(get_device_name() == "gfx90a" || get_device_name() == "gfx940" ||
+                get_device_name() == "gfx941" || get_device_name() == "gfx942")
         {
             if constexpr(!(is_same_v<AccDataType, float> || is_same_v<AccDataType, float> ||
                            is_same_v<AccDataType, int32_t> || is_same_v<AccDataType, double>))
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp
index 22be58259..0190b3cee 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp
@@ -41,7 +41,7 @@ __global__ void
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) ||              \
     defined(__gfx90a__) || defined(__gfx1030__) || defined(__gfx1100__) || defined(__gfx1101__) || \
-    defined(__gfx1102__) || defined(__gfx940__))
+    defined(__gfx1102__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     const index_t block_id = get_block_1d_id();
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp
index 390004756..bc9b3a4dc 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp
@@ -39,7 +39,7 @@ __global__ void
                                 const CDEElementwiseOperation c_element_op)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__))
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     const index_t block_id = get_block_1d_id();
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
index 1ac9969f8..74f38b9db 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
@@ -35,7 +35,7 @@ __global__ void
                                        const index_t group_count)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__))
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
     constexpr index_t shared_size = GridwiseGemm::GetSharedMemoryNumberOfByte();
     __shared__ uint8_t p_shared[shared_size];
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp
index c5c09e909..533559adf 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp
@@ -67,7 +67,7 @@ __global__ void
             const Block2CTileMap block_2_ctile_map)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__))
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp
index bb3e6a80b..3d1fbb73b 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp
@@ -55,7 +55,7 @@ __global__ void
             const Block2CTileMap block_2_ctile_map)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940))
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
index d5552656e..7f1fb21d3 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
@@ -25,7 +25,7 @@ __global__ void
         kernel_gemm_xdl_cshuffle_v1(typename GridwiseGemm::Argument karg)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__))
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     GridwiseGemm::template Run<HasMainKBlockLoop>(
@@ -46,7 +46,7 @@ __global__ void
                                     typename GridwiseGemm::Problem problem)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__))
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid, p_b_grid, p_c_grid, p_shared, problem);
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp
index d805c9fa2..867535009 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp
@@ -58,7 +58,7 @@ __global__ void
             const Block2CTileMap block_2_ctile_map)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__))
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     // TODO ANT: separate into MMA + Epilogue
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp
index da7ad1cac..0fc18bb92 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp
@@ -166,7 +166,7 @@ __global__ void
                                       const CBlockClusterAdaptor c_block_cluster_adaptor)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__))
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_skip_b_lds_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_skip_b_lds_v1.hpp
index e9881d645..b12bcee0f 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_skip_b_lds_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_skip_b_lds_v1.hpp
@@ -45,7 +45,7 @@ __global__ void
             const Block2CTileMap block_2_ctile_map)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__))
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     GridwiseGemm::template Run<HasMainK0BlockLoop>(p_a_grid,
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
index bd18fdb10..7b8bbd301 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
@@ -36,7 +36,7 @@ __global__ void
                                 const CGridDesc_M_N c_grid_desc_m_n)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__))
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
@@ -64,7 +64,7 @@ __global__ void
         kernel_gemm_xdlops_v2r3(const typename GridwiseGemm::Argument karg)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__))
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     const auto a_grid_desc_k0_m_k1 =
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp
index 33a4f2b2c..19fbee727 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp
@@ -43,7 +43,7 @@ __global__ void
                                 const CBlockClusterAdaptor c_block_cluster_adaptor)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__))
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
     constexpr index_t shared_block_size =
         GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
index a4b320ddf..4cd70cc90 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
@@ -31,7 +31,7 @@ __global__ void
                                              const Block2CTileMap& b2c_map)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__))
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
     constexpr index_t shared_size = GridwiseGemm::GetSharedMemoryNumberOfByte();
 
     __shared__ uint8_t p_shared[shared_size];
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp
index df543c063..d090ba54d 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp
@@ -47,7 +47,7 @@ __global__ void
             const Block2CTileMap block_2_ctile_map)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__))
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     GridwiseGemm::template Run<HasMainK0BlockLoop>(
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp
index ec98fc9c9..2c09e80fd 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp
@@ -50,7 +50,7 @@ __global__ void
             const Block2CTileMap block_2_ctile_map)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__))
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     GridwiseGemm::template Run<HasMainKBlockLoop>(
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp
index 3a752dd74..c1bc6a8fe 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp
@@ -54,7 +54,7 @@ __global__ void
             const Block2CTileMap block_2_ctile_map)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__))
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     GridwiseGemm::template Run<HasMainKBlockLoop>(
diff --git a/include/ck/utility/amd_xdlops.hpp b/include/ck/utility/amd_xdlops.hpp
index f4a04e281..d00b7cd07 100644
--- a/include/ck/utility/amd_xdlops.hpp
+++ b/include/ck/utility/amd_xdlops.hpp
@@ -344,7 +344,7 @@ struct intrin_mfma_f64_16x16x4f64<16, 16>
     template <class FloatC>
     __device__ static void Run(const double& reg_a, const double& reg_b, FloatC& reg_c)
     {
-#if defined(__gfx90a__) || defined(__gfx940__)
+#if defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
         reg_c.template AsType<double4_t>()(Number<0>{}) = __builtin_amdgcn_mfma_f64_16x16x4f64(
             reg_a, reg_b, reg_c.template AsType<double4_t>()[Number<0>{}], 0, 0, 0);
 #else
diff --git a/script/cmake-ck-dev.sh b/script/cmake-ck-dev.sh
index 426f68d44..33352a622 100755
--- a/script/cmake-ck-dev.sh
+++ b/script/cmake-ck-dev.sh
@@ -12,7 +12,7 @@ cmake
 -save-temps=$PWD"                                                                                 \
 -D CMAKE_BUILD_TYPE=Release                                                                       \
 -D BUILD_DEV=ON                                                                                   \
--D GPU_TARGETS="gfx908;gfx90a;gfx940"                                                             \
+-D GPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942"                                               \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                                 \
 -D USE_BITINT_EXTENSION_INT4=OFF                                                                  \
 ${MY_PROJECT_SOURCE}
diff --git a/script/cmake-ck-release.sh b/script/cmake-ck-release.sh
index 787eabbf9..03d422478 100755
--- a/script/cmake-ck-release.sh
+++ b/script/cmake-ck-release.sh
@@ -11,7 +11,7 @@ cmake
 -D CMAKE_CXX_FLAGS="-O3"                                                                          \
 -D CMAKE_BUILD_TYPE=Release                                                                       \
 -D BUILD_DEV=OFF                                                                                  \
--D GPU_TARGETS="gfx908;gfx90a;gfx940"                                                             \
+-D GPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942"                                               \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                                 \
 -D USE_BITINT_EXTENSION_INT4=OFF                                                                  \
 ${MY_PROJECT_SOURCE}
diff --git a/test/batched_gemm/CMakeLists.txt b/test/batched_gemm/CMakeLists.txt
index 5552a6c98..12142c33e 100644
--- a/test/batched_gemm/CMakeLists.txt
+++ b/test/batched_gemm/CMakeLists.txt
@@ -1,4 +1,7 @@
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
    add_test_executable(test_batched_gemm_fp16 batched_gemm_fp16.cpp)
    target_link_libraries(test_batched_gemm_fp16 PRIVATE utility)
    target_link_libraries(test_batched_gemm_fp16 PRIVATE device_batched_gemm_instance)
@@ -14,4 +17,6 @@ if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS M
    add_test_executable(test_batched_gemm_int8 batched_gemm_int8.cpp)
    target_link_libraries(test_batched_gemm_int8 PRIVATE utility)
    target_link_libraries(test_batched_gemm_int8 PRIVATE device_batched_gemm_instance)
-endif()
\ No newline at end of file
+   set(target 1)
+ endif()
+endforeach()
\ No newline at end of file
diff --git a/test/batched_gemm_gemm/CMakeLists.txt b/test/batched_gemm_gemm/CMakeLists.txt
index eff339175..858efd837 100644
--- a/test/batched_gemm_gemm/CMakeLists.txt
+++ b/test/batched_gemm_gemm/CMakeLists.txt
@@ -1,7 +1,11 @@
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
    add_custom_target(test_batched_gemm_gemm)
-
    add_gtest_executable(test_batched_gemm_gemm_fp16 test_batched_gemm_gemm_fp16.cpp)
    target_link_libraries(test_batched_gemm_gemm_fp16 PRIVATE utility device_batched_gemm_gemm_instance)
    add_dependencies(test_batched_gemm_gemm test_batched_gemm_gemm_fp16)
-endif()
\ No newline at end of file
+   set(target 1)
+ endif()
+endforeach()
\ No newline at end of file
diff --git a/test/batched_gemm_reduce/CMakeLists.txt b/test/batched_gemm_reduce/CMakeLists.txt
index 1a8d7112a..0710f4647 100644
--- a/test/batched_gemm_reduce/CMakeLists.txt
+++ b/test/batched_gemm_reduce/CMakeLists.txt
@@ -1,5 +1,10 @@
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
    add_test_executable(test_batched_gemm_reduce_fp16 batched_gemm_reduce_fp16.cpp)
    target_link_libraries(test_batched_gemm_reduce_fp16 PRIVATE utility)
    target_link_libraries(test_batched_gemm_reduce_fp16 PRIVATE device_batched_gemm_reduce_instance)
-endif()
+   set(target 1)
+ endif()
+endforeach()
diff --git a/test/batched_gemm_softmax_gemm/CMakeLists.txt b/test/batched_gemm_softmax_gemm/CMakeLists.txt
index dee0bb56a..984cc3c16 100644
--- a/test/batched_gemm_softmax_gemm/CMakeLists.txt
+++ b/test/batched_gemm_softmax_gemm/CMakeLists.txt
@@ -1,7 +1,11 @@
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
    add_custom_target(test_batched_gemm_softmax_gemm)
-
    add_gtest_executable(test_batched_gemm_softmax_gemm_fp16 test_batched_gemm_softmax_gemm_fp16.cpp)
    target_link_libraries(test_batched_gemm_softmax_gemm_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_instance)
    add_dependencies(test_batched_gemm_softmax_gemm test_batched_gemm_softmax_gemm_fp16)
-endif()
\ No newline at end of file
+   set(target 1)
+ endif()
+endforeach()
\ No newline at end of file
diff --git a/test/batched_gemm_softmax_gemm_permute/CMakeLists.txt b/test/batched_gemm_softmax_gemm_permute/CMakeLists.txt
index cb68af06c..4d04be3fa 100644
--- a/test/batched_gemm_softmax_gemm_permute/CMakeLists.txt
+++ b/test/batched_gemm_softmax_gemm_permute/CMakeLists.txt
@@ -1,4 +1,7 @@
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
    add_custom_target(test_batched_gemm_softmax_gemm_permute)
 
    add_gtest_executable(test_batched_gemm_softmax_gemm_permute_fp16 test_batched_gemm_softmax_gemm_permute_fp16.cpp)
@@ -14,4 +17,6 @@ if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS M
    target_link_libraries(test_batched_gemm_bias_softmax_gemm_permute_bf16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
    add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_bias_softmax_gemm_permute_fp16)
    add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_bias_softmax_gemm_permute_bf16)
-endif()
\ No newline at end of file
+   set(target 1)
+ endif()
+endforeach()
\ No newline at end of file
diff --git a/test/contraction/CMakeLists.txt b/test/contraction/CMakeLists.txt
index ec44151f5..1f6e0ed34 100644
--- a/test/contraction/CMakeLists.txt
+++ b/test/contraction/CMakeLists.txt
@@ -1,6 +1,11 @@
 add_gtest_executable(test_contraction test_contraction.cpp)
 target_link_libraries(test_contraction PRIVATE utility device_contraction_bilinear_instance device_contraction_scale_instance)
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
     add_gtest_executable(test_contraction_interface test_contraction_interface.cpp)
     target_link_libraries(test_contraction_interface PRIVATE utility device_contraction_bilinear_instance device_contraction_scale_instance)
-endif()
+   set(target 1)
+ endif()
+endforeach()
diff --git a/test/convnd_bwd_data/CMakeLists.txt b/test/convnd_bwd_data/CMakeLists.txt
index 4bfd21945..f734b46f5 100644
--- a/test/convnd_bwd_data/CMakeLists.txt
+++ b/test/convnd_bwd_data/CMakeLists.txt
@@ -1,4 +1,9 @@
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
    add_gtest_executable(test_convnd_bwd_data convnd_bwd_data.cpp)
    target_link_libraries(test_convnd_bwd_data PRIVATE utility device_conv1d_bwd_data_instance device_conv2d_bwd_data_instance device_conv3d_bwd_data_instance)
-endif()
\ No newline at end of file
+   set(target 1)
+ endif()
+endforeach()
\ No newline at end of file
diff --git a/test/convnd_fwd/CMakeLists.txt b/test/convnd_fwd/CMakeLists.txt
index 058f99026..745aceffc 100644
--- a/test/convnd_fwd/CMakeLists.txt
+++ b/test/convnd_fwd/CMakeLists.txt
@@ -1,4 +1,9 @@
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
    add_gtest_executable(test_convnd_fwd convnd_fwd.cpp)
    target_link_libraries(test_convnd_fwd PRIVATE utility device_conv2d_fwd_instance)
-endif()
+   set(target 1)
+ endif()
+endforeach()
diff --git a/test/gemm_layernorm/CMakeLists.txt b/test/gemm_layernorm/CMakeLists.txt
index b2a5178ac..56b8d7737 100644
--- a/test/gemm_layernorm/CMakeLists.txt
+++ b/test/gemm_layernorm/CMakeLists.txt
@@ -1,6 +1,11 @@
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
    add_custom_target(test_gemm_layernorm)
    add_gtest_executable(test_gemm_add_relu_add_layernorm_fp16 test_gemm_add_relu_add_layernorm_fp16.cpp)
    target_link_libraries(test_gemm_add_relu_add_layernorm_fp16 PRIVATE utility device_gemm_add_relu_add_layernorm_instance)
    add_dependencies(test_gemm_layernorm test_gemm_add_relu_add_layernorm_fp16)
-endif()
+   set(target 1)
+ endif()
+endforeach()
diff --git a/test/gemm_split_k/CMakeLists.txt b/test/gemm_split_k/CMakeLists.txt
index 2274854f8..caf30fca5 100644
--- a/test/gemm_split_k/CMakeLists.txt
+++ b/test/gemm_split_k/CMakeLists.txt
@@ -1,4 +1,9 @@
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
    add_gtest_executable(test_gemm_splitk test_gemm_splitk.cpp)
    target_link_libraries(test_gemm_splitk PRIVATE utility device_gemm_splitk_instance)
-endif()
+   set(target 1)
+ endif()
+endforeach()
diff --git a/test/grouped_convnd_bwd_weight/CMakeLists.txt b/test/grouped_convnd_bwd_weight/CMakeLists.txt
index da554f677..8be872bc6 100644
--- a/test/grouped_convnd_bwd_weight/CMakeLists.txt
+++ b/test/grouped_convnd_bwd_weight/CMakeLists.txt
@@ -1,4 +1,9 @@
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
    add_gtest_executable(test_grouped_convnd_bwd_weight grouped_convnd_bwd_weight.cpp)
-    target_link_libraries(test_grouped_convnd_bwd_weight PRIVATE utility device_grouped_conv1d_bwd_weight_instance device_grouped_conv2d_bwd_weight_instance device_grouped_conv3d_bwd_weight_instance)
-endif()
\ No newline at end of file
+   target_link_libraries(test_grouped_convnd_bwd_weight PRIVATE utility device_grouped_conv1d_bwd_weight_instance device_grouped_conv2d_bwd_weight_instance device_grouped_conv3d_bwd_weight_instance)
+   set(target 1)
+ endif()
+endforeach()
\ No newline at end of file
diff --git a/test/grouped_gemm/CMakeLists.txt b/test/grouped_gemm/CMakeLists.txt
index 40f634d8b..8c57b667e 100644
--- a/test/grouped_gemm/CMakeLists.txt
+++ b/test/grouped_gemm/CMakeLists.txt
@@ -1,4 +1,7 @@
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
    add_custom_target(test_grouped_gemm)
    add_gtest_executable(test_grouped_gemm_splitk test_grouped_gemm_splitk.cpp)
    add_gtest_executable(test_grouped_gemm_interface test_grouped_gemm_interface.cpp)
@@ -6,4 +9,6 @@ if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS M
    target_link_libraries(test_grouped_gemm_interface PRIVATE utility device_grouped_gemm_instance)
    
    add_dependencies(test_grouped_gemm test_grouped_gemm_splitk test_grouped_gemm_interface)
-endif()
+   set(target 1)
+ endif()
+endforeach()
-- 
GitLab


From d140bdc9fa251d9519055c932e169e510d7f6785 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Fri, 16 Jun 2023 12:13:16 -0700
Subject: [PATCH 63/71] do not build gfx941/942 targets during daily QA runs
 (#758)

---
 Jenkinsfile                | 2 +-
 script/cmake-ck-dev.sh     | 2 +-
 script/cmake-ck-release.sh | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 8cfc5f804..ad2baa00e 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -695,7 +695,7 @@ pipeline {
                     }
                     agent{ label rocmnode("gfx908 || gfx90a") }
                     environment{
-                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" """
+                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a;gfx940" """
                         execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ 
                     }
                     steps{
diff --git a/script/cmake-ck-dev.sh b/script/cmake-ck-dev.sh
index 33352a622..426f68d44 100755
--- a/script/cmake-ck-dev.sh
+++ b/script/cmake-ck-dev.sh
@@ -12,7 +12,7 @@ cmake
 -save-temps=$PWD"                                                                                 \
 -D CMAKE_BUILD_TYPE=Release                                                                       \
 -D BUILD_DEV=ON                                                                                   \
--D GPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942"                                               \
+-D GPU_TARGETS="gfx908;gfx90a;gfx940"                                                             \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                                 \
 -D USE_BITINT_EXTENSION_INT4=OFF                                                                  \
 ${MY_PROJECT_SOURCE}
diff --git a/script/cmake-ck-release.sh b/script/cmake-ck-release.sh
index 03d422478..787eabbf9 100755
--- a/script/cmake-ck-release.sh
+++ b/script/cmake-ck-release.sh
@@ -11,7 +11,7 @@ cmake
 -D CMAKE_CXX_FLAGS="-O3"                                                                          \
 -D CMAKE_BUILD_TYPE=Release                                                                       \
 -D BUILD_DEV=OFF                                                                                  \
--D GPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942"                                               \
+-D GPU_TARGETS="gfx908;gfx90a;gfx940"                                                             \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                                 \
 -D USE_BITINT_EXTENSION_INT4=OFF                                                                  \
 ${MY_PROJECT_SOURCE}
-- 
GitLab


From 0d9118226b167b8cc49360560850d4cf79936c8c Mon Sep 17 00:00:00 2001
From: Qianfeng <qianfeng.zhang@amd.com>
Date: Sat, 17 Jun 2023 12:43:11 +0800
Subject: [PATCH 64/71] Padded Generic Kernel Instance  (#730)

* Add NumReduceDim template parameter to DeviceSoftmax and the Softmax client API to simplify instance collection

* Move the generic kernel instance to be the first in the instance list for the elementwise op of normalization

* Add GetGenericInstance() interface for DeviceOperationInstanceFactory class of DeviceSoftmax

* Add testing of GetGenericInstance() in client_example of Softmax

* Revert "Add testing of GetGenericInstance() in client_example of Softmax"

This reverts commit f629cd9a93ce38dfed4886d849f3c38d2e5379c8.

* Revert "Add GetGenericInstance() interface for DeviceOperationInstanceFactory class of DeviceSoftmax"

This reverts commit a9f0d000eb9fd240404112a526ef125429a351df.

* Make the generic kernel instance the first instance returned by GetInstances() for GroupNorm

* Move the generic kernel instance to a separate tuple for the elementwise op of normalization

* Remove unused files for softmax instances

* Store the generic kernel instance in a separate tuple for softmax

* Add an IsSupportedArgument() check for the generic instance to the softmax client example

* Replace get_device_normalize_from_mean_meansquare_instances() with the DeviceOperationInstanceFactory class for elementwise-normalization

* clang-format fix

* Remove int8 from softmax instances

---------

Co-authored-by: zjing14 <zhangjing14@gmail.com>
---
 .../gemm_add_add_layernorm_naive.cpp          |  19 +-
 client_example/06_softmax/softmax4d.cpp       |  32 ++-
 .../18_groupnorm/groupnorm_swish.cpp          |  24 +++
 .../gpu/device/device_softmax.hpp             |  17 +-
 .../gpu/device/impl/device_softmax_impl.hpp   |  17 +-
 .../gpu/device_elementwise_instance.hpp       |  41 ++--
 .../tensor_operation_instance/gpu/softmax.hpp |  89 +++++---
 .../device_softmax_f16_f16_instance.hpp       |  22 --
 ...softmax_f16_f16_instance_rank3_reduce1.hpp |   2 +-
 ...softmax_f16_f16_instance_rank3_reduce2.hpp |   2 +-
 ...softmax_f16_f16_instance_rank3_reduce3.hpp |   2 +-
 ...softmax_f16_f16_instance_rank4_reduce1.hpp |   2 +-
 ...softmax_f16_f16_instance_rank4_reduce2.hpp |   2 +-
 ...softmax_f16_f16_instance_rank4_reduce3.hpp |   2 +-
 ...softmax_f16_f16_instance_rank4_reduce4.hpp |   2 +-
 .../device_softmax_f16_f16_instance_type.hpp  |   8 +-
 .../device_softmax_f32_f32_instance.hpp       |  22 --
 ...softmax_f32_f32_instance_rank3_reduce1.hpp |   2 +-
 ...softmax_f32_f32_instance_rank3_reduce2.hpp |   2 +-
 ...softmax_f32_f32_instance_rank3_reduce3.hpp |   2 +-
 ...softmax_f32_f32_instance_rank4_reduce1.hpp |   2 +-
 ...softmax_f32_f32_instance_rank4_reduce2.hpp |   2 +-
 ...softmax_f32_f32_instance_rank4_reduce3.hpp |   2 +-
 ...softmax_f32_f32_instance_rank4_reduce4.hpp |   2 +-
 .../device_softmax_f32_f32_instance_type.hpp  |   9 +-
 .../softmax/device_softmax_i8_i8_instance.hpp |  22 --
 ...e_softmax_i8_i8_instance_rank3_reduce1.hpp |  22 --
 ...e_softmax_i8_i8_instance_rank3_reduce2.hpp |  22 --
 ...e_softmax_i8_i8_instance_rank3_reduce3.hpp |  22 --
 ...e_softmax_i8_i8_instance_rank4_reduce1.hpp |  22 --
 ...e_softmax_i8_i8_instance_rank4_reduce2.hpp |  22 --
 ...e_softmax_i8_i8_instance_rank4_reduce3.hpp |  22 --
 ...e_softmax_i8_i8_instance_rank4_reduce4.hpp |  22 --
 .../device_softmax_i8_i8_instance_type.hpp    |  40 ----
 .../gpu/softmax/device_softmax_instance.hpp   |  17 +-
 .../elementwise/device_normalize_instance.cpp |  10 +-
 .../device_groupnorm_f16_instance.cpp         |   2 +
 .../device_groupnorm_f32_instance.cpp         |   2 +
 ...oupnorm_swish_f16_f32_f32_f16_instance.cpp |   2 +
 .../device_groupnorm_swish_f16_instance.cpp   |   2 +
 .../device_groupnorm_swish_f32_instance.cpp   |   2 +
 .../device_layernorm2d_f16_instance.cpp       |   2 +
 .../device_layernorm2d_f32_instance.cpp       |   2 +
 .../device_layernorm4d_f16_instance.cpp       |   2 +
 .../device_layernorm4d_f32_instance.cpp       |   2 +
 .../normalization_instance_common.hpp         |  21 ++
 .../gpu/softmax/CMakeLists.txt                |  10 -
 .../device_softmax_f16_f16_instance.cpp       |  40 ----
 ...softmax_f16_f16_instance_rank3_reduce1.cpp |   7 +-
 ...softmax_f16_f16_instance_rank3_reduce2.cpp |   7 +-
 ...softmax_f16_f16_instance_rank3_reduce3.cpp |   7 +-
 ...softmax_f16_f16_instance_rank4_reduce1.cpp |   7 +-
 ...softmax_f16_f16_instance_rank4_reduce2.cpp |   7 +-
 ...softmax_f16_f16_instance_rank4_reduce3.cpp |   7 +-
 ...softmax_f16_f16_instance_rank4_reduce4.cpp |   7 +-
 .../device_softmax_f32_f32_instance.cpp       |  40 ----
 ...softmax_f32_f32_instance_rank3_reduce1.cpp |   7 +-
 ...softmax_f32_f32_instance_rank3_reduce2.cpp |   7 +-
 ...softmax_f32_f32_instance_rank3_reduce3.cpp |   7 +-
 ...softmax_f32_f32_instance_rank4_reduce1.cpp |   7 +-
 ...softmax_f32_f32_instance_rank4_reduce2.cpp |   7 +-
 ...softmax_f32_f32_instance_rank4_reduce3.cpp |   7 +-
 ...softmax_f32_f32_instance_rank4_reduce4.cpp |   7 +-
 .../softmax/device_softmax_i8_i8_instance.cpp |  40 ----
 ...e_softmax_i8_i8_instance_rank3_reduce1.cpp |  27 ---
 ...e_softmax_i8_i8_instance_rank3_reduce2.cpp |  27 ---
 ...e_softmax_i8_i8_instance_rank3_reduce3.cpp |  27 ---
 ...e_softmax_i8_i8_instance_rank4_reduce1.cpp |  27 ---
 ...e_softmax_i8_i8_instance_rank4_reduce2.cpp |  27 ---
 ...e_softmax_i8_i8_instance_rank4_reduce3.cpp |  27 ---
 ...e_softmax_i8_i8_instance_rank4_reduce4.cpp |  27 ---
 .../include/profiler/profile_softmax_impl.hpp |  30 ++-
 profiler/src/profile_softmax.cpp              | 191 ++++++++++++++----
 test/softmax/test_softmax_rank3.cpp           |   4 +-
 test/softmax/test_softmax_rank4.cpp           |   4 +-
 test/softmax/test_softmax_util.hpp            |  88 +++++++-
 76 files changed, 553 insertions(+), 791 deletions(-)
 delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp
 delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp
 delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp
 delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp
 delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp
 delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp
 delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp
 delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp
 delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp
 delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp
 delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp

diff --git a/client_example/03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp b/client_example/03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp
index 1129dfa6b..58c91f903 100644
--- a/client_example/03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp
+++ b/client_example/03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp
@@ -172,18 +172,19 @@ int main()
             BLayout,
             CLayout>();
 
-    const auto normalize_ptrs =
-        ck::tensor_operation::device::instance::get_device_normalize_from_mean_meansquare_instances<
-            CDataType,
-            ReduceDataType,
-            ReduceDataType,
-            GammaDataType,
-            BetaDataType,
-            LayerNormOutDataType>();
-
     std::cout << "found " << gemm_reduce_ptrs.size()
               << " gemm_reduceMean_reduceSquareMean instances" << std::endl;
 
+    using NormalizeDeviceOp = ck::tensor_operation::device::DeviceElementwise<
+        ck::Tuple<CDataType, ReduceDataType, ReduceDataType, GammaDataType, BetaDataType>,
+        ck::Tuple<LayerNormOutDataType>,
+        ck::tensor_operation::element_wise::Normalize,
+        2>;
+
+    const auto normalize_ptrs =
+        ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+            NormalizeDeviceOp>::GetInstances();
+
     std::cout << "found " << normalize_ptrs.size() << " normalize instances" << std::endl;
 
     auto f_matrix_space_size =
diff --git a/client_example/06_softmax/softmax4d.cpp b/client_example/06_softmax/softmax4d.cpp
index 987ac9569..2ccad27a8 100644
--- a/client_example/06_softmax/softmax4d.cpp
+++ b/client_example/06_softmax/softmax4d.cpp
@@ -53,12 +53,35 @@ int main(int argc, char* argv[])
     SimpleDeviceMem in(sizeof(InDataType) * num_elements);
     SimpleDeviceMem out(sizeof(OutDataType) * num_elements);
 
-    using DeviceOp = ck::tensor_operation::device::
-        DeviceSoftmax<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>;
+    using DeviceOp = ck::tensor_operation::device::DeviceSoftmax<InDataType,
+                                                                 AccDataType,
+                                                                 OutDataType,
+                                                                 PassThrough,
+                                                                 PassThrough,
+                                                                 Rank,
+                                                                 NumReduceDim>;
     // get device op instances
     const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
         DeviceOp>::GetInstances();
 
+    auto& generic_op_ptr = op_ptrs[0];
+
+    auto generic_argument_ptr = generic_op_ptr->MakeArgumentPointer(in_lengths,
+                                                                    in_strides,
+                                                                    reduce_dims,
+                                                                    alpha,
+                                                                    beta,
+                                                                    in.GetDeviceBuffer(),
+                                                                    out.GetDeviceBuffer(),
+                                                                    PassThrough{},
+                                                                    PassThrough{});
+
+    if(!generic_op_ptr->IsSupportedArgument(generic_argument_ptr.get()))
+    {
+        throw std::runtime_error(
+            "The generic kernel instance should be able to support any input shapes");
+    };
+
     std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
 
     std::string best_op_name;
@@ -74,11 +97,6 @@ int main(int argc, char* argv[])
     {
         auto& op_ptr = op_ptrs[i];
 
-        if(op_ptr->GetRank() != Rank || op_ptr->GetNumReduceDim() != NumReduceDim)
-        {
-            continue;
-        }
-
         auto argument_ptr   = op_ptr->MakeArgumentPointer(in_lengths,
                                                         in_strides,
                                                         reduce_dims,
diff --git a/client_example/18_groupnorm/groupnorm_swish.cpp b/client_example/18_groupnorm/groupnorm_swish.cpp
index 308061a32..e1d198d22 100644
--- a/client_example/18_groupnorm/groupnorm_swish.cpp
+++ b/client_example/18_groupnorm/groupnorm_swish.cpp
@@ -72,6 +72,30 @@ int main(int argc, char* argv[])
 
     std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
 
+    const auto& generic_op_ptr = op_ptrs[0];
+
+    auto generic_argument_ptr =
+        generic_op_ptr->MakeArgumentPointer({N, H, W, G, C},    // lengths
+                                            xy_strides,         // xStrides
+                                            gamma_beta_strides, // gammaStrides
+                                            gamma_beta_strides, // betaStrides
+                                            xy_strides,         // yStrides
+                                            {1, 2, 4},          // reduceDims
+                                            1e-6,
+                                            x_device_buf.GetDeviceBuffer(),
+                                            gamma_device_buf.GetDeviceBuffer(),
+                                            beta_device_buf.GetDeviceBuffer(),
+                                            y_device_buf.GetDeviceBuffer(),
+                                            nullptr,
+                                            nullptr,
+                                            Swish{});
+
+    if(!generic_op_ptr->IsSupportedArgument(generic_argument_ptr.get()))
+    {
+        throw std::runtime_error(
+            "The generic kernel instance should be able to support any input shapes");
+    };
+
     std::string best_op_name;
     bool found            = false;
     int best_op_id        = -1;
diff --git a/include/ck/tensor_operation/gpu/device/device_softmax.hpp b/include/ck/tensor_operation/gpu/device/device_softmax.hpp
index a96ba89e2..1902fd09e 100644
--- a/include/ck/tensor_operation/gpu/device/device_softmax.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_softmax.hpp
@@ -18,7 +18,8 @@ template <typename InDataType,
           typename OutDataType,
           typename InElementwiseOp,
           typename AccElementwiseOp,
-          index_t Rank>
+          index_t Rank,
+          index_t NumReduceDim>
 struct DeviceSoftmax : public BaseOperator
 {
     //
@@ -49,8 +50,6 @@ struct DeviceSoftmax : public BaseOperator
                         AccElementwiseOp acc_elementwise_op) = 0;
 
     virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
-    virtual index_t GetRank() const                           = 0;
-    virtual index_t GetNumReduceDim() const                   = 0;
 };
 
 template <typename InDataType,
@@ -58,9 +57,15 @@ template <typename InDataType,
           typename OutDataType,
           typename InElementwiseOp,
           typename AccElementwiseOp,
-          index_t Rank>
-using DeviceSoftmaxPtr = std::unique_ptr<
-    DeviceSoftmax<InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank>>;
+          index_t Rank,
+          index_t NumReduceDim>
+using DeviceSoftmaxPtr = std::unique_ptr<DeviceSoftmax<InDataType,
+                                                       AccDataType,
+                                                       OutDataType,
+                                                       InElementwiseOp,
+                                                       AccElementwiseOp,
+                                                       Rank,
+                                                       NumReduceDim>>;
 
 } // namespace device
 } // namespace tensor_operation
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp
index 4aa02dfd3..8eff9d241 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp
@@ -38,16 +38,9 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
                                                 OutDataType,
                                                 InElementwiseOp,
                                                 AccElementwiseOp,
-                                                Rank>
+                                                Rank,
+                                                NumReduceDim>
 {
-    static constexpr index_t kRank            = Rank;
-    static constexpr index_t kNumReduceDim    = NumReduceDim;
-    static constexpr index_t kNumInvariantDim = Rank - NumReduceDim;
-
-    virtual index_t GetRank() const override { return kRank; }
-
-    virtual index_t GetNumReduceDim() const override { return kNumReduceDim; }
-
     static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
 
     static constexpr index_t NumSrcDim = Rank;
@@ -287,13 +280,13 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
     {
         if constexpr(InSrcVectorDim == 0)
         {
-            if constexpr(kNumInvariantDim == 0)
+            if constexpr(NumInvariantDim == 0)
             {
                 return false;
             }
             else
             {
-                if(arg.inStrides_[kNumInvariantDim - 1] != 1 && InSrcVectorSize != 1)
+                if(arg.inStrides_[NumInvariantDim - 1] != 1 && InSrcVectorSize != 1)
                 {
                     return false;
                 }
@@ -316,7 +309,7 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
         }
 
         // To improve
-        if(kNumInvariantDim > 0 && arg.invariant_lowest_length_ % OutDstVectorSize != 0)
+        if(NumInvariantDim > 0 && arg.invariant_lowest_length_ % OutDstVectorSize != 0)
         {
             return false;
         }
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp
index 7e6267c87..b03693b00 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp
@@ -5,11 +5,10 @@
 
 #include <vector>
 #include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
+#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -29,20 +28,34 @@ template <typename InputType,
           typename GammaDataType,
           typename BetaDataType,
           typename OutputType>
-auto get_device_normalize_from_mean_meansquare_instances()
+struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceElementwise<
+    ck::Tuple<InputType, MeanType, MeanSquareType, GammaDataType, BetaDataType>,
+    ck::Tuple<OutputType>,
+    Normalize,
+    2>>
 {
-    std::vector<DeviceNormalizeFromMeanMeanSquarePtr> op_ptrs;
+    using DeviceOp = DeviceElementwise<
+        ck::Tuple<InputType, MeanType, MeanSquareType, GammaDataType, BetaDataType>,
+        ck::Tuple<OutputType>,
+        Normalize,
+        2>;
 
-    if constexpr(is_same<InputType, half_t>::value && is_same<MeanType, float>::value &&
-                 is_same<MeanSquareType, float>::value && is_same<GammaDataType, half_t>::value &&
-                 is_same<BetaDataType, half_t>::value && is_same<OutputType, half_t>::value)
+    static auto GetInstances()
     {
-        ck::tensor_operation::device::instance::
-            add_device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances(op_ptrs);
-    }
-
-    return op_ptrs;
-}
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+
+        if constexpr(is_same<InputType, half_t>::value && is_same<MeanType, float>::value &&
+                     is_same<MeanSquareType, float>::value &&
+                     is_same<GammaDataType, half_t>::value &&
+                     is_same<BetaDataType, half_t>::value && is_same<OutputType, half_t>::value)
+        {
+            ck::tensor_operation::device::instance::
+                add_device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances(op_ptrs);
+        }
+
+        return op_ptrs;
+    };
+};
 
 } // namespace instance
 } // namespace device
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp
index c5c2d2cdd..26815f144 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp
@@ -9,34 +9,33 @@
 #include "ck/ck.hpp"
 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
 #include "ck/tensor_operation/gpu/device/device_softmax.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp"
 
 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
 
-void add_device_softmax_f16_f16_rank3_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>&);
-void add_device_softmax_f16_f16_rank4_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>&);
-
-void add_device_softmax_f32_f32_rank3_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>&);
-void add_device_softmax_f32_f32_rank4_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>&);
-
-void add_device_softmax_i8_i8_rank3_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3>>&);
-void add_device_softmax_i8_i8_rank4_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>&);
-
-template <typename InDataType, typename AccDataType, typename OutDataType, index_t Rank>
-struct DeviceOperationInstanceFactory<
-    ck::tensor_operation::device::
-        DeviceSoftmax<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>>
+template <typename InDataType,
+          typename AccDataType,
+          typename OutDataType,
+          index_t Rank,
+          index_t NumReduceDim>
+struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceSoftmax<InDataType,
+                                                                                  AccDataType,
+                                                                                  OutDataType,
+                                                                                  PassThrough,
+                                                                                  PassThrough,
+                                                                                  Rank,
+                                                                                  NumReduceDim>>
 {
-    using DeviceOp =
-        DeviceSoftmax<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>;
+    using DeviceOp = DeviceSoftmax<InDataType,
+                                   AccDataType,
+                                   OutDataType,
+                                   PassThrough,
+                                   PassThrough,
+                                   Rank,
+                                   NumReduceDim>;
 
     static auto GetInstances()
     {
@@ -46,25 +45,49 @@ struct DeviceOperationInstanceFactory<
                      std::is_same_v<OutDataType, F16>)
         {
             if constexpr(Rank == 3)
-                add_device_softmax_f16_f16_rank3_instances(op_ptrs);
+            {
+                if constexpr(NumReduceDim == 1)
+                    add_device_softmax_f16_f16_rank3_reduce1_instances(op_ptrs);
+                else if constexpr(NumReduceDim == 2)
+                    add_device_softmax_f16_f16_rank3_reduce2_instances(op_ptrs);
+                else if constexpr(NumReduceDim == 3)
+                    add_device_softmax_f16_f16_rank3_reduce3_instances(op_ptrs);
+            }
             else if constexpr(Rank == 4)
-                add_device_softmax_f16_f16_rank4_instances(op_ptrs);
+            {
+                if constexpr(NumReduceDim == 1)
+                    add_device_softmax_f16_f16_rank4_reduce1_instances(op_ptrs);
+                else if constexpr(NumReduceDim == 2)
+                    add_device_softmax_f16_f16_rank4_reduce2_instances(op_ptrs);
+                else if constexpr(NumReduceDim == 3)
+                    add_device_softmax_f16_f16_rank4_reduce3_instances(op_ptrs);
+                else if constexpr(NumReduceDim == 4)
+                    add_device_softmax_f16_f16_rank4_reduce4_instances(op_ptrs);
+            }
         }
         else if constexpr(std::is_same_v<InDataType, F32> && std::is_same_v<AccDataType, F32> &&
                           std::is_same_v<OutDataType, F32>)
         {
             if constexpr(Rank == 3)
-                add_device_softmax_f32_f32_rank3_instances(op_ptrs);
-            else if constexpr(Rank == 4)
-                add_device_softmax_f32_f32_rank4_instances(op_ptrs);
-        }
-        else if constexpr(std::is_same_v<InDataType, I8> && std::is_same_v<AccDataType, F32> &&
-                          std::is_same_v<OutDataType, I8>)
-        {
-            if constexpr(Rank == 3)
-                add_device_softmax_i8_i8_rank3_instances(op_ptrs);
+            {
+                if constexpr(NumReduceDim == 1)
+                    add_device_softmax_f32_f32_rank3_reduce1_instances(op_ptrs);
+                else if constexpr(NumReduceDim == 2)
+                    add_device_softmax_f32_f32_rank3_reduce2_instances(op_ptrs);
+                else if constexpr(NumReduceDim == 3)
+                    add_device_softmax_f32_f32_rank3_reduce3_instances(op_ptrs);
+            }
             else if constexpr(Rank == 4)
-                add_device_softmax_i8_i8_rank4_instances(op_ptrs);
+            {
+                if constexpr(NumReduceDim == 1)
+                    add_device_softmax_f32_f32_rank4_reduce1_instances(op_ptrs);
+                else if constexpr(NumReduceDim == 2)
+                    add_device_softmax_f32_f32_rank4_reduce2_instances(op_ptrs);
+                else if constexpr(NumReduceDim == 3)
+                    add_device_softmax_f32_f32_rank4_reduce3_instances(op_ptrs);
+                else if constexpr(NumReduceDim == 4)
+                    add_device_softmax_f32_f32_rank4_reduce4_instances(op_ptrs);
+            }
         }
 
         return op_ptrs;
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp
deleted file mode 100644
index 7c6f189cb..000000000
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp
+++ /dev/null
@@ -1,22 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_f16_f16_rank3_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances);
-void add_device_softmax_f16_f16_rank4_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances);
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp
index 33d5cc683..3fd2bd089 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {
 
 void add_device_softmax_f16_f16_rank3_reduce1_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances);
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 1>>& instances);
 
 } // namespace instance
 } // namespace device
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp
index 7668248c3..210fdc0a5 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {
 
 void add_device_softmax_f16_f16_rank3_reduce2_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances);
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 2>>& instances);
 
 } // namespace instance
 } // namespace device
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp
index 20eb7bbc9..894fb034d 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {
 
 void add_device_softmax_f16_f16_rank3_reduce3_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances);
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 3>>& instances);
 
 } // namespace instance
 } // namespace device
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp
index e8356a929..708ef0ce1 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {
 
 void add_device_softmax_f16_f16_rank4_reduce1_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances);
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 1>>& instances);
 
 } // namespace instance
 } // namespace device
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp
index b3f7d4890..6754e5cef 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {
 
 void add_device_softmax_f16_f16_rank4_reduce2_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances);
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 2>>& instances);
 
 } // namespace instance
 } // namespace device
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp
index 4190f50a3..5e111176e 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {
 
 void add_device_softmax_f16_f16_rank4_reduce3_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances);
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 3>>& instances);
 
 } // namespace instance
 } // namespace device
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp
index b7f334490..a3cecb32f 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {
 
 void add_device_softmax_f16_f16_rank4_reduce4_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances);
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 4>>& instances);
 
 } // namespace instance
 } // namespace device
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp
index 53c142f61..8c0782daa 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp
@@ -16,7 +16,6 @@ template <index_t Rank, index_t Reduce>
 using device_softmax_f16_f16_instances = std::tuple<
     // clang-format off
     //                InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
-    // fallback kernel
     DeviceSoftmaxImpl<       F16,         F32,         F16,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  8,                 32,                1,                8,              1,               1,              1>,
     DeviceSoftmaxImpl<       F16,         F32,         F16,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  8,                 32,                1,                8,              1,               8,              8>,
     DeviceSoftmaxImpl<       F16,         F32,         F16,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  4,                 64,                1,                8,              1,               8,              8>,
@@ -33,6 +32,13 @@ using device_softmax_f16_f16_instances = std::tuple<
     // clang-format on
     >;
 
+template <index_t Rank, index_t Reduce>
+using device_softmax_f16_f16_generic_instance = std::tuple<
+    // clang-format off
+    DeviceSoftmaxImpl<       F16,         F32,         F16,     PassThrough,      PassThrough, Rank,       Reduce,        64,                  8,                  8,                1,                1,              1,               1,              1>
+    // clang-format on
+    >;
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp
deleted file mode 100644
index 41c67af7a..000000000
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp
+++ /dev/null
@@ -1,22 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_f32_f32_rank3_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>& instances);
-void add_device_softmax_f32_f32_rank4_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances);
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp
index 2d791ff97..4cc469025 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {
 
 void add_device_softmax_f32_f32_rank3_reduce1_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>& instances);
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 1>>& instances);
 
 } // namespace instance
 } // namespace device
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp
index eb9cc1ee2..65724d788 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {
 
 void add_device_softmax_f32_f32_rank3_reduce2_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>& instances);
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 2>>& instances);
 
 } // namespace instance
 } // namespace device
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp
index 68af443a5..13bd45598 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {
 
 void add_device_softmax_f32_f32_rank3_reduce3_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>& instances);
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 3>>& instances);
 
 } // namespace instance
 } // namespace device
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp
index 3bf8704b4..d58b424ee 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {
 
 void add_device_softmax_f32_f32_rank4_reduce1_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances);
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 1>>& instances);
 
 } // namespace instance
 } // namespace device
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp
index 43e54aaca..378e45eeb 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {
 
 void add_device_softmax_f32_f32_rank4_reduce2_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances);
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 2>>& instances);
 
 } // namespace instance
 } // namespace device
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp
index 32c4cd74b..293df08c7 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {
 
 void add_device_softmax_f32_f32_rank4_reduce3_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances);
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 3>>& instances);
 
 } // namespace instance
 } // namespace device
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp
index f8f5caddb..e503a9fec 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {
 
 void add_device_softmax_f32_f32_rank4_reduce4_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances);
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 4>>& instances);
 
 } // namespace instance
 } // namespace device
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp
index a034e41a0..90c5ddc8a 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp
@@ -16,7 +16,7 @@ template <index_t Rank, index_t Reduce>
 using device_softmax_f32_f32_instances = std::tuple<
     // clang-format off
     //                InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
-    DeviceSoftmaxImpl<       F32,         F32,         F32,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  8,                 32,                1,                8,              1,               1,               1>, // fallback kernel
+    DeviceSoftmaxImpl<       F32,         F32,         F32,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  8,                 32,                1,                8,              1,               1,               1>,
     DeviceSoftmaxImpl<       F32,         F32,         F32,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  8,                 32,                1,                8,              1,               4,               4>,
     DeviceSoftmaxImpl<       F32,         F32,         F32,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  4,                 64,                1,                8,              1,               4,               4>,
     DeviceSoftmaxImpl<       F32,         F32,         F32,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  2,                128,                1,                8,              1,               4,               4>,
@@ -32,6 +32,13 @@ using device_softmax_f32_f32_instances = std::tuple<
     // clang-format on
     >;
 
+template <index_t Rank, index_t Reduce>
+using device_softmax_f32_f32_generic_instance = std::tuple<
+    // clang-format off
+    DeviceSoftmaxImpl<       F32,         F32,         F32,     PassThrough,      PassThrough, Rank,       Reduce,        64,                  8,                  8,                1,                1,              1,               1,               1> // generic instance: BlockSize 64, 8x8 (M x K) thread cluster, 1x1 thread slice, InSrcVectorDim 1, InSrcVectorSize/OutDstVectorSize 1
+    // clang-format on
+    >;
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp
deleted file mode 100644
index 3cd374209..000000000
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp
+++ /dev/null
@@ -1,22 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_i8_i8_rank3_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances);
-void add_device_softmax_i8_i8_rank4_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances);
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp
deleted file mode 100644
index f7d4dd045..000000000
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp
+++ /dev/null
@@ -1,22 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <vector>
-
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_i8_i8_rank3_reduce1_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3>>& instances);
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp
deleted file mode 100644
index c49dd4d85..000000000
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp
+++ /dev/null
@@ -1,22 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <vector>
-
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_i8_i8_rank3_reduce2_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3>>& instances);
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp
deleted file mode 100644
index 4074ee3b1..000000000
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp
+++ /dev/null
@@ -1,22 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <vector>
-
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_i8_i8_rank3_reduce3_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3>>& instances);
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp
deleted file mode 100644
index 479fcc92f..000000000
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp
+++ /dev/null
@@ -1,22 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <vector>
-
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_i8_i8_rank4_reduce1_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>& instances);
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp
deleted file mode 100644
index 0dd644fab..000000000
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp
+++ /dev/null
@@ -1,22 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <vector>
-
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_i8_i8_rank4_reduce2_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>& instances);
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp
deleted file mode 100644
index 50f39396a..000000000
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp
+++ /dev/null
@@ -1,22 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <vector>
-
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_i8_i8_rank4_reduce3_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>& instances);
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp
deleted file mode 100644
index defa2dbda..000000000
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp
+++ /dev/null
@@ -1,22 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <vector>
-
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_i8_i8_rank4_reduce4_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>& instances);
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp
deleted file mode 100644
index 6ff07de23..000000000
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp
+++ /dev/null
@@ -1,40 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <tuple>
-
-#include "ck/ck.hpp"
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp"
-#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-template <index_t Rank, index_t Reduce>
-using device_softmax_i8_i8_instances = std::tuple<
-    // clang-format off
-    //                InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
-    // fallback kernel
-    DeviceSoftmaxImpl<        I8,         F32,          I8,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  8,                 32,                1,               16,              1,               1,              1>,
-    DeviceSoftmaxImpl<        I8,         F32,          I8,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  8,                 32,                1,               16,              1,              16,             16>,
-    DeviceSoftmaxImpl<        I8,         F32,          I8,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  4,                 64,                1,               16,              1,              16,             16>,
-    DeviceSoftmaxImpl<        I8,         F32,          I8,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  2,                128,                1,               16,              1,              16,             16>,
-    DeviceSoftmaxImpl<        I8,         F32,          I8,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  2,                128,                1,               32,              1,              16,             16>,
-    DeviceSoftmaxImpl<        I8,         F32,          I8,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  2,                128,                1,               64,              1,              16,             16>,
-    DeviceSoftmaxImpl<        I8,         F32,          I8,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  1,                256,                1,               16,              1,              16,             16>,
-    DeviceSoftmaxImpl<        I8,         F32,          I8,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  1,                256,                1,               32,              1,              16,             16>,
-    DeviceSoftmaxImpl<        I8,         F32,          I8,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  1,                256,                1,               64,              1,              16,             16>,
-    // Reduction on middle dimensions
-    // InSrcVectorDim is 0 since we want to coalesce reads on M dimension
-    DeviceSoftmaxImpl<        I8,         F32,          I8,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  8,                 32,                8,                8,              0,               1,              1>,
-    DeviceSoftmaxImpl<        I8,         F32,          I8,     PassThrough,      PassThrough, Rank,       Reduce,       256,                 32,                  8,               32,                8,              0,              16,              8>
-    // clang-format on
-    >;
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp
index 206980cf1..10f99acb8 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp
@@ -3,6 +3,17 @@
 
 #pragma once
 
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp b/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp
index a62c9e235..f2a5f0728 100644
--- a/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp
@@ -30,7 +30,12 @@ using device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances = std:
     //###################|<in, mean, square_mean, gamma, beta>| <out>|  functor| NDim| MPerThread| <in, mean, square_mean, gamma, beta ScalarPerVector>| <out ScalarPerVector>|
     DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>,  Tuple<F16>,  Normalize,  2,   8,       Sequence<8, 1, 1, 8, 8>,      Sequence<8>                >,
     DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>,  Tuple<F16>,  Normalize,  2,   4,       Sequence<4, 1, 1, 4, 4>,      Sequence<4>                >,
-    DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>,  Tuple<F16>,  Normalize,  2,   2,       Sequence<2, 1, 1, 2, 2>,      Sequence<2>                >,
+    DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>,  Tuple<F16>,  Normalize,  2,   2,       Sequence<2, 1, 1, 2, 2>,      Sequence<2>                >
+    // clang-format on
+    >;
+
+using device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_generic_instance = std::tuple<
+    // clang-format off
     DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>,  Tuple<F16>,  Normalize,  2,   1,       Sequence<1, 1, 1, 1, 1>,      Sequence<1>                >
     // clang-format on
     >;
@@ -39,6 +44,9 @@ void add_device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances(
     std::vector<DeviceElementwisePtr<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2>>&
         instances)
 {
+    add_device_operation_instances(
+        instances, device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_generic_instance{});
+
     add_device_operation_instances(
         instances, device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances{});
 }
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp
index be860f58e..e3820462c 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp
@@ -14,6 +14,8 @@ void add_device_normalization_rank_5_3_f16_instances(
     std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Pass, 5, 3>>>&
         instances)
 {
+    add_device_operation_instances(instances,
+                                   device_normalization_f16_generic_instance<Pass, 5, 3>{});
     add_device_operation_instances(instances, device_normalization_f16_instances<Pass, 5, 3>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp
index 9a64e555d..d85817aad 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp
@@ -14,6 +14,8 @@ void add_device_normalization_rank_5_3_f32_instances(
     std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Pass, 5, 3>>>&
         instances)
 {
+    add_device_operation_instances(instances,
+                                   device_normalization_f32_generic_instance<Pass, 5, 3>{});
     add_device_operation_instances(instances, device_normalization_f32_instances<Pass, 5, 3>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp
index fe72a2733..a81f776c0 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp
@@ -14,6 +14,8 @@ void add_device_normalization_rank_5_3_swish_f16_f32_f32_f16_instances(
     std::vector<std::unique_ptr<DeviceNormalization<F16, F32, F32, F32, F16, Swish, 5, 3>>>&
         instances)
 {
+    add_device_operation_instances(
+        instances, device_normalization_f16_f32_f32_f16_generic_instance<Swish, 5, 3>{});
     add_device_operation_instances(instances,
                                    device_normalization_f16_f32_f32_f16_instances<Swish, 5, 3>{});
 }
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp
index cac8641e1..f4bb8bda8 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp
@@ -14,6 +14,8 @@ void add_device_normalization_rank_5_3_swish_f16_instances(
     std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Swish, 5, 3>>>&
         instances)
 {
+    add_device_operation_instances(instances,
+                                   device_normalization_f16_generic_instance<Swish, 5, 3>{});
     add_device_operation_instances(instances, device_normalization_f16_instances<Swish, 5, 3>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp
index 0a9ac8462..bbb9bd0fe 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp
@@ -14,6 +14,8 @@ void add_device_normalization_rank_5_3_swish_f32_instances(
     std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Swish, 5, 3>>>&
         instances)
 {
+    add_device_operation_instances(instances,
+                                   device_normalization_f32_generic_instance<Swish, 5, 3>{});
     add_device_operation_instances(instances, device_normalization_f32_instances<Swish, 5, 3>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp
index ad92818ec..3f7e4aff1 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp
@@ -14,6 +14,8 @@ void add_device_normalization_rank_2_1_f16_instances(
     std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Pass, 2, 1>>>&
         instances)
 {
+    add_device_operation_instances(instances,
+                                   device_normalization_f16_generic_instance<Pass, 2, 1>{});
     add_device_operation_instances(instances, device_normalization_f16_instances<Pass, 2, 1>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp
index 70e3bbc1c..1f0db3a03 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp
@@ -14,6 +14,8 @@ void add_device_normalization_rank_2_1_f32_instances(
     std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Pass, 2, 1>>>&
         instances)
 {
+    add_device_operation_instances(instances,
+                                   device_normalization_f32_generic_instance<Pass, 2, 1>{});
     add_device_operation_instances(instances, device_normalization_f32_instances<Pass, 2, 1>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp
index 7c5d2c4a9..cb9d72e61 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp
@@ -14,6 +14,8 @@ void add_device_normalization_rank_4_3_f16_instances(
     std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Pass, 4, 3>>>&
         instances)
 {
+    add_device_operation_instances(instances,
+                                   device_normalization_f16_generic_instance<Pass, 4, 3>{});
     add_device_operation_instances(instances, device_normalization_f16_instances<Pass, 4, 3>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp
index f5626d4a9..ed555b840 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp
@@ -14,6 +14,8 @@ void add_device_normalization_rank_4_3_f32_instances(
     std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Pass, 4, 3>>>&
         instances)
 {
+    add_device_operation_instances(instances,
+                                   device_normalization_f32_generic_instance<Pass, 4, 3>{});
     add_device_operation_instances(instances, device_normalization_f32_instances<Pass, 4, 3>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp b/library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp
index d9029ac25..b0684962f 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp
@@ -43,6 +43,13 @@ using device_normalization_f16_instances =
         // clang-format on
         >;
 
+template <typename OutElementwise, index_t Rank, index_t Reduce>
+using device_normalization_f16_generic_instance = std::tuple<
+    // clang-format off
+        DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 64, 1, 64, 1, 1, 1, 1, 1, 1, 1, 1, 1> // generic instance: single configuration; every integer after "64, 1, 64" is 1 (NOTE(review): presumably slice/vector sizes - confirm against DeviceNormalizationImpl)
+    // clang-format on
+    >;
+
 template <typename OutElementwise, index_t Rank, index_t Reduce>
 using device_normalization_f32_instances = std::tuple<
     // clang-format off
@@ -69,6 +76,13 @@ using device_normalization_f32_instances = std::tuple<
     // clang-format on
     >;
 
+template <typename OutElementwise, index_t Rank, index_t Reduce>
+using device_normalization_f32_generic_instance = std::tuple<
+    // clang-format off
+        DeviceNormalizationImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 64, 1, 64, 1, 1, 1, 1, 1, 1, 1, 1, 1> // generic instance: single configuration; every integer after "64, 1, 64" is 1 (NOTE(review): presumably slice/vector sizes - confirm against DeviceNormalizationImpl)
+    // clang-format on
+    >;
+
 template <typename OutElementwise, index_t Rank, index_t Reduce>
 using device_normalization_f16_f32_f32_f16_instances = std::tuple<
     // clang-format off
@@ -95,6 +109,13 @@ using device_normalization_f16_f32_f32_f16_instances = std::tuple<
     // clang-format on
     >;
 
+template <typename OutElementwise, index_t Rank, index_t Reduce>
+using device_normalization_f16_f32_f32_f16_generic_instance = std::tuple<
+    // clang-format off
+        DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce, 64, 1, 64, 1, 1, 1, 1, 1, 1, 1, 1, 1> // generic instance: single configuration; every integer after "64, 1, 64" is 1 (NOTE(review): presumably slice/vector sizes - confirm against DeviceNormalizationImpl)
+    // clang-format on
+    >;
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt
index fc13261a6..202ad12b9 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt
@@ -1,13 +1,4 @@
 add_instance_library(device_softmax_instance
-    device_softmax_i8_i8_instance.cpp
-    device_softmax_i8_i8_instance_rank3_reduce1.cpp
-    device_softmax_i8_i8_instance_rank3_reduce2.cpp
-    device_softmax_i8_i8_instance_rank3_reduce3.cpp
-    device_softmax_i8_i8_instance_rank4_reduce1.cpp
-    device_softmax_i8_i8_instance_rank4_reduce2.cpp
-    device_softmax_i8_i8_instance_rank4_reduce3.cpp
-    device_softmax_i8_i8_instance_rank4_reduce4.cpp
-    device_softmax_f16_f16_instance.cpp
     device_softmax_f16_f16_instance_rank3_reduce1.cpp
     device_softmax_f16_f16_instance_rank3_reduce2.cpp
     device_softmax_f16_f16_instance_rank3_reduce3.cpp
@@ -15,7 +6,6 @@ add_instance_library(device_softmax_instance
     device_softmax_f16_f16_instance_rank4_reduce2.cpp
     device_softmax_f16_f16_instance_rank4_reduce3.cpp
     device_softmax_f16_f16_instance_rank4_reduce4.cpp
-    device_softmax_f32_f32_instance.cpp
     device_softmax_f32_f32_instance_rank3_reduce1.cpp
     device_softmax_f32_f32_instance_rank3_reduce2.cpp
     device_softmax_f32_f32_instance_rank3_reduce3.cpp
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp
deleted file mode 100644
index a86da7cc7..000000000
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp
+++ /dev/null
@@ -1,40 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <vector>
-
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp"
-
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_f16_f16_rank3_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances)
-{
-    add_device_softmax_f16_f16_rank3_reduce1_instances(instances);
-    add_device_softmax_f16_f16_rank3_reduce2_instances(instances);
-    add_device_softmax_f16_f16_rank3_reduce3_instances(instances);
-}
-
-void add_device_softmax_f16_f16_rank4_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances)
-{
-    add_device_softmax_f16_f16_rank4_reduce1_instances(instances);
-    add_device_softmax_f16_f16_rank4_reduce2_instances(instances);
-    add_device_softmax_f16_f16_rank4_reduce3_instances(instances);
-    add_device_softmax_f16_f16_rank4_reduce4_instances(instances);
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp
index 938fb033a..36867d993 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp
@@ -13,12 +13,11 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-static constexpr index_t RANK = 3;
-
 void add_device_softmax_f16_f16_rank3_reduce1_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 1>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 1>{});
+    add_device_operation_instances(instances, device_softmax_f16_f16_generic_instance<3, 1>{});
+    add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 1>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp
index 3d5659381..373f33ad5 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp
@@ -13,12 +13,11 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-static constexpr index_t RANK = 3;
-
 void add_device_softmax_f16_f16_rank3_reduce2_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 2>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 2>{});
+    add_device_operation_instances(instances, device_softmax_f16_f16_generic_instance<3, 2>{});
+    add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 2>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp
index d701b4174..d26b92b4f 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp
@@ -13,12 +13,11 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-static constexpr index_t RANK = 3;
-
 void add_device_softmax_f16_f16_rank3_reduce3_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 3>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 3>{});
+    add_device_operation_instances(instances, device_softmax_f16_f16_generic_instance<3, 3>{});
+    add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 3>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp
index 2085aafc5..bbb735b6f 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp
@@ -13,12 +13,11 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-static constexpr index_t RANK = 4;
-
 void add_device_softmax_f16_f16_rank4_reduce1_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 1>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 1>{});
+    add_device_operation_instances(instances, device_softmax_f16_f16_generic_instance<4, 1>{});
+    add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 1>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp
index ebe4329f9..92dbe6776 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp
@@ -13,12 +13,11 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-static constexpr index_t RANK = 4;
-
 void add_device_softmax_f16_f16_rank4_reduce2_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 2>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 2>{});
+    add_device_operation_instances(instances, device_softmax_f16_f16_generic_instance<4, 2>{});
+    add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 2>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp
index b8fd5a1e5..354cda85d 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp
@@ -13,12 +13,11 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-static constexpr index_t RANK = 4;
-
 void add_device_softmax_f16_f16_rank4_reduce3_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 3>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 3>{});
+    add_device_operation_instances(instances, device_softmax_f16_f16_generic_instance<4, 3>{});
+    add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 3>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp
index 112f1940d..edb5e42c1 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp
@@ -13,12 +13,11 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-static constexpr index_t RANK = 4;
-
 void add_device_softmax_f16_f16_rank4_reduce4_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 4>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 4>{});
+    add_device_operation_instances(instances, device_softmax_f16_f16_generic_instance<4, 4>{});
+    add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 4>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp
deleted file mode 100644
index ab8a69eec..000000000
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp
+++ /dev/null
@@ -1,40 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <vector>
-
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp"
-
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_f32_f32_rank3_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>& instances)
-{
-    add_device_softmax_f32_f32_rank3_reduce1_instances(instances);
-    add_device_softmax_f32_f32_rank3_reduce2_instances(instances);
-    add_device_softmax_f32_f32_rank3_reduce3_instances(instances);
-}
-
-void add_device_softmax_f32_f32_rank4_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances)
-{
-    add_device_softmax_f32_f32_rank4_reduce1_instances(instances);
-    add_device_softmax_f32_f32_rank4_reduce2_instances(instances);
-    add_device_softmax_f32_f32_rank4_reduce3_instances(instances);
-    add_device_softmax_f32_f32_rank4_reduce4_instances(instances);
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp
index 5382fec90..566be8fc2 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp
@@ -13,12 +13,11 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-static constexpr index_t RANK = 3;
-
 void add_device_softmax_f32_f32_rank3_reduce1_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 1>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 1>{});
+    add_device_operation_instances(instances, device_softmax_f32_f32_generic_instance<3, 1>{});
+    add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 1>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp
index a1a143afa..f9c76e311 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp
@@ -13,12 +13,11 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-static constexpr index_t RANK = 3;
-
 void add_device_softmax_f32_f32_rank3_reduce2_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 2>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 2>{});
+    add_device_operation_instances(instances, device_softmax_f32_f32_generic_instance<3, 2>{});
+    add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 2>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp
index 992e0c1ec..541e0d71a 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp
@@ -13,12 +13,11 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-static constexpr index_t RANK = 3;
-
 void add_device_softmax_f32_f32_rank3_reduce3_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 3>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 3>{});
+    add_device_operation_instances(instances, device_softmax_f32_f32_generic_instance<3, 3>{});
+    add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 3>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp
index 2be1f45bb..95a38df28 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp
@@ -13,12 +13,11 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-static constexpr index_t RANK = 4;
-
 void add_device_softmax_f32_f32_rank4_reduce1_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 1>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 1>{});
+    add_device_operation_instances(instances, device_softmax_f32_f32_generic_instance<4, 1>{});
+    add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 1>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp
index a1da73aa8..a29b88891 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp
@@ -13,12 +13,11 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-static constexpr index_t RANK = 4;
-
 void add_device_softmax_f32_f32_rank4_reduce2_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 2>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 2>{});
+    add_device_operation_instances(instances, device_softmax_f32_f32_generic_instance<4, 2>{});
+    add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 2>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp
index b5c3b576a..0da46ea1b 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp
@@ -13,12 +13,11 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-static constexpr index_t RANK = 4;
-
 void add_device_softmax_f32_f32_rank4_reduce3_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 3>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 3>{});
+    add_device_operation_instances(instances, device_softmax_f32_f32_generic_instance<4, 3>{});
+    add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 3>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp
index 22a0404c0..fa217dc3f 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp
@@ -13,12 +13,11 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-static constexpr index_t RANK = 4;
-
 void add_device_softmax_f32_f32_rank4_reduce4_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 4>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 4>{});
+    add_device_operation_instances(instances, device_softmax_f32_f32_generic_instance<4, 4>{});
+    add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 4>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp
deleted file mode 100644
index 81a2ff80c..000000000
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp
+++ /dev/null
@@ -1,40 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <vector>
-
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp"
-
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_i8_i8_rank3_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3>>& instances)
-{
-    add_device_softmax_i8_i8_rank3_reduce1_instances(instances);
-    add_device_softmax_i8_i8_rank3_reduce2_instances(instances);
-    add_device_softmax_i8_i8_rank3_reduce3_instances(instances);
-}
-
-void add_device_softmax_i8_i8_rank4_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>& instances)
-{
-    add_device_softmax_i8_i8_rank4_reduce1_instances(instances);
-    add_device_softmax_i8_i8_rank4_reduce2_instances(instances);
-    add_device_softmax_i8_i8_rank4_reduce3_instances(instances);
-    add_device_softmax_i8_i8_rank4_reduce4_instances(instances);
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp
deleted file mode 100644
index 3e2cf8d06..000000000
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp
+++ /dev/null
@@ -1,27 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <vector>
-
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-static constexpr index_t RANK = 3;
-
-void add_device_softmax_i8_i8_rank3_reduce1_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
-{
-    add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 1>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp
deleted file mode 100644
index c8b038d50..000000000
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp
+++ /dev/null
@@ -1,27 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <vector>
-
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-static constexpr index_t RANK = 3;
-
-void add_device_softmax_i8_i8_rank3_reduce2_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
-{
-    add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 2>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
deleted file mode 100644
index 08995d99e..000000000
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
+++ /dev/null
@@ -1,27 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <vector>
-
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-static constexpr index_t RANK = 3;
-
-void add_device_softmax_i8_i8_rank3_reduce3_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
-{
-    add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 3>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
deleted file mode 100644
index 652601ee7..000000000
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
+++ /dev/null
@@ -1,27 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <vector>
-
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-static constexpr index_t RANK = 4;
-
-void add_device_softmax_i8_i8_rank4_reduce1_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
-{
-    add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 1>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
deleted file mode 100644
index 86caac1b6..000000000
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
+++ /dev/null
@@ -1,27 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <vector>
-
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-static constexpr index_t RANK = 4;
-
-void add_device_softmax_i8_i8_rank4_reduce2_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
-{
-    add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 2>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
deleted file mode 100644
index c46ae1a4e..000000000
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
+++ /dev/null
@@ -1,27 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <vector>
-
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-static constexpr index_t RANK = 4;
-
-void add_device_softmax_i8_i8_rank4_reduce3_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
-{
-    add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 3>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp
deleted file mode 100644
index 394814ff5..000000000
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp
+++ /dev/null
@@ -1,27 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <vector>
-
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-static constexpr index_t RANK = 4;
-
-void add_device_softmax_i8_i8_rank4_reduce4_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
-{
-    add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 4>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/profiler/include/profiler/profile_softmax_impl.hpp b/profiler/include/profiler/profile_softmax_impl.hpp
index 65b4be2a6..daaf56514 100644
--- a/profiler/include/profiler/profile_softmax_impl.hpp
+++ b/profiler/include/profiler/profile_softmax_impl.hpp
@@ -40,7 +40,11 @@ template <> std::string type_to_string<int8_t>()  { return "int8"; }
 template <> std::string type_to_string<int32_t>() { return "int32"; }
 // clang-format on
 
-template <typename InDataType, typename AccDataType, typename OutDataType, index_t Rank>
+template <typename InDataType,
+          typename AccDataType,
+          typename OutDataType,
+          index_t Rank,
+          index_t NumReduceDim>
 bool profile_softmax_impl(int do_verification,
                           int init_method,
                           bool do_log,
@@ -54,7 +58,13 @@ bool profile_softmax_impl(int do_verification,
     if(Rank != in_length.size())
     {
         throw std::runtime_error("Input tensor rank is different from template argument Rank!");
-    }
+    };
+
+    if(NumReduceDim != reduce_dims.size())
+    {
+        throw std::runtime_error(
+            "Input reduce_dims rank is different from template argument NumReduceDim!");
+    };
 
     Tensor<InDataType> in = in_strides.empty() ? Tensor<InDataType>(in_length)
                                                : Tensor<InDataType>(in_length, in_strides);
@@ -92,8 +102,13 @@ bool profile_softmax_impl(int do_verification,
 
     // add device softmax instances
     using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-    using DeviceOp    = tensor_operation::device::
-        DeviceSoftmax<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>;
+    using DeviceOp    = tensor_operation::device::DeviceSoftmax<InDataType,
+                                                             AccDataType,
+                                                             OutDataType,
+                                                             PassThrough,
+                                                             PassThrough,
+                                                             Rank,
+                                                             NumReduceDim>;
 
     // get device op instances
     const auto instances = tensor_operation::device::instance::DeviceOperationInstanceFactory<
@@ -112,13 +127,6 @@ bool profile_softmax_impl(int do_verification,
 
     for(auto& inst_ptr : instances)
     {
-        // Is this user's responsibility to check if problem mismatches kernel instance (ie. rank 3
-        // problem to rank 4 kernel) other than invoking IsSupportedArgument()?
-        if(!(inst_ptr->GetNumReduceDim() == static_cast<index_t>(reduce_dims.size())))
-        {
-            continue;
-        }
-
         auto argument_ptr = inst_ptr->MakeArgumentPointer(in_tensor_lengths,
                                                           in_tensor_strides,
                                                           reduce_dims,
diff --git a/profiler/src/profile_softmax.cpp b/profiler/src/profile_softmax.cpp
index 77007ad13..dfe8d95c9 100644
--- a/profiler/src/profile_softmax.cpp
+++ b/profiler/src/profile_softmax.cpp
@@ -92,27 +92,76 @@ int profile_softmax(int argc, char* argv[])
     {
         if(data_type == SoftmaxDataType::F16_F16)
         {
-            ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 3>(do_verification,
-                                                                                 init_method,
-                                                                                 do_log,
-                                                                                 time_kernel,
-                                                                                 length,
-                                                                                 stride,
-                                                                                 reduce,
-                                                                                 double(alpha),
-                                                                                 double(beta));
+            if(reduce.size() == 1)
+                ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 3, 1>(
+                    do_verification,
+                    init_method,
+                    do_log,
+                    time_kernel,
+                    length,
+                    stride,
+                    reduce,
+                    double(alpha),
+                    double(beta));
+            else if(reduce.size() == 2)
+                ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 3, 2>(
+                    do_verification,
+                    init_method,
+                    do_log,
+                    time_kernel,
+                    length,
+                    stride,
+                    reduce,
+                    double(alpha),
+                    double(beta));
+            else if(reduce.size() == 3)
+                ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 3, 3>(
+                    do_verification,
+                    init_method,
+                    do_log,
+                    time_kernel,
+                    length,
+                    stride,
+                    reduce,
+                    double(alpha),
+                    double(beta));
+            else
+                throw std::runtime_error("invalid number of dimensions to reduce");
         }
         else if(data_type == SoftmaxDataType::F32_F32)
         {
-            ck::profiler::profile_softmax_impl<float, float, float, 3>(do_verification,
-                                                                       init_method,
-                                                                       do_log,
-                                                                       time_kernel,
-                                                                       length,
-                                                                       stride,
-                                                                       reduce,
-                                                                       double(alpha),
-                                                                       double(beta));
+            if(reduce.size() == 1)
+                ck::profiler::profile_softmax_impl<float, float, float, 3, 1>(do_verification,
+                                                                              init_method,
+                                                                              do_log,
+                                                                              time_kernel,
+                                                                              length,
+                                                                              stride,
+                                                                              reduce,
+                                                                              double(alpha),
+                                                                              double(beta));
+            else if(reduce.size() == 2)
+                ck::profiler::profile_softmax_impl<float, float, float, 3, 2>(do_verification,
+                                                                              init_method,
+                                                                              do_log,
+                                                                              time_kernel,
+                                                                              length,
+                                                                              stride,
+                                                                              reduce,
+                                                                              double(alpha),
+                                                                              double(beta));
+            else if(reduce.size() == 3)
+                ck::profiler::profile_softmax_impl<float, float, float, 3, 3>(do_verification,
+                                                                              init_method,
+                                                                              do_log,
+                                                                              time_kernel,
+                                                                              length,
+                                                                              stride,
+                                                                              reduce,
+                                                                              double(alpha),
+                                                                              double(beta));
+            else
+                throw std::runtime_error("invalid number of dimensions to reduce");
         }
         else
         {
@@ -124,27 +173,97 @@ int profile_softmax(int argc, char* argv[])
     {
         if(data_type == SoftmaxDataType::F16_F16)
         {
-            ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 4>(do_verification,
-                                                                                 init_method,
-                                                                                 do_log,
-                                                                                 time_kernel,
-                                                                                 length,
-                                                                                 stride,
-                                                                                 reduce,
-                                                                                 double(alpha),
-                                                                                 double(beta));
+            if(reduce.size() == 1)
+                ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 4, 1>(
+                    do_verification,
+                    init_method,
+                    do_log,
+                    time_kernel,
+                    length,
+                    stride,
+                    reduce,
+                    double(alpha),
+                    double(beta));
+            else if(reduce.size() == 2)
+                ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 4, 2>(
+                    do_verification,
+                    init_method,
+                    do_log,
+                    time_kernel,
+                    length,
+                    stride,
+                    reduce,
+                    double(alpha),
+                    double(beta));
+            else if(reduce.size() == 3)
+                ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 4, 3>(
+                    do_verification,
+                    init_method,
+                    do_log,
+                    time_kernel,
+                    length,
+                    stride,
+                    reduce,
+                    double(alpha),
+                    double(beta));
+            else if(reduce.size() == 4)
+                ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 4, 4>(
+                    do_verification,
+                    init_method,
+                    do_log,
+                    time_kernel,
+                    length,
+                    stride,
+                    reduce,
+                    double(alpha),
+                    double(beta));
+            else
+                throw std::runtime_error("invalid number of dimensions to reduce");
         }
         else if(data_type == SoftmaxDataType::F32_F32)
         {
-            ck::profiler::profile_softmax_impl<float, float, float, 4>(do_verification,
-                                                                       init_method,
-                                                                       do_log,
-                                                                       time_kernel,
-                                                                       length,
-                                                                       stride,
-                                                                       reduce,
-                                                                       double(alpha),
-                                                                       double(beta));
+            if(reduce.size() == 1)
+                ck::profiler::profile_softmax_impl<float, float, float, 4, 1>(do_verification,
+                                                                              init_method,
+                                                                              do_log,
+                                                                              time_kernel,
+                                                                              length,
+                                                                              stride,
+                                                                              reduce,
+                                                                              double(alpha),
+                                                                              double(beta));
+            else if(reduce.size() == 2)
+                ck::profiler::profile_softmax_impl<float, float, float, 4, 2>(do_verification,
+                                                                              init_method,
+                                                                              do_log,
+                                                                              time_kernel,
+                                                                              length,
+                                                                              stride,
+                                                                              reduce,
+                                                                              double(alpha),
+                                                                              double(beta));
+            else if(reduce.size() == 3)
+                ck::profiler::profile_softmax_impl<float, float, float, 4, 3>(do_verification,
+                                                                              init_method,
+                                                                              do_log,
+                                                                              time_kernel,
+                                                                              length,
+                                                                              stride,
+                                                                              reduce,
+                                                                              double(alpha),
+                                                                              double(beta));
+            else if(reduce.size() == 4)
+                ck::profiler::profile_softmax_impl<float, float, float, 4, 4>(do_verification,
+                                                                              init_method,
+                                                                              do_log,
+                                                                              time_kernel,
+                                                                              length,
+                                                                              stride,
+                                                                              reduce,
+                                                                              double(alpha),
+                                                                              double(beta));
+            else
+                throw std::runtime_error("invalid number of dimensions to reduce");
         }
         else
         {
diff --git a/test/softmax/test_softmax_rank3.cpp b/test/softmax/test_softmax_rank3.cpp
index 24ad912d8..43ae11bf1 100644
--- a/test/softmax/test_softmax_rank3.cpp
+++ b/test/softmax/test_softmax_rank3.cpp
@@ -13,7 +13,6 @@ using I = ck::Number<N>;
 
 using F16 = ck::half_t;
 using F32 = float;
-using I8  = int8_t;
 
 template <typename Tuple>
 class TestSoftmax : public ck::TestSoftmax<Tuple>
@@ -24,8 +23,7 @@ class TestSoftmax : public ck::TestSoftmax<Tuple>
 using KernelTypes = ::testing::Types<
     //         InDataType, AccDataType, OutDataType, Rank
     std::tuple<       F16,         F32,         F16,    I<3>>,
-    std::tuple<       F32,         F32,         F32,    I<3>>,
-    std::tuple<        I8,         F32,          I8,    I<3>>
+    std::tuple<       F32,         F32,         F32,    I<3>>
     >;
 // clang-format on
 
diff --git a/test/softmax/test_softmax_rank4.cpp b/test/softmax/test_softmax_rank4.cpp
index b58301fb1..5cf96bbaa 100644
--- a/test/softmax/test_softmax_rank4.cpp
+++ b/test/softmax/test_softmax_rank4.cpp
@@ -13,7 +13,6 @@ using I = ck::Number<N>;
 
 using F16 = ck::half_t;
 using F32 = float;
-using I8  = int8_t;
 
 template <typename Tuple>
 class TestSoftmax : public ck::TestSoftmax<Tuple>
@@ -24,8 +23,7 @@ class TestSoftmax : public ck::TestSoftmax<Tuple>
 using KernelTypes = ::testing::Types<
     //         InDataType, AccDataType, OutDataType, Rank
     std::tuple<       F16,         F32,         F16,    I<4>>,
-    std::tuple<       F32,         F32,         F32,    I<4>>,
-    std::tuple<        I8,         F32,          I8,    I<4>>
+    std::tuple<       F32,         F32,         F32,    I<4>>
     >;
 // clang-format on
 
diff --git a/test/softmax/test_softmax_util.hpp b/test/softmax/test_softmax_util.hpp
index e36231de8..1409af845 100644
--- a/test/softmax/test_softmax_util.hpp
+++ b/test/softmax/test_softmax_util.hpp
@@ -61,8 +61,92 @@ class TestSoftmax : public ::testing::Test
         int init_method = 1; // integer value initialization
         bool log        = false;
         std::vector<ck::index_t> strides; // intenionally empty, to get packed layout.
-        bool pass = ck::profiler::profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank>(
-            verify_, init_method, log, bench_, in_length, strides, reduce_dims, alpha, beta);
+        bool pass = false;
+
+        if constexpr(Rank == 3)
+        {
+            if(reduce_dims.size() == 1)
+                pass = ck::profiler::
+                    profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 1>(verify_,
+                                                                                        init_method,
+                                                                                        log,
+                                                                                        bench_,
+                                                                                        in_length,
+                                                                                        strides,
+                                                                                        reduce_dims,
+                                                                                        alpha,
+                                                                                        beta);
+            else if(reduce_dims.size() == 2)
+                pass = ck::profiler::
+                    profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 2>(verify_,
+                                                                                        init_method,
+                                                                                        log,
+                                                                                        bench_,
+                                                                                        in_length,
+                                                                                        strides,
+                                                                                        reduce_dims,
+                                                                                        alpha,
+                                                                                        beta);
+            else if(reduce_dims.size() == 3)
+                pass = ck::profiler::
+                    profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 3>(verify_,
+                                                                                        init_method,
+                                                                                        log,
+                                                                                        bench_,
+                                                                                        in_length,
+                                                                                        strides,
+                                                                                        reduce_dims,
+                                                                                        alpha,
+                                                                                        beta);
+        }
+        else if constexpr(Rank == 4)
+        {
+            if(reduce_dims.size() == 1)
+                pass = ck::profiler::
+                    profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 1>(verify_,
+                                                                                        init_method,
+                                                                                        log,
+                                                                                        bench_,
+                                                                                        in_length,
+                                                                                        strides,
+                                                                                        reduce_dims,
+                                                                                        alpha,
+                                                                                        beta);
+            else if(reduce_dims.size() == 2)
+                pass = ck::profiler::
+                    profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 2>(verify_,
+                                                                                        init_method,
+                                                                                        log,
+                                                                                        bench_,
+                                                                                        in_length,
+                                                                                        strides,
+                                                                                        reduce_dims,
+                                                                                        alpha,
+                                                                                        beta);
+            else if(reduce_dims.size() == 3)
+                pass = ck::profiler::
+                    profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 3>(verify_,
+                                                                                        init_method,
+                                                                                        log,
+                                                                                        bench_,
+                                                                                        in_length,
+                                                                                        strides,
+                                                                                        reduce_dims,
+                                                                                        alpha,
+                                                                                        beta);
+            else if(reduce_dims.size() == 4)
+                pass = ck::profiler::
+                    profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 4>(verify_,
+                                                                                        init_method,
+                                                                                        log,
+                                                                                        bench_,
+                                                                                        in_length,
+                                                                                        strides,
+                                                                                        reduce_dims,
+                                                                                        alpha,
+                                                                                        beta);
+        };
+
         EXPECT_TRUE(pass);
     }
 
-- 
GitLab


From 341ad956657a0ad3501af0dcceeddb5018449de6 Mon Sep 17 00:00:00 2001
From: rocking <ChunYu.Lai@amd.com>
Date: Mon, 19 Jun 2023 22:44:22 +0800
Subject: [PATCH 65/71] Maxpool bwd (#750)

* Add maxpool f32 kernel and example

* Revise copyright

* Add device pool bwd device op

* Support f16 and bf16

* Add compute datatype for reference code.
Prevent error in bf16

* Fix type error

* Remove layout

* Fix bf16 error

* Add f16 and bf16 example

* Add more operations

* Implement IsSupportedArgument

* Add changelog

* Add comment

* Add comment

* Remove useless header

* Move initialization of workspace to the run

* Move zeroing of din to the device operator

* Save din_length_raw

* Remove useless header

* Calculate gridsize according to the number of CU

* Calculate gridSize according to the number of CU.
Remove useless header

* Add put example

* Remove useless header

* Fix CI fail
---
 CHANGELOG.md                                  |   2 +
 example/49_maxpool2d_bwd/CMakeLists.txt       |   3 +
 .../49_maxpool2d_bwd/maxpool2d_bwd_bf16.cpp   |  62 ++++
 .../49_maxpool2d_bwd/maxpool2d_bwd_common.hpp | 222 ++++++++++++
 .../49_maxpool2d_bwd/maxpool2d_bwd_fp16.cpp   |  62 ++++
 .../49_maxpool2d_bwd/maxpool2d_bwd_fp32.cpp   |  62 ++++
 example/50_put_element/CMakeLists.txt         |   1 +
 example/50_put_element/put_element_fp16.cpp   |  88 +++++
 include/ck/host_utility/stream_utility.hpp    |   2 +-
 .../gpu/device/device_index_pool_bwd.hpp      |  32 ++
 .../gpu/device/device_put_element.hpp         |  36 ++
 .../impl/device_index_pool_bwd_impl.hpp       | 316 ++++++++++++++++++
 .../device/impl/device_put_element_impl.hpp   | 155 +++++++++
 .../gpu/grid/gridwise_put_element_1d.hpp      | 155 +++++++++
 .../cpu/reference_maxpool_bwd.hpp             | 103 ++++++
 .../cpu/reference_pool_fwd.hpp                |  20 +-
 16 files changed, 1310 insertions(+), 11 deletions(-)
 create mode 100644 example/49_maxpool2d_bwd/CMakeLists.txt
 create mode 100644 example/49_maxpool2d_bwd/maxpool2d_bwd_bf16.cpp
 create mode 100644 example/49_maxpool2d_bwd/maxpool2d_bwd_common.hpp
 create mode 100644 example/49_maxpool2d_bwd/maxpool2d_bwd_fp16.cpp
 create mode 100644 example/49_maxpool2d_bwd/maxpool2d_bwd_fp32.cpp
 create mode 100644 example/50_put_element/CMakeLists.txt
 create mode 100644 example/50_put_element/put_element_fp16.cpp
 create mode 100644 include/ck/tensor_operation/gpu/device/device_index_pool_bwd.hpp
 create mode 100644 include/ck/tensor_operation/gpu/device/device_put_element.hpp
 create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_index_pool_bwd_impl.hpp
 create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_put_element_impl.hpp
 create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_put_element_1d.hpp
 create mode 100644 library/include/ck/library/reference_tensor_operation/cpu/reference_maxpool_bwd.hpp

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 018835004..3898b5ce2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,6 +20,8 @@ Full documentation for Composable Kernel is not yet available.
 - Added multi-embeddings support (#542).
 - Added Navi3x blockwise GEMM and real GEMM support (#541).
 - Added Navi grouped ConvBwdWeight support (#505).
+- Added pool3d forward (#697).
+- Added maxpool backward (#750).
 
 ### Changed
 - Changed ...
diff --git a/example/49_maxpool2d_bwd/CMakeLists.txt b/example/49_maxpool2d_bwd/CMakeLists.txt
new file mode 100644
index 000000000..b29cf9ccb
--- /dev/null
+++ b/example/49_maxpool2d_bwd/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_example_executable(example_maxpool2d_bwd_bf16 maxpool2d_bwd_bf16.cpp)
+add_example_executable(example_maxpool2d_bwd_fp16 maxpool2d_bwd_fp16.cpp)
+add_example_executable(example_maxpool2d_bwd_fp32 maxpool2d_bwd_fp32.cpp)
diff --git a/example/49_maxpool2d_bwd/maxpool2d_bwd_bf16.cpp b/example/49_maxpool2d_bwd/maxpool2d_bwd_bf16.cpp
new file mode 100644
index 000000000..08a8009c6
--- /dev/null
+++ b/example/49_maxpool2d_bwd/maxpool2d_bwd_bf16.cpp
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/utility/reduction_enums.hpp"
+
+#include "maxpool2d_bwd_common.hpp"
+
+using InDataType      = ck::bhalf_t;
+using OutDataType     = ck::bhalf_t;
+using IndexDataType   = int32_t;
+using ComputeDataType = float;
+using DInDataType     = ck::bhalf_t;
+using DOutDataType    = ck::bhalf_t;
+
+static constexpr bool PropagateNan = false;
+
+int main()
+{
+    bool do_verification = true;
+    bool time_kernel     = false;
+
+    // Pool shape
+    ck::index_t N               = 1;
+    ck::index_t C               = 1;
+    ck::index_t Y               = 3;
+    ck::index_t X               = 3;
+    ck::index_t Hi              = 32;
+    ck::index_t Wi              = 32;
+    ck::index_t window_stride_h = 1;
+    ck::index_t window_stride_w = 1;
+    ck::index_t in_left_pad_h   = 0;
+    ck::index_t in_left_pad_w   = 0;
+    ck::index_t in_right_pad_h  = 0;
+    ck::index_t in_right_pad_w  = 0;
+
+    bool pass = maxpool_bwd_test<InDataType,
+                                 OutDataType,
+                                 IndexDataType,
+                                 ComputeDataType,
+                                 DInDataType,
+                                 DOutDataType,
+                                 PropagateNan>(do_verification,
+                                               time_kernel,
+                                               N,
+                                               C,
+                                               Y,
+                                               X,
+                                               Hi,
+                                               Wi,
+                                               window_stride_h,
+                                               window_stride_w,
+                                               in_left_pad_h,
+                                               in_left_pad_w,
+                                               in_right_pad_h,
+                                               in_right_pad_w);
+
+    return (pass ? 0 : 1);
+}
diff --git a/example/49_maxpool2d_bwd/maxpool2d_bwd_common.hpp b/example/49_maxpool2d_bwd/maxpool2d_bwd_common.hpp
new file mode 100644
index 000000000..045793cc2
--- /dev/null
+++ b/example/49_maxpool2d_bwd/maxpool2d_bwd_common.hpp
@@ -0,0 +1,222 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+
+#include "ck/ck.hpp"
+#include "ck/utility/reduction_enums.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_index_pool_bwd_impl.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_maxpool_bwd.hpp"
+
+template <typename InDataType, // forward-pool input element type
+          typename OutDataType, // forward-pool output element type
+          typename IndexDataType, // type of the indices written by the forward pool
+          typename ComputeDataType, // compute type given to the device and reference pools
+          typename DInDataType, // element type of din (backward result)
+          typename DOutDataType, // element type of dout (backward input)
+          bool PropagateNan> // only forwarded to the host reference forward pool
+bool maxpool_bwd_test(bool do_verification, // compare device results with host reference
+                      bool time_kernel, // passed to StreamConfig for both kernel runs
+                      ck::index_t N,
+                      ck::index_t C,
+                      ck::index_t Y, // window height
+                      ck::index_t X, // window width
+                      ck::index_t Hi, // input height
+                      ck::index_t Wi, // input width
+                      ck::index_t window_stride_h,
+                      ck::index_t window_stride_w,
+                      ck::index_t in_left_pad_h,
+                      ck::index_t in_left_pad_w,
+                      ck::index_t in_right_pad_h,
+                      ck::index_t in_right_pad_w)
+{ // Device max-pool fwd (with indices) + index-pool bwd; returns false only if a check fails.
+    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+    using DevicePoolFwdInstance =
+        ck::tensor_operation::device::DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C<
+            InDataType,      // InDataType
+            OutDataType,     // OutDataType
+            IndexDataType,   // IndexDataType
+            ComputeDataType, // ComputeDataType
+            ck::ReduceTensorOp::MAX,
+            true, // OutputIndex
+            64,   // BlockSize
+            64,   // ReduceMThreadClusterSize
+            1,    // ReduceKThreadClusterSize
+            4,    // ReduceMThreadSliceSize
+            1,    // ReduceKThreadSliceSize
+            1>;   // InSrcOutDstVectorSize
+
+    using DeviceMaxPoolBwdInstance = ck::tensor_operation::device::
+        DeviceIndexPoolBwdImpl<DOutDataType, IndexDataType, DInDataType, 4>; // 4 = InOutVectorSize
+
+    const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Y) / window_stride_h + 1;
+    const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - X) / window_stride_w + 1;
+
+    const std::vector<ck::index_t> window_spatial_lengths{Y, X};
+    const std::vector<ck::index_t> window_strides{window_stride_h, window_stride_w};
+    const std::vector<ck::index_t> input_left_pads{in_left_pad_h, in_left_pad_w};
+    const std::vector<ck::index_t> input_right_pads{in_right_pad_h, in_right_pad_w};
+
+    auto f_host_tensor_descriptor =
+        [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W) {
+            using namespace ck::literals;
+            // the reference needs Tensor lengths in NCHW order; the strides give C stride 1
+            return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
+        };
+
+    // in
+    Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi));
+
+    // out
+    Tensor<OutDataType> out_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo));
+    Tensor<OutDataType> out_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo));
+
+    // indices
+    Tensor<IndexDataType> indices_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo));
+    Tensor<IndexDataType> indices_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo));
+
+    // dout
+    Tensor<DOutDataType> dout_n_c_ho_wo(f_host_tensor_descriptor(N, C, Ho, Wo));
+
+    // din
+    Tensor<DInDataType> din_n_c_hi_wi_host(f_host_tensor_descriptor(N, C, Hi, Wi));
+    Tensor<DInDataType> din_n_c_hi_wi_device(f_host_tensor_descriptor(N, C, Hi, Wi));
+
+    std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
+    std::cout << "out_n_c_ho_wo: " << out_n_c_ho_wo_host.mDesc << std::endl;
+    std::cout << "indices_n_c_ho_wo: " << indices_n_c_ho_wo_host.mDesc << std::endl;
+    std::cout << "dout_n_c_ho_wo: " << dout_n_c_ho_wo.mDesc << std::endl;
+    std::cout << "din_n_c_hi_wi: " << din_n_c_hi_wi_host.mDesc << std::endl;
+
+    in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{-1.0, 1.0});
+    dout_n_c_ho_wo.GenerateTensorValue(GeneratorTensor_3<DOutDataType>{-1.0, 1.0});
+
+    DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpaceSize());
+    DeviceMem out_device_buf(sizeof(OutDataType) *
+                             out_n_c_ho_wo_device.mDesc.GetElementSpaceSize());
+    DeviceMem indices_device_buf(sizeof(IndexDataType) *
+                                 indices_n_c_ho_wo_device.mDesc.GetElementSpaceSize());
+    DeviceMem dout_device_buf(sizeof(DOutDataType) * dout_n_c_ho_wo.mDesc.GetElementSpaceSize());
+    DeviceMem din_device_buf(sizeof(DInDataType) *
+                             din_n_c_hi_wi_device.mDesc.GetElementSpaceSize());
+
+    in_device_buf.ToDevice(in_n_c_hi_wi.mData.data()); // upload forward input
+    dout_device_buf.ToDevice(dout_n_c_ho_wo.mData.data()); // upload backward input
+
+    auto pool_fwd              = DevicePoolFwdInstance{};
+    auto pool_fwd_invoker_ptr  = pool_fwd.MakeInvokerPointer();
+    auto pool_fwd_argument_ptr = pool_fwd.MakeArgumentPointer(
+        static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
+        static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+        static_cast<IndexDataType*>(indices_device_buf.GetDeviceBuffer()),
+        {N, C, Hi, Wi},
+        window_spatial_lengths,
+        {N, C, Ho, Wo},
+        {C * Hi * Wi, 1, Wi * C, C}, // same stride pattern as f_host_tensor_descriptor
+        {C * Ho * Wo, 1, Wo * C, C},
+        {C * Ho * Wo, 1, Wo * C, C},
+        window_strides,
+        input_left_pads,
+        input_right_pads,
+        {2, 3}); // NOTE(review): presumably the pooled dims (H, W) - confirm
+
+    if(!pool_fwd.IsSupportedArgument(pool_fwd_argument_ptr.get()))
+    {
+        throw std::runtime_error("wrong! pool_fwd with the specified compilation parameters does "
+                                 "not support this problem");
+    }
+
+    float ave_time_fwd =
+        pool_fwd_invoker_ptr->Run(pool_fwd_argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+
+    auto pool_bwd              = DeviceMaxPoolBwdInstance{};
+    auto pool_bwd_invoker_ptr  = pool_bwd.MakeInvokerPointer();
+    auto pool_bwd_argument_ptr = pool_bwd.MakeArgumentPointer(
+        static_cast<DOutDataType*>(dout_device_buf.GetDeviceBuffer()),
+        static_cast<IndexDataType*>(indices_device_buf.GetDeviceBuffer()), // written by pool_fwd
+        static_cast<DInDataType*>(din_device_buf.GetDeviceBuffer()),
+        dout_n_c_ho_wo.mDesc.GetElementSpaceSize(),
+        din_n_c_hi_wi_device.mDesc.GetElementSpaceSize(),
+        window_spatial_lengths,
+        window_strides);
+
+    if(!pool_bwd.IsSupportedArgument(pool_bwd_argument_ptr.get()))
+    {
+        throw std::runtime_error("wrong! pool_bwd with the specified compilation parameters does "
+                                 "not support this problem");
+    }
+
+    size_t pool_bwd_workspace_sz = pool_bwd.GetWorkSpaceSize(pool_bwd_argument_ptr.get());
+    DeviceMem pool_bwd_workspace_device_buf(pool_bwd_workspace_sz);
+    pool_bwd.SetWorkSpacePointer(pool_bwd_argument_ptr.get(), // bwd Run may throw if unset
+                                 pool_bwd_workspace_device_buf.GetDeviceBuffer());
+
+    float ave_time_bwd =
+        pool_bwd_invoker_ptr->Run(pool_bwd_argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+
+    std::cout << "Pool fwd perf: " << ave_time_fwd << " ms" << std::endl;
+    std::cout << "Pool bwd perf: " << ave_time_bwd << " ms" << std::endl;
+
+    bool pass = true; // stays true when verification is skipped
+
+    if(do_verification)
+    {
+        using ReferencePoolingFwdInstance =
+            ck::tensor_operation::host::ReferencePoolingFwd<4,
+                                                            2,
+                                                            InDataType,
+                                                            OutDataType,
+                                                            ComputeDataType,
+                                                            IndexDataType,
+                                                            ck::ReduceTensorOp::MAX,
+                                                            PropagateNan,
+                                                            true>;
+
+        auto ref_pooling_fwd          = ReferencePoolingFwdInstance{};
+        auto ref_pooling_fwd_invoker  = ref_pooling_fwd.MakeInvoker();
+        auto ref_pooling_fwd_argument = ref_pooling_fwd.MakeArgument(in_n_c_hi_wi,
+                                                                     out_n_c_ho_wo_host,
+                                                                     indices_n_c_ho_wo_host,
+                                                                     window_spatial_lengths,
+                                                                     window_strides,
+                                                                     input_left_pads,
+                                                                     input_right_pads);
+        ref_pooling_fwd_invoker.Run(ref_pooling_fwd_argument);
+
+        using ReferencePoolingBwdInstance =
+            ck::tensor_operation::host::ReferenceMaxPoolBwd<DOutDataType,
+                                                            IndexDataType,
+                                                            ComputeDataType,
+                                                            DInDataType,
+                                                            PassThrough>;
+
+        auto ref_pooling_bwd          = ReferencePoolingBwdInstance{};
+        auto ref_pooling_bwd_invoker  = ref_pooling_bwd.MakeInvoker();
+        auto ref_pooling_bwd_argument = ref_pooling_bwd.MakeArgument(
+            dout_n_c_ho_wo, indices_n_c_ho_wo_host, din_n_c_hi_wi_host, PassThrough{});
+
+        ref_pooling_bwd_invoker.Run(ref_pooling_bwd_argument); // uses the host-computed indices
+
+        out_device_buf.FromDevice(out_n_c_ho_wo_device.mData.data()); // fetch device results
+        indices_device_buf.FromDevice(indices_n_c_ho_wo_device.mData.data());
+        din_device_buf.FromDevice(din_n_c_hi_wi_device.mData.data());
+
+        pass = pass && ck::utils::check_err(out_n_c_ho_wo_device, out_n_c_ho_wo_host);
+        pass = pass && ck::utils::check_err(indices_n_c_ho_wo_device, indices_n_c_ho_wo_host);
+        pass = pass && ck::utils::check_err(din_n_c_hi_wi_device, din_n_c_hi_wi_host);
+    }
+
+    return (pass);
+}
diff --git a/example/49_maxpool2d_bwd/maxpool2d_bwd_fp16.cpp b/example/49_maxpool2d_bwd/maxpool2d_bwd_fp16.cpp
new file mode 100644
index 000000000..c6b0b2c86
--- /dev/null
+++ b/example/49_maxpool2d_bwd/maxpool2d_bwd_fp16.cpp
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/utility/reduction_enums.hpp"
+
+#include "maxpool2d_bwd_common.hpp"
+
+using InDataType      = ck::half_t;
+using OutDataType     = ck::half_t;
+using IndexDataType   = int32_t;
+using ComputeDataType = float;
+using DInDataType     = ck::half_t;
+using DOutDataType    = ck::half_t;
+
+static constexpr bool PropagateNan = false;
+
+int main() // Example driver: one fixed fp16 pooling problem; exit code 0 on pass, 1 on failure.
+{
+    bool do_verification = true; // forwarded to maxpool_bwd_test
+    bool time_kernel     = false; // forwarded to maxpool_bwd_test
+
+    // Pool shape
+    ck::index_t N               = 1;
+    ck::index_t C               = 1;
+    ck::index_t Y               = 3; // window height
+    ck::index_t X               = 3; // window width
+    ck::index_t Hi              = 32; // input height
+    ck::index_t Wi              = 32; // input width
+    ck::index_t window_stride_h = 1; // stride < window: windows overlap
+    ck::index_t window_stride_w = 1; // stride < window: windows overlap
+    ck::index_t in_left_pad_h   = 0;
+    ck::index_t in_left_pad_w   = 0;
+    ck::index_t in_right_pad_h  = 0;
+    ck::index_t in_right_pad_w  = 0;
+
+    bool pass = maxpool_bwd_test<InDataType,
+                                 OutDataType,
+                                 IndexDataType,
+                                 ComputeDataType,
+                                 DInDataType,
+                                 DOutDataType,
+                                 PropagateNan>(do_verification,
+                                               time_kernel,
+                                               N,
+                                               C,
+                                               Y,
+                                               X,
+                                               Hi,
+                                               Wi,
+                                               window_stride_h,
+                                               window_stride_w,
+                                               in_left_pad_h,
+                                               in_left_pad_w,
+                                               in_right_pad_h,
+                                               in_right_pad_w);
+
+    return (pass ? 0 : 1); // process exit status: 0 = success
+}
diff --git a/example/49_maxpool2d_bwd/maxpool2d_bwd_fp32.cpp b/example/49_maxpool2d_bwd/maxpool2d_bwd_fp32.cpp
new file mode 100644
index 000000000..c79b84c48
--- /dev/null
+++ b/example/49_maxpool2d_bwd/maxpool2d_bwd_fp32.cpp
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/utility/reduction_enums.hpp"
+
+#include "maxpool2d_bwd_common.hpp"
+
+using InDataType      = float;
+using OutDataType     = float;
+using IndexDataType   = int32_t;
+using ComputeDataType = float;
+using DInDataType     = float;
+using DOutDataType    = float;
+
+static constexpr bool PropagateNan = false;
+
+int main() // Example driver: one fixed fp32 pooling problem; exit code 0 on pass, 1 on failure.
+{
+    bool do_verification = true; // forwarded to maxpool_bwd_test
+    bool time_kernel     = false; // forwarded to maxpool_bwd_test
+
+    // Pool shape
+    ck::index_t N               = 1;
+    ck::index_t C               = 1;
+    ck::index_t Y               = 2; // window height
+    ck::index_t X               = 2; // window width
+    ck::index_t Hi              = 32; // input height
+    ck::index_t Wi              = 32; // input width
+    ck::index_t window_stride_h = 2; // stride == window: no overlap
+    ck::index_t window_stride_w = 2; // stride == window: no overlap
+    ck::index_t in_left_pad_h   = 0;
+    ck::index_t in_left_pad_w   = 0;
+    ck::index_t in_right_pad_h  = 0;
+    ck::index_t in_right_pad_w  = 0;
+
+    bool pass = maxpool_bwd_test<InDataType,
+                                 OutDataType,
+                                 IndexDataType,
+                                 ComputeDataType,
+                                 DInDataType,
+                                 DOutDataType,
+                                 PropagateNan>(do_verification,
+                                               time_kernel,
+                                               N,
+                                               C,
+                                               Y,
+                                               X,
+                                               Hi,
+                                               Wi,
+                                               window_stride_h,
+                                               window_stride_w,
+                                               in_left_pad_h,
+                                               in_left_pad_w,
+                                               in_right_pad_h,
+                                               in_right_pad_w);
+
+    return (pass ? 0 : 1); // process exit status: 0 = success
+}
diff --git a/example/50_put_element/CMakeLists.txt b/example/50_put_element/CMakeLists.txt
new file mode 100644
index 000000000..1b0020ebc
--- /dev/null
+++ b/example/50_put_element/CMakeLists.txt
@@ -0,0 +1 @@
+add_example_executable(example_put_element_fp16 put_element_fp16.cpp)
diff --git a/example/50_put_element/put_element_fp16.cpp b/example/50_put_element/put_element_fp16.cpp
new file mode 100644
index 000000000..d4b6831bc
--- /dev/null
+++ b/example/50_put_element/put_element_fp16.cpp
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_put_element_impl.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+
+using XDataType     = ck::half_t;
+using YDataType     = ck::half_t;
+using IndexDataType = int32_t;
+
+using YElementwiseOp = ck::tensor_operation::element_wise::PassThrough;
+
+using DeviceInstance =
+    ck::tensor_operation::device::DevicePutElementImpl<XDataType,     // XDataType
+                                                       IndexDataType, // IndexDataType
+                                                       YDataType,     // YDataType
+                                                       YElementwiseOp,
+                                                       ck::InMemoryDataOperationEnum::Set,
+                                                       1>;
+
+int main() // Put-element example: runs the device op and checks y against a host loop.
+{
+    bool do_verification = true;
+    bool time_kernel     = false; // passed to StreamConfig for the kernel run
+
+    int N = 1024; // element count used for x, indices and y
+
+    Tensor<XDataType> x(HostTensorDescriptor{N, 1});
+    Tensor<IndexDataType> indices(HostTensorDescriptor{N, 1});
+    Tensor<YDataType> y(HostTensorDescriptor{N, 1});
+
+    x.GenerateTensorValue(GeneratorTensor_3<XDataType>{-1.0, 1.0});
+    for(int i = 0; i < N; ++i)
+        indices(i) = i; // identity index map
+
+    DeviceMem x_device_buf(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
+    DeviceMem y_device_buf(sizeof(YDataType) * y.mDesc.GetElementSpaceSize());
+    DeviceMem indices_device_buf(sizeof(IndexDataType) * indices.mDesc.GetElementSpaceSize());
+
+    x_device_buf.ToDevice(x.mData.data());
+    indices_device_buf.ToDevice(indices.mData.data());
+
+    auto put_instance     = DeviceInstance{};
+    auto put_invoker_ptr  = put_instance.MakeInvokerPointer();
+    auto put_argument_ptr = put_instance.MakeArgumentPointer(
+        static_cast<XDataType*>(x_device_buf.GetDeviceBuffer()),
+        static_cast<IndexDataType*>(indices_device_buf.GetDeviceBuffer()),
+        static_cast<YDataType*>(y_device_buf.GetDeviceBuffer()),
+        N, // input_length (name per the DevicePutElement interface)
+        N, // output_length (name per the DevicePutElement interface)
+        YElementwiseOp{});
+
+    if(!put_instance.IsSupportedArgument(put_argument_ptr.get()))
+    {
+        throw std::runtime_error("argument is not supported!");
+    }
+
+    float ave_time =
+        put_invoker_ptr->Run(put_argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+
+    std::cout << "perf: " << ave_time << " ms" << std::endl;
+
+    bool pass = true; // stays true when verification is skipped
+    if(do_verification)
+    {
+        Tensor<YDataType> y_host(HostTensorDescriptor{N, 1});
+
+        for(int i = 0; i < N; ++i) // host reference: y_host[indices[i]] = x[i]
+        {
+            IndexDataType idx = indices(i);
+            y_host(idx)       = x(i);
+        }
+
+        y_device_buf.FromDevice(y.mData.data()); // fetch device result into y
+        pass = ck::utils::check_err(y, y_host);
+    }
+
+    return (pass ? 0 : 1); // process exit status: 0 = success
+}
diff --git a/include/ck/host_utility/stream_utility.hpp b/include/ck/host_utility/stream_utility.hpp
index ef05f2e26..9ab49489b 100644
--- a/include/ck/host_utility/stream_utility.hpp
+++ b/include/ck/host_utility/stream_utility.hpp
@@ -8,7 +8,7 @@
 #include "ck/stream_config.hpp"
 #include "ck/host_utility/hip_check_error.hpp"
 
-static int getAvailableComputeUnitCount(const StreamConfig& stream_config)
+static inline int getAvailableComputeUnitCount(const StreamConfig& stream_config)
 {
     constexpr int MAX_MASK_DWORDS = 64;
 
diff --git a/include/ck/tensor_operation/gpu/device/device_index_pool_bwd.hpp b/include/ck/tensor_operation/gpu/device/device_index_pool_bwd.hpp
new file mode 100644
index 000000000..bf81ed9f5
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/device/device_index_pool_bwd.hpp
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <vector>
+
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+// Backward interface for pooling that uses an indexable operation, such as MaxPool, MinPool, etc.
+template <typename DOutDataType, typename IndexDataType, typename DInDataType>
+struct DeviceIndexPoolBwd : public BaseOperator // abstract; implemented by DeviceIndexPoolBwdImpl
+{
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_dout, // read-only dout buffer
+                        const void* p_indices, // read-only index buffer
+                        void* p_din, // din buffer that receives the result
+                        index_t dout_length, // number of dout elements
+                        index_t din_length, // number of din elements
+                        std::vector<ck::index_t> window_lengths, // pooling window size per dim
+                        std::vector<ck::index_t> window_strides) = 0; // window stride per dim
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0; // invoker that runs an argument
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/device/device_put_element.hpp b/include/ck/tensor_operation/gpu/device/device_put_element.hpp
new file mode 100644
index 000000000..918682749
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/device/device_put_element.hpp
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <vector>
+
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/utility/reduction_enums.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+// Abstract device-op interface: output[indices] = input
+template <typename InDataType,
+          typename IndexDataType,
+          typename OutDataType,
+          typename ElementwiseOperation, // e.g. element_wise::PassThrough
+          InMemoryDataOperationEnum Op> // write mode, e.g. InMemoryDataOperationEnum::Set
+struct DevicePutElement : public BaseOperator
+{
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_input, // read-only input buffer
+                        const void* p_indices, // read-only index buffer
+                        void* p_output, // buffer that receives the result
+                        index_t input_length, // number of input elements
+                        index_t output_length, // number of output elements
+                        ElementwiseOperation elementwise_op) = 0; // NOTE(review): use not shown
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0; // invoker that runs an argument
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_index_pool_bwd_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_index_pool_bwd_impl.hpp
new file mode 100644
index 000000000..175994d49
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/device/impl/device_index_pool_bwd_impl.hpp
@@ -0,0 +1,316 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/device_index_pool_bwd.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_put_element_1d.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+#include "ck/host_utility/stream_utility.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+// din[indices] = dout (Set, or AtomicAdd when pooling windows overlap)
+template <typename DOutDataType,
+          typename IndexDataType,
+          typename DInDataType,
+          ck::index_t InOutVectorSize>
+struct DeviceIndexPoolBwdImpl : public DeviceIndexPoolBwd<DOutDataType, IndexDataType, DInDataType>
+{
+    using DInDataType_AutomicAddPreCast =
+        conditional_t<is_same_v<DInDataType, float> || is_same_v<DInDataType, double>,
+                      DInDataType,
+                      float>;
+
+    using PassThrough  = ck::tensor_operation::element_wise::PassThrough;
+    using UnaryConvert = ck::tensor_operation::element_wise::UnaryConvert;
+
+    static constexpr auto I0 = Number<0>{};
+
+    template <typename Desc_M>
+    static auto PadDescriptor_M_1d(Desc_M desc_m, index_t loop_step)
+    { // Right-pads dim 0 of desc_m by integer_least_multiple(m, loop_step) - m elements.
+        const auto m   = desc_m.GetLength(I0); // current length of dimension 0
+        const auto pad = math::integer_least_multiple(m, loop_step) - m; // padding amount
+        const auto desc_m_pad =
+            transform_tensor_descriptor(desc_m,
+                                        make_tuple(make_right_pad_transform(m, pad)),
+                                        make_tuple(Sequence<0>{}),
+                                        make_tuple(Sequence<0>{}));
+        return desc_m_pad; // padded 1-D descriptor
+    }
+
+    static auto MakeDescriptor_M(index_t length, index_t loop_step)
+    { // Packed 1-D descriptor of `length` elements, right-padded via PadDescriptor_M_1d.
+        const auto desc_m = make_naive_tensor_descriptor_packed(make_tuple(length));
+        return PadDescriptor_M_1d(desc_m, loop_step);
+    }
+
+    using InOutGrid1dDesc = decltype(MakeDescriptor_M(1, 1));
+
+    using GridwisePutElementSet = GridwisePutElement_1D<InOutGrid1dDesc,
+                                                        DOutDataType,
+                                                        IndexDataType,
+                                                        DInDataType,
+                                                        PassThrough,
+                                                        InMemoryDataOperationEnum::Set,
+                                                        InOutVectorSize>;
+
+    using GridwisePutElementAtomicAdd = GridwisePutElement_1D<InOutGrid1dDesc,
+                                                              DOutDataType,
+                                                              IndexDataType,
+                                                              DInDataType_AutomicAddPreCast,
+                                                              PassThrough,
+                                                              InMemoryDataOperationEnum::AtomicAdd,
+                                                              InOutVectorSize>;
+
+    using GridwiseCasting = GridwiseElementwise_1D<Tuple<InOutGrid1dDesc>,
+                                                   Tuple<InOutGrid1dDesc>,
+                                                   Tuple<const DInDataType_AutomicAddPreCast*>,
+                                                   Tuple<DInDataType*>,
+                                                   UnaryConvert,
+                                                   InOutVectorSize,
+                                                   Sequence<InOutVectorSize>,
+                                                   Sequence<InOutVectorSize>>;
+
+    struct Argument : public BaseArgument // inputs consumed by Invoker::Run
+    {
+        Argument(const DOutDataType* p_dout,
+                 const IndexDataType* p_indices,
+                 DInDataType* p_din,
+                 index_t dout_length,
+                 index_t din_length,
+                 const std::vector<ck::index_t>& window_lengths, // only used to detect overlap
+                 const std::vector<ck::index_t>& window_strides) // only used to detect overlap
+            : p_dout_{p_dout},
+              p_indices_{p_indices},
+              p_din_{p_din},
+              dout_length_raw_{dout_length},
+              din_length_raw_{din_length},
+              blockSize_{256}, // fixed block size used for every kernel launch in Run
+              windowOverlap_{false}
+        {
+            for(size_t i = 0; i < window_lengths.size(); ++i) // .at() throws if strides is shorter
+            {
+                windowOverlap_ |= window_lengths.at(i) > window_strides.at(i);
+            }
+        }
+
+        const DOutDataType* p_dout_;
+        const IndexDataType* p_indices_;
+        DInDataType* p_din_;
+        index_t dout_length_raw_; // unpadded length; Run pads it to a multiple of loop_step
+        index_t din_length_raw_; // unpadded length; also sizes the memset in Run
+        index_t blockSize_;
+        bool windowOverlap_; // true if any window length > its stride; Run then uses AtomicAdd
+    };
+
+    struct Invoker : public BaseInvoker
+    {
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            index_t gridSize               = getAvailableComputeUnitCount(stream_config);
+            index_t loop_step              = gridSize * arg.blockSize_ * InOutVectorSize;
+            InOutGrid1dDesc din_grid_desc  = MakeDescriptor_M(arg.din_length_raw_, loop_step);
+            InOutGrid1dDesc dout_grid_desc = MakeDescriptor_M(arg.dout_length_raw_, loop_step);
+
+            if constexpr(is_same_v<DInDataType, float> || is_same_v<DInDataType, double>)
+            {
+                hip_check_error(hipMemsetAsync(arg.p_din_,
+                                               0,
+                                               arg.din_length_raw_ * sizeof(DInDataType),
+                                               stream_config.stream_id_));
+
+                if(arg.windowOverlap_)
+                {
+                    const auto put_kernel = kernel_put_element_1d<GridwisePutElementAtomicAdd,
+                                                                  InOutGrid1dDesc,
+                                                                  DOutDataType,
+                                                                  IndexDataType,
+                                                                  DInDataType,
+                                                                  PassThrough>;
+
+                    return launch_and_time_kernel(stream_config,
+                                                  put_kernel,
+                                                  dim3(gridSize),
+                                                  dim3(arg.blockSize_),
+                                                  0,
+                                                  dout_grid_desc,
+                                                  arg.p_dout_,
+                                                  arg.p_indices_,
+                                                  arg.p_din_,
+                                                  PassThrough{});
+                }
+                else
+                {
+                    const auto put_kernel = kernel_put_element_1d<GridwisePutElementSet,
+                                                                  InOutGrid1dDesc,
+                                                                  DOutDataType,
+                                                                  IndexDataType,
+                                                                  DInDataType,
+                                                                  PassThrough>;
+
+                    return launch_and_time_kernel(stream_config,
+                                                  put_kernel,
+                                                  dim3(gridSize),
+                                                  dim3(arg.blockSize_),
+                                                  0,
+                                                  dout_grid_desc,
+                                                  arg.p_dout_,
+                                                  arg.p_indices_,
+                                                  arg.p_din_,
+                                                  PassThrough{});
+                }
+            }
+            else
+            {
+                if(arg.windowOverlap_)
+                {
+                    if(arg.p_workspace_ == nullptr)
+                        throw std::runtime_error("wrong! WorkSpace pointer has not been set");
+
+                    hip_check_error(
+                        hipMemsetAsync(arg.p_workspace_,
+                                       0,
+                                       arg.din_length_raw_ * sizeof(DInDataType_AutomicAddPreCast),
+                                       stream_config.stream_id_));
+
+                    const auto put_kernel = kernel_put_element_1d<GridwisePutElementAtomicAdd,
+                                                                  InOutGrid1dDesc,
+                                                                  DOutDataType,
+                                                                  IndexDataType,
+                                                                  DInDataType_AutomicAddPreCast,
+                                                                  PassThrough>;
+
+                    const auto cast_kernel =
+                        kernel_elementwise_1d<GridwiseCasting,
+                                              Tuple<InOutGrid1dDesc>,
+                                              Tuple<InOutGrid1dDesc>,
+                                              Tuple<const DInDataType_AutomicAddPreCast*>,
+                                              Tuple<DInDataType*>,
+                                              UnaryConvert>;
+
+                    float elapsed_time = launch_and_time_kernel(
+                        stream_config,
+                        put_kernel,
+                        dim3(gridSize),
+                        dim3(arg.blockSize_),
+                        0,
+                        dout_grid_desc,
+                        arg.p_dout_,
+                        arg.p_indices_,
+                        static_cast<DInDataType_AutomicAddPreCast*>(arg.p_workspace_),
+                        PassThrough{});
+
+                    elapsed_time += launch_and_time_kernel(
+                        stream_config,
+                        cast_kernel,
+                        dim3(gridSize),
+                        dim3(arg.blockSize_),
+                        0,
+                        ck::make_tuple(din_grid_desc),
+                        ck::make_tuple(din_grid_desc),
+                        static_cast<DInDataType_AutomicAddPreCast*>(arg.p_workspace_),
+                        arg.p_din_,
+                        UnaryConvert{});
+
+                    return elapsed_time;
+                }
+                else
+                {
+                    const auto put_kernel = kernel_put_element_1d<GridwisePutElementSet,
+                                                                  InOutGrid1dDesc,
+                                                                  DOutDataType,
+                                                                  IndexDataType,
+                                                                  DInDataType,
+                                                                  PassThrough>;
+
+                    hip_check_error(hipMemsetAsync(arg.p_din_,
+                                                   0,
+                                                   arg.din_length_raw_ * sizeof(DInDataType),
+                                                   stream_config.stream_id_));
+
+                    return launch_and_time_kernel(stream_config,
+                                                  put_kernel,
+                                                  dim3(gridSize),
+                                                  dim3(arg.blockSize_),
+                                                  0,
+                                                  dout_grid_desc,
+                                                  arg.p_dout_,
+                                                  arg.p_indices_,
+                                                  arg.p_din_,
+                                                  PassThrough{});
+                }
+            }
+        }
+
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        }
+    };
+
+    // Workspace size in bytes: a din-sized DInDataType_AutomicAddPreCast buffer when windows
+    // overlap and DInDataType is neither float nor double, otherwise 0.
+    size_t GetWorkSpaceSize(const BaseArgument* pArg) const override
+    {
+        const Argument* pArg_ = dynamic_cast<const Argument*>(pArg);
+        // dynamic_cast yields nullptr for a null or foreign argument; report no workspace then.
+        if(pArg_ == nullptr || !pArg_->windowOverlap_ || is_same_v<DInDataType, float> ||
+           is_same_v<DInDataType, double>)
+            return 0;
+
+        return pArg_->din_length_raw_ * sizeof(DInDataType_AutomicAddPreCast);
+    }
+
+    // Supported only for this operator's own Argument type, and only when both raw lengths
+    // are multiples of InOutVectorSize.
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
+
+        // dynamic_cast yields nullptr for a null or foreign argument; reject it, don't deref it.
+        return pArg != nullptr && pArg->din_length_raw_ % InOutVectorSize == 0 &&
+               pArg->dout_length_raw_ % InOutVectorSize == 0;
+    }
+
+    std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_dout,
+                        const void* p_indices,
+                        void* p_din,
+                        index_t dout_length,
+                        index_t din_length,
+                        std::vector<ck::index_t> window_lengths,
+                        std::vector<ck::index_t> window_strides) override
+    {
+        // Assume p_dout, p_indices, p_din are packed memory space, dout_length and din_length are
+        // physical size of the packed tensor
+        return std::make_unique<Argument>(static_cast<const DOutDataType*>(p_dout),
+                                          static_cast<const IndexDataType*>(p_indices),
+                                          static_cast<DInDataType*>(p_din),
+                                          dout_length,
+                                          din_length,
+                                          window_lengths,
+                                          window_strides);
+    }
+
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_put_element_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_put_element_impl.hpp
new file mode 100644
index 000000000..7334da0e3
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/device/impl/device_put_element_impl.hpp
@@ -0,0 +1,155 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/device_put_element.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_put_element_1d.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+#include "ck/host_utility/stream_utility.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+// output[indices] = input
+template <typename InDataType,
+          typename IndexDataType,
+          typename OutDataType,
+          typename ElementwiseOperation,
+          InMemoryDataOperationEnum MemOp,
+          ck::index_t InVectorSize>
+struct DevicePutElementImpl
+    : public DevicePutElement<InDataType, IndexDataType, OutDataType, ElementwiseOperation, MemOp>
+{
+    template <typename Desc_M>
+    static auto PadDescriptor_M_1d(Desc_M desc_m, index_t gridSize, index_t blockSize)
+    {
+        constexpr auto I0 = Number<0>{};
+
+        const auto m            = desc_m.GetLength(I0);
+        const index_t loop_step = gridSize * blockSize * InVectorSize;
+        const auto pad          = math::integer_least_multiple(m, loop_step) - m;
+        const auto desc_m_pad =
+            transform_tensor_descriptor(desc_m,
+                                        make_tuple(make_right_pad_transform(m, pad)),
+                                        make_tuple(Sequence<0>{}),
+                                        make_tuple(Sequence<0>{}));
+        return desc_m_pad;
+    }
+
+    static auto MakeDescriptor_M(index_t length, index_t gridSize, index_t blockSize)
+    {
+        const auto desc_m = make_naive_tensor_descriptor_packed(make_tuple(length));
+        return PadDescriptor_M_1d(desc_m, gridSize, blockSize);
+    }
+
+    using InGrid1dDesc = decltype(MakeDescriptor_M(1, 1, 1));
+
+    using GridwisePutElement = GridwisePutElement_1D<InGrid1dDesc,
+                                                     InDataType,
+                                                     IndexDataType,
+                                                     OutDataType,
+                                                     ElementwiseOperation,
+                                                     MemOp,
+                                                     InVectorSize>;
+
+    struct Argument : public BaseArgument
+    {
+        Argument(const InDataType* p_input,
+                 const IndexDataType* p_indices,
+                 OutDataType* p_output,
+                 index_t input_length,
+                 ElementwiseOperation elementwise_op)
+            : p_input_{p_input},
+              p_indices_{p_indices},
+              p_output_{p_output},
+              input_length_raw_{input_length},
+              elementwise_op_{elementwise_op},
+              blockSize_{256}
+        {
+        }
+
+        const InDataType* p_input_;
+        const IndexDataType* p_indices_;
+        OutDataType* p_output_;
+        index_t input_length_raw_;
+        ElementwiseOperation elementwise_op_;
+        index_t blockSize_;
+    };
+
+    struct Invoker : public BaseInvoker
+    {
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            index_t gridSize = getAvailableComputeUnitCount(stream_config);
+            InGrid1dDesc in_grid_desc =
+                MakeDescriptor_M(arg.input_length_raw_, gridSize, arg.blockSize_);
+
+            const auto kernel = kernel_put_element_1d<GridwisePutElement,
+                                                      InGrid1dDesc,
+                                                      InDataType,
+                                                      IndexDataType,
+                                                      OutDataType,
+                                                      ElementwiseOperation>;
+
+            float elapsed_time = launch_and_time_kernel(stream_config,
+                                                        kernel,
+                                                        dim3(gridSize),
+                                                        dim3(arg.blockSize_),
+                                                        0,
+                                                        in_grid_desc,
+                                                        arg.p_input_,
+                                                        arg.p_indices_,
+                                                        arg.p_output_,
+                                                        arg.elementwise_op_);
+            return elapsed_time;
+        }
+
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        }
+    };
+
+    // Supported only for this operator's own Argument type, and only when the raw input
+    // length is a multiple of InVectorSize.
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
+
+        // dynamic_cast yields nullptr for a null or foreign argument; reject it rather than
+        // dereferencing it.
+        return pArg != nullptr && pArg->input_length_raw_ % InVectorSize == 0;
+    }
+
+    std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_input,
+                                                      const void* p_indices,
+                                                      void* p_output,
+                                                      index_t input_length,
+                                                      index_t,
+                                                      ElementwiseOperation elementwise_op) override
+    {
+        return std::make_unique<Argument>(static_cast<const InDataType*>(p_input),
+                                          static_cast<const IndexDataType*>(p_indices),
+                                          static_cast<OutDataType*>(p_output),
+                                          input_length,
+                                          elementwise_op);
+    }
+
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_put_element_1d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_put_element_1d.hpp
new file mode 100644
index 000000000..8f72f88b0
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_put_element_1d.hpp
@@ -0,0 +1,155 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
+
+namespace ck {
+
+template <typename GridwisePutElementwise1dFunctor,
+          typename InGrid1dDesc,
+          typename InDataType,
+          typename IndexDataType,
+          typename OutDataType,
+          typename ElementwiseOperation>
+__global__ void kernel_put_element_1d(const InGrid1dDesc in_grid_1d_desc,
+                                      const InDataType* __restrict__ p_in_global,
+                                      const IndexDataType* __restrict__ p_indices_global,
+                                      OutDataType* __restrict__ p_out_global,
+                                      const ElementwiseOperation elementwise_op)
+{
+    GridwisePutElementwise1dFunctor::Run(
+        in_grid_1d_desc, p_in_global, p_indices_global, p_out_global, elementwise_op);
+}
+
+// output[indices] = input
+template <typename InGrid1dDesc,
+          typename InDataType,
+          typename IndexDataType,
+          typename OutDataType,
+          typename ElementwiseOperation,
+          InMemoryDataOperationEnum MemOp,
+          index_t InVectorSize>
+struct GridwisePutElement_1D
+{
+    static constexpr auto I0 = Number<0>{};
+
+    static constexpr auto thread_buffer_desc_m =
+        make_naive_tensor_descriptor_packed(make_tuple(Number<InVectorSize>{}));
+
+    __device__ static void Run(const InGrid1dDesc& in_grid_1d_desc,
+                               const InDataType* __restrict__ p_in_global,
+                               const IndexDataType* __restrict__ p_indices_global,
+                               OutDataType* __restrict__ p_out_global,
+                               const ElementwiseOperation& elementwise_op)
+    {
+        // Global Memory
+        const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_in_global, in_grid_1d_desc.GetElementSpaceSize());
+
+        const auto indices_global_buf =
+            make_dynamic_buffer<AddressSpaceEnum::Global>(p_indices_global,
+                                                          in_grid_1d_desc.GetElementSpaceSize(),
+                                                          NumericLimits<IndexDataType>::Lowest());
+
+        // VGPR
+        StaticBuffer<AddressSpaceEnum::Vgpr, InDataType, InVectorSize, true> in_thread_buf;
+        StaticBuffer<AddressSpaceEnum::Vgpr, IndexDataType, InVectorSize, true> indices_thread_buf;
+
+        // Thread id, Block id and index
+        const index_t thread_global_id  = get_thread_global_1d_id();
+        const auto thread_global_offset = make_multi_index(thread_global_id * InVectorSize);
+        const index_t blockSize         = get_block_size();
+        const index_t blockPerGrid      = get_grid_size();
+        const auto M                    = in_grid_1d_desc.GetLength(I0);
+        const index_t loop_step         = blockPerGrid * blockSize * InVectorSize;
+        const auto loop_step_index      = make_multi_index(loop_step);
+
+        auto in_global_load =
+            ThreadwiseTensorSliceTransfer_v2<InDataType,
+                                             InDataType,
+                                             decltype(in_grid_1d_desc),
+                                             decltype(thread_buffer_desc_m),
+                                             Sequence<InVectorSize>, // SliceLengths
+                                             Sequence<0>,            // DimAccessOrder
+                                             0,                      // SrcVectorDim
+                                             InVectorSize,           // ScalarPerVector
+                                             1,                      // SrcScalarStrideInVector
+                                             false>{in_grid_1d_desc, thread_global_offset};
+
+        auto indices_global_load =
+            ThreadwiseTensorSliceTransfer_v2<IndexDataType,
+                                             IndexDataType,
+                                             decltype(in_grid_1d_desc),
+                                             decltype(thread_buffer_desc_m),
+                                             Sequence<InVectorSize>, // SliceLengths
+                                             Sequence<0>,            // DimAccessOrder
+                                             0,                      // SrcVectorDim
+                                             InVectorSize,           // ScalarPerVector
+                                             1,                      // SrcScalarStrideInVector
+                                             false>{in_grid_1d_desc, thread_global_offset};
+
+        // Pre-tested loop: num_iter is 0 for an empty input (M == 0), in which case the body
+        // must not run at all.
+        for(index_t num_iter = M / loop_step; num_iter > 0; --num_iter)
+        {
+            in_global_load.Run(in_grid_1d_desc,
+                               in_global_buf,
+                               thread_buffer_desc_m,
+                               make_tuple(I0),
+                               in_thread_buf);
+
+            in_global_load.MoveSrcSliceWindow(in_grid_1d_desc, loop_step_index);
+
+            static_for<0, InVectorSize, 1>{}(
+                [&](auto iM) { elementwise_op(in_thread_buf(iM), in_thread_buf[iM]); });
+
+            indices_global_load.Run(in_grid_1d_desc,
+                                    indices_global_buf,
+                                    thread_buffer_desc_m,
+                                    make_tuple(I0),
+                                    indices_thread_buf);
+
+            indices_global_load.MoveSrcSliceWindow(in_grid_1d_desc, loop_step_index);
+
+            static_for<0, InVectorSize, 1>{}([&](auto iM) {
+                if(indices_thread_buf[iM] >= 0)
+                {
+                    if constexpr(MemOp == InMemoryDataOperationEnum::Set)
+                    {
+                        // User should guarantee each index in p_indices_global is different
+                        *(p_out_global + indices_thread_buf[iM]) =
+                            ck::type_convert<OutDataType>(in_thread_buf[iM]);
+                    }
+                    else if constexpr(MemOp == InMemoryDataOperationEnum::AtomicAdd)
+                    {
+                        atomic_add<OutDataType>(p_out_global + indices_thread_buf[iM],
+                                                ck::type_convert<OutDataType>(in_thread_buf[iM]));
+                    }
+                    else if constexpr(MemOp == InMemoryDataOperationEnum::AtomicMax)
+                    {
+                        atomic_max<OutDataType>(p_out_global + indices_thread_buf[iM],
+                                                ck::type_convert<OutDataType>(in_thread_buf[iM]));
+                    }
+                    else if constexpr(MemOp == InMemoryDataOperationEnum::Add)
+                    {
+                        // User should guarantee each index in p_indices_global is different
+                        *(p_out_global + indices_thread_buf[iM]) +=
+                            ck::type_convert<OutDataType>(in_thread_buf[iM]);
+                    }
+                    else
+                    {
+                        static_assert(MemOp == InMemoryDataOperationEnum::Set ||
+                                      MemOp == InMemoryDataOperationEnum::AtomicAdd ||
+                                      MemOp == InMemoryDataOperationEnum::AtomicMax ||
+                                      MemOp == InMemoryDataOperationEnum::Add);
+                    }
+                }
+            });
+        }
+    }
+};
+
+} // namespace ck
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_maxpool_bwd.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_maxpool_bwd.hpp
new file mode 100644
index 000000000..3f1fc6165
--- /dev/null
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_maxpool_bwd.hpp
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+#include <vector>
+
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace host {
+using namespace std;
+
+// Host reference for max-pool backward: din[indices[i]] += dout[i], accumulated in
+// ComputeDataType and converted to DInDataType at the end. Entries whose index lies outside
+// [0, din element-space size) are skipped.
+template <typename DOutDataType,
+          typename IndexDataType,
+          typename ComputeDataType,
+          typename DInDataType,
+          typename ElementwiseOperation>
+struct ReferenceMaxPoolBwd : public device::BaseOperator
+{
+    // Argument: stores references to the caller's tensors, so they must outlive it.
+    // elementwise_op is stored, but Invoker::Run does not apply it.
+    struct Argument : public device::BaseArgument
+    {
+        Argument(const Tensor<DOutDataType>& dout,
+                 const Tensor<IndexDataType>& indices,
+                 Tensor<DInDataType>& din,
+                 ElementwiseOperation elementwise_op)
+            : dout_(dout), indices_(indices), din_(din), elementwise_op_(elementwise_op)
+        {
+        }
+
+        const Tensor<DOutDataType>& dout_;
+        const Tensor<IndexDataType>& indices_;
+        Tensor<DInDataType>& din_;
+        ElementwiseOperation elementwise_op_;
+    };
+
+    // Invoker: runs the reference computation on the host.
+    struct Invoker : public device::BaseInvoker
+    {
+        // Scatter-adds dout into a zero-initialised ComputeDataType accumulator, then writes
+        // the converted sums to din. Always returns 0 (nothing is timed).
+        float Run(const Argument& arg)
+        {
+            const std::size_t din_length  = arg.din_.GetElementSpaceSize();
+            const std::size_t dout_length = arg.dout_.GetElementSpaceSize();
+            std::vector<ComputeDataType> buf(din_length, 0);
+
+            for(std::size_t i = 0; i < dout_length; ++i)
+            {
+                const auto index = arg.indices_.mData[i];
+                if(index >= 0 && static_cast<std::size_t>(index) < din_length)
+                    buf[static_cast<std::size_t>(index)] +=
+                        ck::type_convert<ComputeDataType>(arg.dout_.mData[i]);
+            }
+
+            for(std::size_t i = 0; i < din_length; ++i)
+                arg.din_.mData[i] = ck::type_convert<DInDataType>(buf[i]);
+            return 0;
+        }
+
+        // Type-erased entry point; p_arg must point to this operator's Argument.
+        float Run(const device::BaseArgument* p_arg,
+                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg));
+        }
+    };
+
+    // The reference implementation accepts every argument.
+    bool IsSupportedArgument(const device::BaseArgument*) override { return true; }
+
+    static auto MakeArgument(const Tensor<DOutDataType>& dout,
+                             const Tensor<IndexDataType>& indices,
+                             Tensor<DInDataType>& din,
+                             ElementwiseOperation elementwise_op)
+    {
+        return Argument{dout, indices, din, elementwise_op};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+
+    virtual std::unique_ptr<device::BaseInvoker> MakeInvokerPointer()
+    {
+        return std::make_unique<Invoker>();
+    }
+
+    // Returns the operator name followed by a newline.
+    std::string GetTypeString() const override { return "ReferenceMaxPoolBwd\n"; }
+};
+
+} // namespace host
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp
index b4b7a5a03..696fb5eaf 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp
@@ -100,8 +100,8 @@ struct ReferencePoolingFwd : public device::BaseOperator
                                    wi >= 0 &&
                                    wi < static_cast<ck::index_t>(arg.in_.mDesc.GetLengths()[4]))
                                 {
-                                    ComputeDataType currVal =
-                                        static_cast<ComputeDataType>(arg.in_(n, c, di, hi, wi));
+                                    ComputeDataType currVal = ck::type_convert<ComputeDataType>(
+                                        arg.in_(n, c, di, hi, wi));
 
                                     in_elementwise_op(currVal, currVal);
 
@@ -112,7 +112,7 @@ struct ReferencePoolingFwd : public device::BaseOperator
                     }
                     acc_elementwise_op(accuVal, accuVal);
 
-                    arg.out_(n, c, do_, ho, wo) = accuVal;
+                    arg.out_(n, c, do_, ho, wo) = ck::type_convert<OutDataType>(accuVal);
                 };
 
                 make_ParallelTensorFunctor(f_ncdhw,
@@ -151,8 +151,8 @@ struct ReferencePoolingFwd : public device::BaseOperator
                                    wi >= 0 &&
                                    wi < static_cast<ck::index_t>(arg.in_.mDesc.GetLengths()[4]))
                                 {
-                                    ComputeDataType currVal =
-                                        static_cast<ComputeDataType>(arg.in_(n, c, di, hi, wi));
+                                    ComputeDataType currVal = ck::type_convert<ComputeDataType>(
+                                        arg.in_(n, c, di, hi, wi));
                                     IndexDataType currIndex =
                                         arg.in_.GetOffsetFromMultiIndex(n, c, di, hi, wi);
 
@@ -166,7 +166,7 @@ struct ReferencePoolingFwd : public device::BaseOperator
 
                     acc_elementwise_op(accuVal, accuVal);
 
-                    arg.out_(n, c, do_, ho, wo)         = accuVal;
+                    arg.out_(n, c, do_, ho, wo)         = ck::type_convert<OutDataType>(accuVal);
                     arg.out_indices_(n, c, do_, ho, wo) = accuIndex;
                 };
 
@@ -212,7 +212,7 @@ struct ReferencePoolingFwd : public device::BaseOperator
                                wi < static_cast<ck::index_t>(arg.in_.mDesc.GetLengths()[3]))
                             {
                                 ComputeDataType currVal =
-                                    static_cast<ComputeDataType>(arg.in_(n, c, hi, wi));
+                                    ck::type_convert<ComputeDataType>(arg.in_(n, c, hi, wi));
 
                                 in_elementwise_op(currVal, currVal);
 
@@ -222,7 +222,7 @@ struct ReferencePoolingFwd : public device::BaseOperator
                     }
 
                     acc_elementwise_op(accuVal, accuVal);
-                    arg.out_(n, c, ho, wo) = accuVal;
+                    arg.out_(n, c, ho, wo) = ck::type_convert<OutDataType>(accuVal);
                 };
 
                 make_ParallelTensorFunctor(f_nchw,
@@ -255,7 +255,7 @@ struct ReferencePoolingFwd : public device::BaseOperator
                                wi < static_cast<ck::index_t>(arg.in_.mDesc.GetLengths()[3]))
                             {
                                 ComputeDataType currVal =
-                                    static_cast<ComputeDataType>(arg.in_(n, c, hi, wi));
+                                    ck::type_convert<ComputeDataType>(arg.in_(n, c, hi, wi));
 
                                 IndexDataType currIndex =
                                     arg.in_.GetOffsetFromMultiIndex(n, c, hi, wi);
@@ -268,7 +268,7 @@ struct ReferencePoolingFwd : public device::BaseOperator
                     }
 
                     acc_elementwise_op(accuVal, accuVal);
-                    arg.out_(n, c, ho, wo)         = accuVal;
+                    arg.out_(n, c, ho, wo)         = ck::type_convert<OutDataType>(accuVal);
                     arg.out_indices_(n, c, ho, wo) = accuIndex;
                 };
 
-- 
GitLab


From f0c620c42e753432ded96040abdac1bbae89aab5 Mon Sep 17 00:00:00 2001
From: Rostyslav Geyyer <46627076+geyyer@users.noreply.github.com>
Date: Mon, 19 Jun 2023 11:20:35 -0500
Subject: [PATCH 66/71] FP8 enablement - add a pseudorandom number generator,
 add conversion methods (#708)

* Add basic fp8 definitions and prn-generator

* Format

* Add fp8<->fp32 type_convert

* Format

* Split type_convert and cast_to/from_f8

* Format

* Minor fix

* Minor fix

* Move fp8 utils to a separate header

* Add elementwise ops

* Add fp8_convert_sr

* Format

* Add element op

* Eliminate magic numbers

* Split f8_convert_sr in host and device

* Format

* Add some constexpr

* Add a datatype test

* Format

* Another format

* Add fp8<->fp16 tests

* Update type_converts

* Format

* Add fp16 casting functions

* Format

* Use seed as a runtime arg

* Use element location for PRNG

* Format

* Add fp8<->fp16 to PassThrough element op

* Clean up

* Merge host and device implementations

* Add comments on rounding modes

* Remove leftover code

* Put type_converts into a separate header

* Put random number gen to a separate header

* Rearrange f8_utils' namespaces

* Refactor type_convert.hpp

* Move f8_t definition
---
 .../element/unary_element_wise_operation.hpp  |  48 ++++
 include/ck/utility/common_header.hpp          |   1 +
 include/ck/utility/data_type.hpp              | 177 +++----------
 include/ck/utility/f8_utils.hpp               | 250 ++++++++++++++++++
 include/ck/utility/inner_product.hpp          |   1 +
 include/ck/utility/random_gen.hpp             |  53 ++++
 include/ck/utility/reduction_operator.hpp     |   1 +
 include/ck/utility/type_convert.hpp           | 230 ++++++++++++++++
 .../ck/library/utility/host_tensor.hpp        |   1 +
 test/data_type/CMakeLists.txt                 |   3 +
 test/data_type/fp8.cpp                        | 123 +++++++++
 11 files changed, 743 insertions(+), 145 deletions(-)
 create mode 100644 include/ck/utility/f8_utils.hpp
 create mode 100644 include/ck/utility/random_gen.hpp
 create mode 100644 include/ck/utility/type_convert.hpp
 create mode 100644 test/data_type/fp8.cpp

diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
index c3e7706ef..4fb061fad 100644
--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -6,6 +6,7 @@
 #include "ck/utility/data_type.hpp"
 #include "ck/utility/math.hpp"
 #include "ck/utility/math_v2.hpp"
+#include "ck/utility/type_convert.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -81,6 +82,36 @@ struct PassThrough
         y = x;
     }
 #endif
+
+    template <>
+    __host__ __device__ void operator()<f8_t, f8_t>(f8_t& y, const f8_t& x) const
+    {
+        y = x;
+    }
+
+    template <>
+    __host__ __device__ void operator()<float, f8_t>(float& y, const f8_t& x) const
+    {
+        y = type_convert<float>(x);
+    }
+
+    template <>
+    __host__ __device__ void operator()<f8_t, float>(f8_t& y, const float& x) const
+    {
+        y = type_convert<f8_t>(x);
+    }
+
+    template <>
+    __host__ __device__ void operator()<half_t, f8_t>(half_t& y, const f8_t& x) const
+    {
+        y = type_convert<half_t>(x);
+    }
+
+    template <>
+    __host__ __device__ void operator()<f8_t, half_t>(f8_t& y, const half_t& x) const
+    {
+        y = type_convert<f8_t>(x);
+    }
 };
 
 struct UnaryConvert
@@ -109,6 +140,23 @@ struct ConvertBF16RTN
     }
 };
 
+struct ConvertF8SR
+{
+    // convert to fp8 using stochastic rounding (SR)
+    template <typename Y, typename X>
+    __host__ __device__ void operator()(Y& y, const X& x) const
+    {
+        // check Y datatype
+        static_assert(is_same<Y, f8_t>::value, "Data type is not supported by this operation!");
+
+        // check X datatype
+        static_assert(is_same<X, float>::value || is_same<X, half_t>::value,
+                      "Data type is not supported by this operation!");
+
+        y = f8_convert_sr<Y>(x);
+    }
+};
+
 struct Scale
 {
     __host__ __device__ Scale(float scale) : scale_(scale) {}
diff --git a/include/ck/utility/common_header.hpp b/include/ck/utility/common_header.hpp
index 41a9d0b58..f95660a8a 100644
--- a/include/ck/utility/common_header.hpp
+++ b/include/ck/utility/common_header.hpp
@@ -24,6 +24,7 @@
 #include "ck/utility/tuple.hpp"
 #include "ck/utility/tuple_helper.hpp"
 #include "ck/utility/type.hpp"
+#include "ck/utility/type_convert.hpp"
 #include "ck/utility/magic_division.hpp"
 #include "ck/utility/c_style_pointer_cast.hpp"
 #include "ck/utility/is_known_at_compile_time.hpp"
diff --git a/include/ck/utility/data_type.hpp b/include/ck/utility/data_type.hpp
index d43af8a2e..c240afa2b 100644
--- a/include/ck/utility/data_type.hpp
+++ b/include/ck/utility/data_type.hpp
@@ -12,6 +12,7 @@ using half_t  = _Float16;
 #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
 using int4_t = _BitInt(4);
 #endif
+using f8_t = uint8_t;
 
 // vector_type
 template <typename T, index_t N>
@@ -142,6 +143,13 @@ struct scalar_type<int4_t>
 };
 #endif
 
+template <>
+struct scalar_type<f8_t>
+{
+    using type                           = f8_t;
+    static constexpr index_t vector_size = 1;
+};
+
 //
 template <typename T>
 struct vector_type<T, 1>
@@ -944,151 +952,13 @@ using int8x16_t = typename vector_type<int8_t, 16>::type;
 using int8x32_t = typename vector_type<int8_t, 32>::type;
 using int8x64_t = typename vector_type<int8_t, 64>::type;
 
-// Convert X to Y
-template <typename Y, typename X>
-__host__ __device__ constexpr Y type_convert(X x)
-{
-    static_assert(!std::is_reference_v<Y> && !std::is_reference_v<X>);
-
-    return static_cast<Y>(x);
-}
-
-// convert bfp16 to fp32
-template <>
-inline __host__ __device__ constexpr float type_convert<float, bhalf_t>(bhalf_t x)
-{
-    union
-    {
-        uint32_t int32;
-        float fp32;
-    } u = {uint32_t(x) << 16};
-
-    return u.fp32;
-}
-
-// convert fp32 to bfp16
-template <>
-inline __host__ __device__ constexpr bhalf_t type_convert<bhalf_t, float>(float x)
-{
-    union
-    {
-        float fp32;
-        uint32_t int32;
-    } u = {x};
-
-    return uint16_t(u.int32 >> 16);
-}
-
-// convert bfp16 to fp16 via fp32
-template <>
-inline __host__ __device__ constexpr half_t type_convert<half_t, bhalf_t>(bhalf_t x)
-{
-    float x_fp32 = type_convert<float>(x);
-
-    return static_cast<half_t>(x_fp32);
-}
-
-// convert fp16 to bfp16 via fp32
-template <>
-inline __host__ __device__ constexpr bhalf_t type_convert<bhalf_t, half_t>(half_t x)
-{
-    float x_fp32 = static_cast<float>(x);
-
-    return type_convert<bhalf_t>(x_fp32);
-}
-
-// convert bfp16 to int32 via fp32
-template <>
-inline __host__ __device__ constexpr int32_t type_convert<int32_t, bhalf_t>(bhalf_t x)
-{
-    float x_fp32 = type_convert<float>(x);
-
-    return static_cast<int32_t>(x_fp32);
-}
-
-// convert int32 to bfp16 via fp32
-template <>
-inline __host__ __device__ constexpr bhalf_t type_convert<bhalf_t, int32_t>(int32_t x)
-{
-    float x_fp32 = static_cast<float>(x);
-
-    return type_convert<bhalf_t>(x_fp32);
-}
-
-// convert bfp16 to int8 via fp32
-template <>
-inline __host__ __device__ constexpr int8_t type_convert<int8_t, bhalf_t>(bhalf_t x)
-{
-    float x_fp32 = type_convert<float>(x);
-
-    return static_cast<int8_t>(x_fp32);
-}
-
-// convert int8 to bfp16 via fp32
-template <>
-inline __host__ __device__ constexpr bhalf_t type_convert<bhalf_t, int8_t>(int8_t x)
-{
-    float x_fp32 = static_cast<float>(x);
-
-    return type_convert<bhalf_t>(x_fp32);
-}
-
-// Declare a template function for bf16 conversion using RTN
-template <typename Y, typename X>
-__host__ __device__ constexpr Y bf16_convert_rtn(X x);
-
-// Convert fp32 to bf16 with RTN if higher precision is needed
-template <>
-inline __host__ __device__ constexpr bhalf_t bf16_convert_rtn<bhalf_t, float>(float x)
-{
-    union
-    {
-        float fp32;
-        uint32_t int32;
-    } u = {x};
-
-    // When the exponent bits are not all 1s, then the value is zero, normal,
-    // or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus
-    // 1 if the least significant bit of the bfloat16 mantissa is 1 (odd).
-    // This causes the bfloat16's mantissa to be incremented by 1 if the 16
-    // least significant bits of the float mantissa are greater than 0x8000,
-    // or if they are equal to 0x8000 and the least significant bit of the
-    // bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when
-    // the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already
-    // has the value 0x7f, then incrementing it causes it to become 0x00 and
-    // the exponent is incremented by one, which is the next higher FP value
-    // to the unrounded bfloat16 value. When the bfloat16 value is subnormal
-    // with an exponent of 0x00 and a mantissa of 0x7f, it may be rounded up
-    // to a normal value with an exponent of 0x01 and a mantissa of 0x00.
-    // When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F,
-    // incrementing it causes it to become an exponent of 0xFF and a mantissa
-    // of 0x00, which is Inf, the next higher value to the unrounded value.
-    bool flag0 = ~u.int32 & 0x7f800000;
-
-    // When all of the exponent bits are 1, the value is Inf or NaN.
-    // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero
-    // mantissa bit. Quiet NaN is indicated by the most significant mantissa
-    // bit being 1. Signaling NaN is indicated by the most significant
-    // mantissa bit being 0 but some other bit(s) being 1. If any of the
-    // lower 16 bits of the mantissa are 1, we set the least significant bit
-    // of the bfloat16 mantissa, in order to preserve signaling NaN in case
-    // the bfloat16's mantissa bits are all 0.
-    bool flag1 = !flag0 && (u.int32 & 0xffff);
-
-    u.int32 += flag0 ? 0x7fff + ((u.int32 >> 16) & 1) : 0; // Round to nearest, round to even
-    u.int32 |= flag1 ? 0x10000 : 0x0;                      // Preserve signaling NaN
-
-    return uint16_t(u.int32 >> 16);
-}
-
-// convert fp16 to bfp16 via fp32 with RTN if higher precision is needed
-template <>
-inline __host__ __device__ constexpr bhalf_t bf16_convert_rtn<bhalf_t, half_t>(half_t x)
-{
-    float x_fp32 = static_cast<float>(x);
-
-    return bf16_convert_rtn<bhalf_t>(x_fp32);
-}
+// f8
+using f8x2_t  = typename vector_type<f8_t, 2>::type;
+using f8x4_t  = typename vector_type<f8_t, 4>::type;
+using f8x8_t  = typename vector_type<f8_t, 8>::type;
+using f8x16_t = typename vector_type<f8_t, 16>::type;
+using f8x32_t = typename vector_type<f8_t, 32>::type;
+using f8x64_t = typename vector_type<f8_t, 64>::type;
 
 template <typename T>
 struct NumericLimits
@@ -1136,4 +1006,21 @@ struct NumericLimits<int4_t>
 };
 #endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
 
+template <>
+struct NumericLimits<f8_t>
+{
+    static constexpr uint8_t binary_min    = 0x08; // 0b00001000
+    static constexpr uint8_t binary_max    = 0x77; // 0b01110111
+    static constexpr uint8_t binary_lowest = 0xF7; // 0b11110111
+    static constexpr uint8_t binary_qnan   = 0x80; // 0b10000000
+    // NOTE(review): type_convert<f8_t>(240.0f) yields 0x7F; 0x77/0xF7 decode to +/-120 - confirm
+    __host__ __device__ static constexpr f8_t Min() { return bit_cast<f8_t>(binary_min); }
+
+    __host__ __device__ static constexpr f8_t Max() { return bit_cast<f8_t>(binary_max); }
+
+    __host__ __device__ static constexpr f8_t Lowest() { return bit_cast<f8_t>(binary_lowest); }
+
+    __host__ __device__ static constexpr f8_t QuietNaN() { return bit_cast<f8_t>(binary_qnan); }
+};
+
 } // namespace ck
diff --git a/include/ck/utility/f8_utils.hpp b/include/ck/utility/f8_utils.hpp
new file mode 100644
index 000000000..bb13f9815
--- /dev/null
+++ b/include/ck/utility/f8_utils.hpp
@@ -0,0 +1,250 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+
+namespace ck {
+
+// fp8 rounding modes
+// use standard for rounding to nearest, the faster one
+// use stochastic for stochastic rounding, helps to avoid error accumulation
+enum class f8_rounding_mode
+{
+    standard,
+    stochastic
+};
+
+} // namespace ck
+
+namespace ck::utils {
+
+namespace {
+
+template <typename T, bool negative_zero_nan, bool clip, bool stoch>
+__host__ __device__ f8_t run_cast_to_f8(T x, uint32_t rng)
+{
+    // encode x (half_t or float) as an f8 bit pattern; rng is used only when stoch is true
+    constexpr bool is_half  = std::is_same<T, half_t>::value;
+    constexpr bool is_float = std::is_same<T, float>::value;
+
+    // fp8 exponent/mantissa layout
+    constexpr int f8_exp  = 4;
+    constexpr int f8_mant = 3;
+
+    // source type exponent/mantissa layout
+    constexpr int type_exp  = is_half ? 5 : 8;
+    constexpr int type_mant = is_half ? 10 : 23;
+
+    int exponent;
+    uint32_t head, mantissa, sign;
+    // nan code is same for float and half
+    constexpr uint8_t nan_code  = 0x80;
+    constexpr uint32_t nan_mask = is_half ? 0x7C00 : 0x7F800000;
+
+    // convert to bitwise
+    typedef typename std::conditional<std::is_same<T, half_t>::value, uint16_t, uint32_t>::type
+        T_bitwise;
+    T_bitwise x_bitwise = *(reinterpret_cast<T_bitwise*>(&x));
+
+    // unpack the input, depends on datatype
+    if constexpr(is_float)
+    {
+        head     = x_bitwise & 0xFF800000;
+        mantissa = x_bitwise & 0x7FFFFF;
+        exponent = (head >> type_mant) & 0xFF;
+        sign     = head >> (type_exp + type_mant);
+    }
+    else if constexpr(is_half)
+    {
+        head     = x_bitwise & 0xFC00;
+        mantissa = x_bitwise & 0x3FF;
+        exponent = (head >> type_mant) & 0x1F;
+        sign     = head >> (type_exp + type_mant);
+    }
+    // Inf pattern in f8 layout: the return type is f8_t, T's layout would be truncated to 8 bits
+    uint32_t signed_inf   = (sign << (f8_exp + f8_mant)) + (((1 << f8_exp) - 1) << f8_mant);
+    uint32_t drop_mask    = (1 << (type_mant - f8_mant)) - 1;
+    constexpr int max_exp = (1 << f8_exp) - (negative_zero_nan ? 1 : 2);
+    constexpr int exp_low_cutoff =
+        (1 << (type_exp - 1)) - (1 << (f8_exp - 1)) + 1 - (negative_zero_nan ? 1 : 0);
+
+    if constexpr(negative_zero_nan)
+    {
+        if((x_bitwise & nan_mask) == nan_mask)
+            return nan_code;
+    }
+    else
+    {
+        if((x_bitwise & nan_mask) == nan_mask)
+            return signed_inf + (mantissa != 0 ? 1 : 0);
+    }
+
+    // check if x is 0.0
+    if(x_bitwise == 0)
+        return 0;
+
+    exponent -= exp_low_cutoff - 1;
+    if(exponent <= 0)
+    {
+        // subnormal result: more low bits are dropped. Clamp the width, since shifting
+        // by >= 32 is undefined and tiny float inputs request far more than that
+        const int drop_bits = type_mant - f8_mant + 1 - exponent;
+        drop_mask           = drop_bits < 32 ? (1u << drop_bits) - 1 : 0xFFFFFFFFu;
+    }
+    mantissa += 1 << type_mant;
+    // apply random number if needed
+    mantissa += (stoch ? rng : mantissa) & drop_mask;
+    if(mantissa >= (2 << type_mant))
+    {
+        mantissa >>= 1;
+        exponent++;
+    }
+    mantissa >>= (type_mant - f8_mant);
+
+    // check negative exponent
+    if(exponent <= 0)
+    {
+        // subnormal range; represented by a subnormal float8 (exponent 0)
+        // and involves loss of accuracy; x_bitwise == 0 has already returned above
+        mantissa = (1 - exponent) < 32 ? (mantissa >> (1 - exponent)) : 0;
+        exponent = 0;
+    }
+    // above range: quantize to maximum possible float of the same sign
+    else if(exponent > max_exp)
+    {
+        if(clip)
+        {
+            mantissa = (1 << f8_mant) - 1;
+            exponent = max_exp;
+        }
+        else
+        {
+            return signed_inf;
+        }
+    }
+
+    // check if x is 0.0 or -0.0
+    if(exponent == 0 && mantissa == 0)
+        return negative_zero_nan ? 0 : (sign << (f8_exp + f8_mant));
+    mantissa &= (1 << f8_mant) - 1;
+    return (sign << (f8_exp + f8_mant)) | (exponent << f8_mant) | mantissa;
+}
+
+template <typename T, bool negative_zero_nan>
+__host__ __device__ T run_cast_from_f8(f8_t x)
+{
+    // decode one f8 bit pattern into T; T is expected to be half_t or float
+    constexpr bool is_half  = std::is_same<T, half_t>::value;
+    constexpr bool is_float = std::is_same<T, float>::value;
+
+    // fp8 exponent/mantissa layout
+    constexpr int f8_exp  = 4;
+    constexpr int f8_mant = 3;
+
+    // resulting type exponent/mantissa layout
+    constexpr int type_exp  = is_half ? 5 : 8;
+    constexpr int type_mant = is_half ? 10 : 23;
+
+    // prepare the codes
+    constexpr uint8_t nan_code = 0x80;
+    T fInf, fNegInf, fNaN, fNeg0;
+    if constexpr(is_half)
+    {
+        constexpr uint16_t ihInf    = 0x7C00;
+        constexpr uint16_t ihNegInf = 0xFC00;
+        constexpr uint16_t ihNaN    = 0x7C01;
+        constexpr uint16_t ihNeg0   = 0x8000;
+        fInf                        = *(reinterpret_cast<const half_t*>(&ihInf));
+        fNegInf                     = *(reinterpret_cast<const half_t*>(&ihNegInf));
+        fNaN                        = *(reinterpret_cast<const half_t*>(&ihNaN));
+        fNeg0                       = *(reinterpret_cast<const half_t*>(&ihNeg0));
+    }
+    else if constexpr(is_float)
+    {
+        constexpr uint32_t ifInf    = 0x7F800000;
+        constexpr uint32_t ifNegInf = 0xFF800000;
+        constexpr uint32_t ifNaN    = 0x7F800001;
+        constexpr uint32_t ifNeg0   = 0x80000000;
+        fInf                        = *(reinterpret_cast<const float*>(&ifInf));
+        fNegInf                     = *(reinterpret_cast<const float*>(&ifNegInf));
+        fNaN                        = *(reinterpret_cast<const float*>(&ifNaN));
+        fNeg0                       = *(reinterpret_cast<const float*>(&ifNeg0));
+    }
+
+    // unpack the input
+    uint32_t sign     = x >> (f8_exp + f8_mant);
+    uint32_t mantissa = x & ((1 << f8_mant) - 1);
+    int exponent      = (x & 0x7F) >> f8_mant;
+
+    constexpr int exp_low_cutoff =
+        (1 << (type_exp - 1)) - (1 << (f8_exp - 1)) + 1 - (negative_zero_nan ? 1 : 0);
+    typename std::conditional<std::is_same<T, half_t>::value, uint16_t, uint32_t>::type retval;
+
+    if constexpr(negative_zero_nan)
+    {
+        if(x == nan_code)
+            return fNaN;
+    }
+    else
+    {
+        if(x == nan_code)
+            return fNeg0;
+        if(exponent == ((1 << f8_exp) - 1))
+            return (mantissa == 0) ? (sign ? fNegInf : fInf) : fNaN;
+    }
+
+    // subnormal input: shift the leading 1 up to the implicit-bit position
+    if(exponent == 0)
+    {
+        // mantissa != 0 here; clz counts in a 32-bit word, so the bias is 32 for every T
+        int sh = 1 + __builtin_clz(mantissa) - (32 - f8_mant);
+        mantissa <<= sh;
+        mantissa &= ((1 << f8_mant) - 1);
+        exponent += 1 - sh;
+    }
+    exponent += exp_low_cutoff - 1;
+    mantissa <<= type_mant - f8_mant;
+
+    // subnormal output: rebased exponent is <= 0, so denormalize into T's exponent-0 range
+    if(exponent <= 0)
+    {
+        mantissa |= 1 << type_mant;
+        mantissa >>= 1 - exponent;
+        exponent = 0;
+    }
+
+    retval = (sign << (type_exp + type_mant)) | (exponent << type_mant) | mantissa;
+    return *(reinterpret_cast<const T*>(&retval));
+}
+
+} // namespace
+
+template <typename T, bool negative_zero_nan, bool clip, bool stoch>
+__host__ __device__ f8_t cast_to_f8(T x, uint32_t rng)
+{
+    // check datatype
+    constexpr bool is_half  = std::is_same<T, half_t>::value;
+    constexpr bool is_float = std::is_same<T, float>::value;
+    static_assert(is_half || is_float, "Only half and float can be casted to f8.");
+
+    return run_cast_to_f8<T, negative_zero_nan, clip, stoch>(x, rng);
+}
+
+template <typename T, bool negative_zero_nan>
+__host__ __device__ T cast_from_f8(f8_t x)
+{
+    // check datatype
+    constexpr bool is_half  = std::is_same<T, half_t>::value;
+    constexpr bool is_float = std::is_same<T, float>::value;
+    static_assert(is_half || is_float, "only half and float are supported.");
+
+    // check if x is 0.0
+    if(x == 0)
+        return static_cast<T>(0);
+
+    return run_cast_from_f8<T, negative_zero_nan>(x);
+}
+
+} // namespace ck::utils
diff --git a/include/ck/utility/inner_product.hpp b/include/ck/utility/inner_product.hpp
index 7828d21d7..b13bccb5a 100644
--- a/include/ck/utility/inner_product.hpp
+++ b/include/ck/utility/inner_product.hpp
@@ -3,6 +3,7 @@
 
 #pragma once
 #include "data_type.hpp"
+#include "type_convert.hpp"
 
 namespace ck {
 
diff --git a/include/ck/utility/random_gen.hpp b/include/ck/utility/random_gen.hpp
new file mode 100644
index 000000000..b7edf2650
--- /dev/null
+++ b/include/ck/utility/random_gen.hpp
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+namespace ck {
+
+// Pseudo random number generator
+// version for fp32
+template <typename T, uint32_t seed_t, std::enable_if_t<std::is_same<float, T>{}, bool> = false>
+__host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed = seed_t)
+{
+    uint32_t x         = *(reinterpret_cast<uint32_t*>(&val));
+    uint32_t drop_bits = uint32_t(x) & 0xFFFFu;
+    drop_bits ^= x >> 16;
+    drop_bits = ((drop_bits & 31) << 11) | (drop_bits >> 5);
+    drop_bits *= 0x7000149;
+    // NOTE: If id is in 64 bit, we are only using lower 32 bit.
+    //       So, it can have an effect of using same id for multiple elements when the id is very
+    //       large!
+    uint32_t rng = (drop_bits ^ 0x13371337 ^ (id * 229791) ^ seed);
+    return rng;
+}
+
+// version for fp16
+template <typename T, uint32_t seed_t, std::enable_if_t<std::is_same<half_t, T>{}, bool> = false>
+__host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed = seed_t)
+{
+    uint16_t x         = *(reinterpret_cast<uint16_t*>(&val));
+    uint32_t drop_bits = uint32_t(x) & 0xFFFFu;
+    drop_bits          = ((drop_bits & 31) << 11) | (drop_bits >> 5);
+    drop_bits *= 0x7000149;
+    // NOTE: If id is in 64 bit, we are only using lower 32 bit.
+    //       So, it can have an effect of using same id for multiple elements when the id is very
+    //       large!
+    uint32_t rng = (drop_bits ^ 0x13371337 ^ (id * 229791) ^ seed);
+    return rng;
+}
+
+// return 0 if data is not fp16 or fp32
+template <typename T,
+          uint32_t seed_t,
+          std::enable_if_t<!(std::is_same<float, T>{} || std::is_same<half_t, T>{}), bool> = false>
+__host__ __device__ uint32_t prand_generator(int id, T val, uint32_t seed = seed_t)
+{
+    std::ignore = id;
+    std::ignore = val;
+    std::ignore = seed;
+
+    return 0;
+}
+
+} // namespace ck
diff --git a/include/ck/utility/reduction_operator.hpp b/include/ck/utility/reduction_operator.hpp
index 0f5b73cb0..36c25203e 100644
--- a/include/ck/utility/reduction_operator.hpp
+++ b/include/ck/utility/reduction_operator.hpp
@@ -6,6 +6,7 @@
 #include "ck/ck.hpp"
 #include "ck/utility/data_type.hpp"
 #include "ck/utility/type.hpp"
+#include "ck/utility/type_convert.hpp"
 
 namespace ck {
 
diff --git a/include/ck/utility/type_convert.hpp b/include/ck/utility/type_convert.hpp
new file mode 100644
index 000000000..ed8396493
--- /dev/null
+++ b/include/ck/utility/type_convert.hpp
@@ -0,0 +1,230 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+#include "ck/utility/f8_utils.hpp"
+#include "ck/utility/random_gen.hpp"
+
+namespace ck {
+
+// Convert X to Y
+template <typename Y, typename X>
+__host__ __device__ constexpr Y type_convert(X x)
+{
+    static_assert(!std::is_reference_v<Y> && !std::is_reference_v<X>);
+
+    return static_cast<Y>(x);
+}
+
+// convert bfp16 to fp32
+template <>
+inline __host__ __device__ constexpr float type_convert<float, bhalf_t>(bhalf_t x)
+{
+    union
+    {
+        uint32_t int32;
+        float fp32;
+    } u = {uint32_t(x) << 16};
+
+    return u.fp32;
+}
+
+// convert fp32 to bfp16
+template <>
+inline __host__ __device__ constexpr bhalf_t type_convert<bhalf_t, float>(float x)
+{
+    union
+    {
+        float fp32;
+        uint32_t int32;
+    } u = {x};
+
+    return uint16_t(u.int32 >> 16);
+}
+
+// convert bfp16 to fp16 via fp32
+template <>
+inline __host__ __device__ constexpr half_t type_convert<half_t, bhalf_t>(bhalf_t x)
+{
+    float x_fp32 = type_convert<float>(x);
+
+    return static_cast<half_t>(x_fp32);
+}
+
+// convert fp16 to bfp16 via fp32
+template <>
+inline __host__ __device__ constexpr bhalf_t type_convert<bhalf_t, half_t>(half_t x)
+{
+    float x_fp32 = static_cast<float>(x);
+
+    return type_convert<bhalf_t>(x_fp32);
+}
+
+// convert bfp16 to int32 via fp32
+template <>
+inline __host__ __device__ constexpr int32_t type_convert<int32_t, bhalf_t>(bhalf_t x)
+{
+    float x_fp32 = type_convert<float>(x);
+
+    return static_cast<int32_t>(x_fp32);
+}
+
+// convert int32 to bfp16 via fp32
+template <>
+inline __host__ __device__ constexpr bhalf_t type_convert<bhalf_t, int32_t>(int32_t x)
+{
+    float x_fp32 = static_cast<float>(x);
+
+    return type_convert<bhalf_t>(x_fp32);
+}
+
+// convert bfp16 to int8 via fp32
+template <>
+inline __host__ __device__ constexpr int8_t type_convert<int8_t, bhalf_t>(bhalf_t x)
+{
+    float x_fp32 = type_convert<float>(x);
+
+    return static_cast<int8_t>(x_fp32);
+}
+
+// convert int8 to bfp16 via fp32
+template <>
+inline __host__ __device__ constexpr bhalf_t type_convert<bhalf_t, int8_t>(int8_t x)
+{
+    float x_fp32 = static_cast<float>(x);
+
+    return type_convert<bhalf_t>(x_fp32);
+}
+
+// convert fp32 to fp8
+template <>
+inline __host__ __device__ f8_t type_convert<f8_t, float>(float x)
+{
+    constexpr bool negative_zero_nan = true;
+    constexpr bool clip              = true;
+    constexpr f8_rounding_mode rm    = f8_rounding_mode::standard;
+    constexpr uint32_t rng           = 0;
+    return utils::cast_to_f8<float, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
+        x, rng);
+}
+
+// convert fp8 to fp32
+template <>
+inline __host__ __device__ float type_convert<float, f8_t>(f8_t x)
+{
+    constexpr bool negative_zero_nan = true;
+    return utils::cast_from_f8<float, negative_zero_nan>(x);
+}
+
+// convert fp16 to fp8
+template <>
+inline __host__ __device__ f8_t type_convert<f8_t, half_t>(half_t x)
+{
+    constexpr bool negative_zero_nan = true;
+    constexpr bool clip              = true;
+    constexpr f8_rounding_mode rm    = f8_rounding_mode::standard;
+    constexpr uint32_t rng           = 0;
+    return utils::cast_to_f8<half_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
+        x, rng);
+}
+
+// convert fp8 to fp16
+template <>
+inline __host__ __device__ half_t type_convert<half_t, f8_t>(f8_t x)
+{
+    constexpr bool negative_zero_nan = true;
+    return utils::cast_from_f8<half_t, negative_zero_nan>(x);
+}
+
+// Declare a template function for bf16 conversion using RTN
+template <typename Y, typename X>
+__host__ __device__ constexpr Y bf16_convert_rtn(X x);
+
+// Convert fp32 to bf16 with RTN if higher precision is needed
+template <>
+inline __host__ __device__ constexpr bhalf_t bf16_convert_rtn<bhalf_t, float>(float x)
+{
+    union
+    {
+        float fp32;
+        uint32_t int32;
+    } u = {x};
+
+    // When the exponent bits are not all 1s, then the value is zero, normal,
+    // or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus
+    // 1 if the least significant bit of the bfloat16 mantissa is 1 (odd).
+    // This causes the bfloat16's mantissa to be incremented by 1 if the 16
+    // least significant bits of the float mantissa are greater than 0x8000,
+    // or if they are equal to 0x8000 and the least significant bit of the
+    // bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when
+    // the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already
+    // has the value 0x7f, then incrementing it causes it to become 0x00 and
+    // the exponent is incremented by one, which is the next higher FP value
+    // to the unrounded bfloat16 value. When the bfloat16 value is subnormal
+    // with an exponent of 0x00 and a mantissa of 0x7f, it may be rounded up
+    // to a normal value with an exponent of 0x01 and a mantissa of 0x00.
+    // When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F,
+    // incrementing it causes it to become an exponent of 0xFF and a mantissa
+    // of 0x00, which is Inf, the next higher value to the unrounded value.
+    bool flag0 = ~u.int32 & 0x7f800000;
+
+    // When all of the exponent bits are 1, the value is Inf or NaN.
+    // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero
+    // mantissa bit. Quiet NaN is indicated by the most significant mantissa
+    // bit being 1. Signaling NaN is indicated by the most significant
+    // mantissa bit being 0 but some other bit(s) being 1. If any of the
+    // lower 16 bits of the mantissa are 1, we set the least significant bit
+    // of the bfloat16 mantissa, in order to preserve signaling NaN in case
+    // the bfloat16's mantissa bits are all 0.
+    bool flag1 = !flag0 && (u.int32 & 0xffff);
+
+    u.int32 += flag0 ? 0x7fff + ((u.int32 >> 16) & 1) : 0; // Round to nearest, round to even
+    u.int32 |= flag1 ? 0x10000 : 0x0;                      // Preserve signaling NaN
+
+    return uint16_t(u.int32 >> 16);
+}
+
+// convert fp16 to bfp16 via fp32 with RTN if higher precision is needed
+template <>
+inline __host__ __device__ constexpr bhalf_t bf16_convert_rtn<bhalf_t, half_t>(half_t x)
+{
+    float x_fp32 = static_cast<float>(x);
+
+    return bf16_convert_rtn<bhalf_t>(x_fp32);
+}
+
+// Declare a template function for fp8 conversion using SR
+template <typename Y, typename X>
+__host__ __device__ constexpr Y f8_convert_sr(X x);
+
+// convert fp32 to fp8 with stochastic rounding
+template <>
+inline __host__ __device__ f8_t f8_convert_sr<f8_t, float>(float x)
+{
+    constexpr bool negative_zero_nan = true;
+    constexpr bool clip              = true;
+    constexpr f8_rounding_mode rm    = f8_rounding_mode::stochastic;
+    constexpr int seed               = 42;
+    // no thread id on host: the address of the by-value copy of x is passed as the PRNG id
+    uint32_t rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x);
+    return utils::cast_to_f8<float, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
+        x, rng);
+}
+
+// convert fp16 to fp8 with stochastic rounding
+template <>
+inline __host__ __device__ f8_t f8_convert_sr<f8_t, half_t>(half_t x)
+{
+    constexpr bool negative_zero_nan = true;
+    constexpr bool clip              = true;
+    constexpr f8_rounding_mode rm    = f8_rounding_mode::stochastic;
+    constexpr int seed               = 42;
+    // no thread id on host: the address of the by-value copy of x is passed as the PRNG id
+    uint32_t rng = prand_generator<half_t, seed>(reinterpret_cast<uintptr_t>(&x), x);
+    return utils::cast_to_f8<half_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
+        x, rng);
+}
+
+} // namespace ck
diff --git a/library/include/ck/library/utility/host_tensor.hpp b/library/include/ck/library/utility/host_tensor.hpp
index 91293d29f..816d83413 100644
--- a/library/include/ck/library/utility/host_tensor.hpp
+++ b/library/include/ck/library/utility/host_tensor.hpp
@@ -13,6 +13,7 @@
 
 #include "ck/utility/data_type.hpp"
 #include "ck/utility/span.hpp"
+#include "ck/utility/type_convert.hpp"
 
 #include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/ranges.hpp"
diff --git a/test/data_type/CMakeLists.txt b/test/data_type/CMakeLists.txt
index 088fbfec7..2b63727f1 100644
--- a/test/data_type/CMakeLists.txt
+++ b/test/data_type/CMakeLists.txt
@@ -2,3 +2,6 @@ if (USE_BITINT_EXTENSION_INT4)
   add_gtest_executable(test_int4 int4.cpp)
   target_link_libraries(test_int4 PRIVATE utility)
 endif()
+
+add_gtest_executable(test_fp8 fp8.cpp)
+target_link_libraries(test_fp8 PRIVATE utility)
diff --git a/test/data_type/fp8.cpp b/test/data_type/fp8.cpp
new file mode 100644
index 000000000..5004fe952
--- /dev/null
+++ b/test/data_type/fp8.cpp
@@ -0,0 +1,123 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "gtest/gtest.h"
+#include "ck/utility/data_type.hpp"
+#include "ck/utility/type_convert.hpp"
+
+using ck::f8_convert_sr;
+using ck::f8_t;
+using ck::half_t;
+using ck::type_convert;
+
+TEST(FP8, NumericLimits)
+{
+    EXPECT_EQ(ck::NumericLimits<f8_t>::Min(), 0x08); // each limit is checked against a hex literal
+    EXPECT_EQ(ck::NumericLimits<f8_t>::Max(), 0x77);
+    EXPECT_EQ(ck::NumericLimits<f8_t>::Lowest(), 0xF7); // 0x77 with the top bit set
+    EXPECT_EQ(ck::NumericLimits<f8_t>::QuietNaN(), 0x80); // only the top bit set
+}
+
+TEST(FP8, ConvertFP32Nearest)
+{
+    // absolute tolerance shared by every check below
+    float abs_tol = 1e-6;
+    // round-trip 0.0f through type_convert<f8_t> and back to float, expect it unchanged
+    ASSERT_NEAR(0.0f, type_convert<float>(type_convert<f8_t>(0.0f)), abs_tol);
+    // round-trip the smallest positive normal float; result must stay within abs_tol of it
+    ASSERT_NEAR(std::numeric_limits<float>::min(),
+                type_convert<float>(type_convert<f8_t>(std::numeric_limits<float>::min())),
+                abs_tol);
+    // round-trip 240.0f (the value the clipping check below treats as the f8_t maximum)
+    ASSERT_NEAR(240.0f, type_convert<float>(type_convert<f8_t>(240.0f)), abs_tol);
+    // convert the largest finite float and check the round-trip is clipped to 240.0
+    ASSERT_NEAR(240.0f,
+                type_convert<float>(type_convert<f8_t>(std::numeric_limits<float>::max())),
+                abs_tol);
+    // convert +inf to f8_t and check the raw result is 0x80, the f8_t QuietNaN pattern
+    ASSERT_NEAR(0x80, type_convert<f8_t>(std::numeric_limits<float>::infinity()), abs_tol);
+    // round-trip a positive power of two (2^-7), expect it back within abs_tol
+    float pos_float = 0.0078125f;
+    ASSERT_NEAR(pos_float, type_convert<float>(type_convert<f8_t>(pos_float)), abs_tol);
+    // round-trip a negative power of two (-2^-6), expect it back within abs_tol
+    float neg_float = -0.0156250f;
+    ASSERT_NEAR(neg_float, type_convert<float>(type_convert<f8_t>(neg_float)), abs_tol);
+}
+
+TEST(FP8, ConvertFP32Stochastic)
+{
+    // absolute tolerance shared by every check below
+    float abs_tol = 1e-6;
+    // round-trip 0.0f through f8_convert_sr and back to float, expect it unchanged
+    ASSERT_NEAR(0.0f, type_convert<float>(f8_convert_sr<f8_t>(0.0f)), abs_tol);
+    // round-trip the smallest positive normal float; result must stay within abs_tol of it
+    ASSERT_NEAR(std::numeric_limits<float>::min(),
+                type_convert<float>(f8_convert_sr<f8_t>(std::numeric_limits<float>::min())),
+                abs_tol);
+    // round-trip 240.0f (the value the clipping check below treats as the f8_t maximum)
+    ASSERT_NEAR(240.0f, type_convert<float>(f8_convert_sr<f8_t>(240.0f)), abs_tol);
+    // convert the largest finite float and check the round-trip is clipped to 240.0
+    ASSERT_NEAR(240.0f,
+                type_convert<float>(f8_convert_sr<f8_t>(std::numeric_limits<float>::max())),
+                abs_tol);
+    // convert +inf to f8_t and check the raw result is 0x80, the f8_t QuietNaN pattern
+    ASSERT_NEAR(0x80, f8_convert_sr<f8_t>(std::numeric_limits<float>::infinity()), abs_tol);
+    // round-trip a positive power of two (2^-7), expect it back within abs_tol
+    float pos_float = 0.0078125f;
+    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_sr<f8_t>(pos_float)), abs_tol);
+    // round-trip a negative power of two (-2^-6), expect it back within abs_tol
+    float neg_float = -0.0156250f;
+    ASSERT_NEAR(neg_float, type_convert<float>(f8_convert_sr<f8_t>(neg_float)), abs_tol);
+}
+
+TEST(FP8, ConvertFP16Nearest)
+{
+    // absolute tolerance shared by every check below
+    float abs_tol = 1e-3;
+    // round-trip fp16 zero through type_convert<f8_t> and back to fp16, expect it unchanged
+    ASSERT_NEAR(half_t{0.0}, type_convert<half_t>(type_convert<f8_t>(half_t{0.0})), abs_tol);
+    // round-trip NumericLimits<half_t>::Min(); result must stay within abs_tol of it
+    ASSERT_NEAR(ck::NumericLimits<half_t>::Min(),
+                type_convert<half_t>(type_convert<f8_t>(ck::NumericLimits<half_t>::Min())),
+                abs_tol);
+    // round-trip 240.0 (the value the clipping check below treats as the f8_t maximum)
+    ASSERT_NEAR(half_t{240.0}, type_convert<half_t>(type_convert<f8_t>(half_t{240.0})), abs_tol);
+    // convert NumericLimits<half_t>::Max() and check the round-trip is clipped to 240.0
+    ASSERT_NEAR(half_t{240.0},
+                type_convert<half_t>(type_convert<f8_t>(ck::NumericLimits<half_t>::Max())),
+                abs_tol);
+    // convert fp16 QuietNaN to f8_t and check the raw result is 0x80, the f8_t QuietNaN pattern
+    ASSERT_NEAR(0x80, type_convert<f8_t>(ck::NumericLimits<half_t>::QuietNaN()), abs_tol);
+    // round-trip a positive power of two (2^-7), expect it back within abs_tol
+    half_t pos_half = half_t{0.0078125};
+    ASSERT_NEAR(pos_half, type_convert<half_t>(type_convert<f8_t>(pos_half)), abs_tol);
+    // round-trip a negative power of two (-2^-6), expect it back within abs_tol
+    half_t neg_half = half_t{-0.0156250};
+    ASSERT_NEAR(neg_half, type_convert<half_t>(type_convert<f8_t>(neg_half)), abs_tol);
+}
+
+TEST(FP8, ConvertFP16Stochastic)
+{
+    // absolute tolerance shared by every check below
+    float abs_tol = 1e-3;
+    // round-trip fp16 zero through f8_convert_sr and back to fp16, expect it unchanged
+    ASSERT_NEAR(half_t{0.0}, type_convert<half_t>(f8_convert_sr<f8_t>(half_t{0.0})), abs_tol);
+    // round-trip NumericLimits<half_t>::Min(); result must stay within abs_tol of it
+    ASSERT_NEAR(ck::NumericLimits<half_t>::Min(),
+                type_convert<half_t>(f8_convert_sr<f8_t>(ck::NumericLimits<half_t>::Min())),
+                abs_tol);
+    // round-trip 240.0 (the value the clipping check below treats as the f8_t maximum)
+    ASSERT_NEAR(half_t{240.0}, type_convert<half_t>(f8_convert_sr<f8_t>(half_t{240.0})), abs_tol);
+    // convert NumericLimits<half_t>::Max() and check the round-trip is clipped to 240.0
+    ASSERT_NEAR(half_t{240.0},
+                type_convert<half_t>(f8_convert_sr<f8_t>(ck::NumericLimits<half_t>::Max())),
+                abs_tol);
+    // convert fp16 QuietNaN to f8_t and check the raw result is 0x80, the f8_t QuietNaN pattern
+    ASSERT_NEAR(0x80, f8_convert_sr<f8_t>(ck::NumericLimits<half_t>::QuietNaN()), abs_tol);
+    // round-trip a positive power of two (2^-7), expect it back within abs_tol
+    half_t pos_half = half_t{0.0078125};
+    ASSERT_NEAR(pos_half, type_convert<half_t>(f8_convert_sr<f8_t>(pos_half)), abs_tol);
+    // round-trip a negative power of two (-2^-6), expect it back within abs_tol
+    half_t neg_half = half_t{-0.0156250};
+    ASSERT_NEAR(neg_half, type_convert<half_t>(f8_convert_sr<f8_t>(neg_half)), abs_tol);
+}
-- 
GitLab


From 645eb2f2a08d1116b98710ec1b22ef4ae25b36b2 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Mon, 19 Jun 2023 16:55:03 -0700
Subject: [PATCH 67/71] do not build gemm-gemm and conv-conv examples for
 gfx94* (#761)

* do not build gemm-gemm and conv-conv examples for gfx94*

* do not build gemm-gemm and conv-conv examples on navi
---
 example/31_batched_gemm_gemm/CMakeLists.txt     | 8 ++------
 example/41_grouped_conv_conv_fwd/CMakeLists.txt | 8 ++------
 2 files changed, 4 insertions(+), 12 deletions(-)

diff --git a/example/31_batched_gemm_gemm/CMakeLists.txt b/example/31_batched_gemm_gemm/CMakeLists.txt
index f8d139275..83989bcd9 100644
--- a/example/31_batched_gemm_gemm/CMakeLists.txt
+++ b/example/31_batched_gemm_gemm/CMakeLists.txt
@@ -14,10 +14,6 @@ foreach(gpu IN LISTS GPU_TARGETS)
  endif()
 endforeach()
 
-set(target 0)
-foreach(gpu IN LISTS GPU_TARGETS)
- if(gpu IN_LIST gpu_list2 AND target EQUAL 0)
+if(NOT GPU_TARGETS MATCHES "gfx94" AND NOT GPU_TARGETS MATCHES "gfx1")
    add_example_executable(example_batched_gemm_gemm_xdl_int8 batched_gemm_gemm_xdl_int8.cpp)
-   set(target 1)
- endif()
-endforeach()
\ No newline at end of file
+endif()
diff --git a/example/41_grouped_conv_conv_fwd/CMakeLists.txt b/example/41_grouped_conv_conv_fwd/CMakeLists.txt
index 0c9df707b..ae251e88d 100644
--- a/example/41_grouped_conv_conv_fwd/CMakeLists.txt
+++ b/example/41_grouped_conv_conv_fwd/CMakeLists.txt
@@ -13,10 +13,6 @@ foreach(gpu IN LISTS GPU_TARGETS)
  endif()
 endforeach()
 
-set(target 0)
-foreach(gpu IN LISTS GPU_TARGETS)
- if(gpu IN_LIST gpu_list2 AND target EQUAL 0)
+if(NOT GPU_TARGETS MATCHES "gfx94" AND NOT GPU_TARGETS MATCHES "gfx1")
    add_example_executable(example_grouped_conv_conv_fwd_xdl_int8 grouped_conv_conv_fwd_xdl_int8.cpp)
-   set(target 1)
- endif()
-endforeach()
+endif()
-- 
GitLab


From 05ea6452b602c0e3d218042c63713a056aaeb6fe Mon Sep 17 00:00:00 2001
From: zjing14 <zhangjing14@gmail.com>
Date: Mon, 19 Jun 2023 19:24:18 -0700
Subject: [PATCH 68/71] changed pipeline v1 (#763)

---
 .../gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp        | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp
index 380199341..6c07a5c56 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp
@@ -76,7 +76,7 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
     // TODO: should be exposed as Tparams.
     static constexpr index_t NumGemmKPrefetchStage = 1;
     static constexpr LoopScheduler LoopSched       = make_default_loop_scheduler();
-    static constexpr PipelineVersion PipelineVer   = PipelineVersion::v2;
+    static constexpr PipelineVersion PipelineVer   = PipelineVersion::v1;
 
     using GridwiseGemm = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2<
         BlockSize,
-- 
GitLab


From 32d2f52bf70a4ef80e8399951150c4b3b848feeb Mon Sep 17 00:00:00 2001
From: ltqin <letao.qin@amd.com>
Date: Tue, 20 Jun 2023 10:25:08 +0800
Subject: [PATCH 69/71] remove useless comments (#760)

---
 .../device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp   | 2 --
 1 file changed, 2 deletions(-)

diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
index ca72bcdd2..73b12bfb4 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
@@ -786,12 +786,10 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
             if(arg.d0s_nl_ns_lengths_strides_[i][1] == 1 &&
                arg.d0s_nl_ns_lengths_strides_[i][0] % D0sTransferSrcScalarPerVector != 0)
             {
-                std::cout << "first" << std::endl;
                 return false;
             }
             if(arg.d0s_nl_ns_lengths_strides_[i][1] != 1 && D0sTransferSrcScalarPerVector != 1)
             {
-                std::cout << "second" << std::endl;
                 return false;
             }
         }
-- 
GitLab


From 63388e84ab71a08d6aa5b1476c0ba5d04be201e8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= <barkocot@amd.com>
Date: Wed, 21 Jun 2023 15:20:31 +0200
Subject: [PATCH 70/71] Support bf16/f32/f16 and NHWGC conv2d_bwd_data (#757)

* Support bf16/f32/f16 and NHWGC conv2d_bwd_data

* Add interface test

* clang format

* Comment fixes

* Add more friendly error message
---
 include/ck/ck.hpp                             |   4 +
 ...nv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp |  15 +-
 .../transform_conv_bwd_data_to_gemm_v1.hpp    |  66 ++++-
 .../gpu/grouped_convolution_backward_data.hpp |  99 +++++++
 .../grouped_conv2d_bwd_data/CMakeLists.txt    |   5 +
 ...ta_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp |  47 ++++
 ...ata_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp |  86 ++----
 ...ata_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp |  47 ++++
 ...e_grouped_conv2d_bwd_data_xdl_instance.hpp | 141 ++++++++++
 ...ta_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp |  47 ++++
 ...ata_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp |  47 ++++
 ...ata_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp |  47 ++++
 profiler/README.md                            |  38 +++
 .../profile_grouped_conv_bwd_data_impl.hpp    | 257 ++++++++++++++++++
 profiler/src/CMakeLists.txt                   |   2 +
 .../src/profile_grouped_conv_bwd_data.cpp     | 157 +++++++++++
 test/CMakeLists.txt                           |   1 +
 test/grouped_convnd_bwd_data/CMakeLists.txt   |   6 +
 .../test_grouped_convnd_bwd_data.cpp          |  78 ++++++
 ...test_grouped_convnd_bwd_data_interface.cpp | 178 ++++++++++++
 20 files changed, 1285 insertions(+), 83 deletions(-)
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/device_grouped_conv2d_bwd_data_xdl_instance.hpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
 create mode 100644 profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp
 create mode 100644 profiler/src/profile_grouped_conv_bwd_data.cpp
 create mode 100644 test/grouped_convnd_bwd_data/CMakeLists.txt
 create mode 100644 test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data.cpp
 create mode 100644 test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_interface.cpp

diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp
index d7ce449bb..161c3261a 100644
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -173,6 +173,10 @@
 
 // workaround: compiler issue on gfx908
 #define CK_WORKAROUND_SWDEV_388832 1
+
+// workaround: Grouped Conv2d_bwd_data fails for already implemented instance
+#define CK_WORKAROUND_SWDEV_3318619 0
+
 // flag to enable (1) or disable (0) the debugging output in some kernels
 #define DEBUG_LOG 0
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
index 81a4d6927..95af93251 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
@@ -459,7 +459,6 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
               p_ds_grid_{},
               p_e_grid_{static_cast<EDataType*>(p_e)},
               num_group_{a_g_n_k_wos_lengths[0]},
-              num_gemm_{},
               a_element_op_{a_element_op},
               b_element_op_{b_element_op},
               cde_element_op_{cde_element_op},
@@ -508,9 +507,6 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
             const auto YTilde = ConvStrideH / GcdStrideDilationH;
             const auto XTilde = ConvStrideW / GcdStrideDilationW;
 
-            // number of GEMM
-            num_gemm_ = YTilde * XTilde;
-
             for(index_t i_ytilde = 0; i_ytilde < YTilde; ++i_ytilde)
             {
                 for(index_t i_xtilde = 0; i_xtilde < XTilde; ++i_xtilde)
@@ -626,7 +622,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
 
         void Print() const
         {
-            for(index_t i = 0; i < num_gemm_; i++)
+            for(std::size_t i = 0; i < a_grid_desc_ak0_m_ak1_container_.size(); i++)
             {
                 std::cout << "a_grid_desc_ak0_m_ak1_container_"
                           << a_grid_desc_ak0_m_ak1_container_[i] << std::endl;
@@ -654,7 +650,6 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
 
         // tensor descriptor for problem definition
         index_t num_group_;
-        index_t num_gemm_;
         std::vector<AGridDesc_M_K> a_grid_desc_m_k_container_;
         std::vector<BGridDesc_N_K> b_grid_desc_n_k_container_;
         std::vector<DsGridDesc_M_N> ds_grid_desc_m_n_container_;
@@ -708,7 +703,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
 
             float ave_time = 0;
 
-            for(index_t i = 0; i < arg.num_gemm_; i++)
+            for(std::size_t i = 0; i < arg.a_grid_desc_ak0_m_ak1_container_.size(); i++)
             {
                 if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_container_[i],
                                                 arg.b_grid_desc_n_k_container_[i],
@@ -807,7 +802,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
         }
 
         // vector load for A matrix from global memory to LDS
-        if constexpr(is_same_v<ALayout, tensor_layout::convolution::GNHWK>)
+        if constexpr(is_same_v<ALayout, tensor_layout::convolution::GNHWK> ||
+                     is_same_v<ALayout, tensor_layout::convolution::NHWGK>)
         {
             if(!(ABlockTransferSrcVectorDim == 2 && ConvK % ABlockTransferSrcScalarPerVector == 0))
             {
@@ -862,7 +858,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
         }
 
         // vector store for E
-        if constexpr(is_same_v<ELayout, tensor_layout::convolution::GNHWC>)
+        if constexpr(is_same_v<ELayout, tensor_layout::convolution::GNHWC> ||
+                     is_same_v<ELayout, tensor_layout::convolution::NHWGC>)
         {
             // vector store C matrix into global memory
             if(!(ConvC % CDEBlockTransferScalarPerVector_NPerBlock == 0))
diff --git a/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp b/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp
index 505ed33d5..59f64bde7 100644
--- a/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp
+++ b/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp
@@ -13,6 +13,61 @@
 namespace ck {
 namespace tensor_operation {
 
+namespace {
+template <
+    index_t NDimSpatial,
+    typename ALayout,
+    ck::tensor_operation::device::ConvolutionBackwardDataSpecialization ConvBwdDataSpecialization>
+constexpr auto
+make_out_n_ho_wo_k_grid_desc(const index_t N,
+                             const index_t Ho,
+                             const index_t Wo,
+                             const index_t K,
+                             const std::array<index_t, NDimSpatial + 3>& out_g_n_k_wos_strides)
+{
+    // NHWGK reads strides from out_g_n_k_wos_strides; GNHWK is built packed; others throw
+    if constexpr(is_same_v<ALayout, tensor_layout::convolution::NHWGK>)
+    {
+        const index_t NStride  = out_g_n_k_wos_strides[1];
+        const index_t HiStride = out_g_n_k_wos_strides[3]; // applied to the Ho dimension below
+        const index_t WiStride = out_g_n_k_wos_strides[4]; // applied to the Wo dimension below
+        const auto CStride     = Number<1>{}; // applied to the K dimension below
+        if constexpr(ConvBwdDataSpecialization ==
+                     ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::
+                         Filter1x1Stride1Pad0)
+        {
+            // N, Ho and Wo collapse into one dimension that is stepped by WiStride
+            return make_naive_tensor_descriptor(make_tuple(N * Ho * Wo, K),
+                                                make_tuple(WiStride, CStride));
+        }
+        else
+        {
+            return make_naive_tensor_descriptor(make_tuple(N, Ho, Wo, K),
+                                                make_tuple(NStride, HiStride, WiStride, CStride));
+        }
+    }
+    else if constexpr(is_same_v<ALayout, tensor_layout::convolution::GNHWK>)
+    {
+        // assume packed: out_g_n_k_wos_strides is not consulted for GNHWK
+        if constexpr(ConvBwdDataSpecialization ==
+                     ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::
+                         Filter1x1Stride1Pad0)
+        {
+            return make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K));
+        }
+        else
+        {
+            return make_naive_tensor_descriptor_packed(make_tuple(N, Ho, Wo, K));
+        }
+    }
+    else
+    {
+        throw std::runtime_error("wrong! unsupported layout: " + ALayout::name());
+    }
+}
+
+} // namespace
+
 template <
     index_t NDimSpatial,
     ck::tensor_operation::device::ConvolutionBackwardDataSpecialization ConvBwdDataSpecialization,
@@ -29,11 +84,12 @@ struct TransformConvBwdDataToGemm_v1
 
     template <typename ALayout,
               typename std::enable_if<NDimSpatial == 2 &&
-                                          is_same_v<ALayout, tensor_layout::convolution::GNHWK>,
+                                          (is_same_v<ALayout, tensor_layout::convolution::GNHWK> ||
+                                           is_same_v<ALayout, tensor_layout::convolution::NHWGK>),
                                       bool>::type = false>
     static auto MakeADescriptor_AK0_M_AK1(
         const std::array<index_t, NDimSpatial + 3>& out_g_n_k_wos_lengths,
-        const std::array<index_t, NDimSpatial + 3>& /* out_g_n_k_wos_strides */,
+        const std::array<index_t, NDimSpatial + 3>& out_g_n_k_wos_strides,
         const std::array<index_t, NDimSpatial + 3>& wei_g_k_c_xs_lengths,
         const std::array<index_t, NDimSpatial + 3>& /* wei_g_k_c_xs_strides */,
         const std::array<index_t, NDimSpatial + 3>& in_g_n_c_wis_lengths,
@@ -70,9 +126,9 @@ struct TransformConvBwdDataToGemm_v1
 
         const index_t AK0 = K / AK1;
 
-        // assume packed
         const auto out_n_ho_wo_k_grid_desc =
-            make_naive_tensor_descriptor_packed(make_tuple(N, Ho, Wo, K));
+            make_out_n_ho_wo_k_grid_desc<NDimSpatial, ALayout, ConvBwdDataSpecialization>(
+                N, Ho, Wo, K, out_g_n_k_wos_strides);
 
         if constexpr(ConvBwdDataSpecialization ==
                      ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::
@@ -80,7 +136,7 @@ struct TransformConvBwdDataToGemm_v1
         {
             // A: output tensor
             const auto out_gemmak0_gemmmraw_gemmak1_grid_desc = transform_tensor_descriptor(
-                make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)),
+                out_n_ho_wo_k_grid_desc,
                 make_tuple(make_pass_through_transform(N * Ho * Wo),
                            make_unmerge_transform(make_tuple(AK0, AK1))),
                 make_tuple(Sequence<0>{}, Sequence<1>{}),
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp
index fadfd1995..0b20e19a2 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp
@@ -30,6 +30,76 @@ void add_device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instances(
                                                                   PassThrough,
                                                                   PassThrough>>>& instances);
 
+void add_device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  GNHWK,
+                                                                  GKYXC,
+                                                                  Empty_Tuple,
+                                                                  GNHWC,
+                                                                  F32,
+                                                                  F32,
+                                                                  Empty_Tuple,
+                                                                  F32,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  GNHWK,
+                                                                  GKYXC,
+                                                                  Empty_Tuple,
+                                                                  GNHWC,
+                                                                  BF16,
+                                                                  BF16,
+                                                                  Empty_Tuple,
+                                                                  BF16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  NHWGK,
+                                                                  GKYXC,
+                                                                  Empty_Tuple,
+                                                                  NHWGC,
+                                                                  F16,
+                                                                  F16,
+                                                                  Empty_Tuple,
+                                                                  F16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  NHWGK,
+                                                                  GKYXC,
+                                                                  Empty_Tuple,
+                                                                  NHWGC,
+                                                                  F32,
+                                                                  F32,
+                                                                  Empty_Tuple,
+                                                                  F32,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  NHWGK,
+                                                                  GKYXC,
+                                                                  Empty_Tuple,
+                                                                  NHWGC,
+                                                                  BF16,
+                                                                  BF16,
+                                                                  Empty_Tuple,
+                                                                  BF16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances);
+
 template <ck::index_t NumDimSpatial,
           typename OutLayout,
           typename WeiLayout,
@@ -78,6 +148,35 @@ struct DeviceOperationInstanceFactory<
             {
                 add_device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instances(op_ptrs);
             }
+            else if constexpr(is_same_v<InDataType, F32> && is_same_v<WeiDataType, F32> &&
+                              is_same_v<OutDataType, F32>)
+            {
+                add_device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f32_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<InDataType, BF16> && is_same_v<WeiDataType, BF16> &&
+                              is_same_v<OutDataType, BF16>)
+            {
+                add_device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_bf16_instances(op_ptrs);
+            }
+        }
+        else if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, NHWGC> &&
+                          is_same_v<WeiLayout, GKYXC> && is_same_v<OutLayout, NHWGK>)
+        {
+            if constexpr(is_same_v<InDataType, F16> && is_same_v<WeiDataType, F16> &&
+                         is_same_v<OutDataType, F16>)
+            {
+                add_device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<InDataType, F32> && is_same_v<WeiDataType, F32> &&
+                              is_same_v<OutDataType, F32>)
+            {
+                add_device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<InDataType, BF16> && is_same_v<WeiDataType, BF16> &&
+                              is_same_v<OutDataType, BF16>)
+            {
+                add_device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_instances(op_ptrs);
+            }
         }
 
         return op_ptrs;
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/CMakeLists.txt
index 3b2968d48..85ec0f55a 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/CMakeLists.txt
@@ -1,3 +1,8 @@
 add_instance_library(device_grouped_conv2d_bwd_data_instance
    device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
+   device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp
+   device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
+   device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
+   device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
+   device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
 )
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp
new file mode 100644
index 000000000..01cd23b2e
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "device_grouped_conv2d_bwd_data_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for out[g, n, ho, wo, k] * wei[g, k, y, x, c] = in[g, n, hi, wi, c]
+void add_device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  GNHWK,
+                                                                  GKYXC,
+                                                                  Empty_Tuple,
+                                                                  GNHWC,
+                                                                  BF16,
+                                                                  BF16,
+                                                                  Empty_Tuple,
+                                                                  BF16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    // 1. Append the BF16 instance tuple built with the ConvBwdDataDefault specialization
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv2d_bwd_data_xdl_bf16_instances<GNHWK,
+                                                          GKYXC,
+                                                          Empty_Tuple,
+                                                          GNHWC,
+                                                          ConvBwdDataDefault>{});
+    // 2. Append the BF16 instance tuple built with ConvBwdDataFilter1x1Stride1Pad0
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv2d_bwd_data_xdl_bf16_instances<GNHWK,
+                                                          GKYXC,
+                                                          Empty_Tuple,
+                                                          GNHWC,
+                                                          ConvBwdDataFilter1x1Stride1Pad0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
index 11babea28..bb0fd36e4 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
@@ -1,80 +1,14 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "device_grouped_conv2d_bwd_data_xdl_instance.hpp"
 
 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
-
-using F16 = ck::half_t;
-using F32 = float;
-
-using Empty_Tuple = ck::Tuple<>;
-
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using GNHWC = ck::tensor_layout::convolution::GNHWC;
-using GKYXC = ck::tensor_layout::convolution::GKYXC;
-using GNHWK = ck::tensor_layout::convolution::GNHWK;
-
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-static constexpr auto ConvBwdDataDefault =
-    ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default;
-
-static constexpr auto ConvBwdDataFilter1x1Stride1Pad0 =
-    ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0;
-
-using device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instances = std::tuple<
-    // clang-format off
-        // 1. Default
-        // ##############################################|    NDim| ALayout| BLayout|    DsLayout| ELayout| AData| BData| AccData| CShuffle|      DsData| EData| AElementwise| BElementwise| CDEElementwise| ConvolutionBackward| DoPad| DoPad|      NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer|    MXdl|    NXdl|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffleMXdl| CShuffleNXdl|   CDEBlockTransfer| CDEBlockTransfer|
-        // ##############################################| Spatial|        |        |            |        |  Type|  Type|    Type| DataType|        Type|  Type|    Operation|    Operation|      Operation|  DataSpecialization| GemmM| GemmN| PrefetchStage|  Size| Block| Block| Block|    |    |  XDL|  XDL| PerWave| PerWave|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|      PerWave|      PerWave|  _MBlock_MPerBlock|  ScalarPerVector|
-        // ##############################################|        |        |        |            |        |      |      |        |         |            |      |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          |   PerShuffle|   PerShuffle|  _NBlock_NPerBlock|       _NPerBlock|
-        // ##############################################|        |        |        |            |        |      |      |        |         |            |      |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |             |             |                   |                 |
-        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, Empty_Tuple,   GNHWC,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataDefault,  true,  true,             1,   256,   256,   128,    32,   8,   8,   32,   32,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8>,
-        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, Empty_Tuple,   GNHWC,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataDefault,  true,  true,             1,   256,   128,   256,    32,   8,   8,   32,   32,       2,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8>,
-        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, Empty_Tuple,   GNHWC,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataDefault,  true,  true,             1,   128,   128,   128,    32,   8,   8,   32,   32,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 8>,                8>,
-        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, Empty_Tuple,   GNHWC,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataDefault,  true,  true,             1,   256,   128,   128,    32,   8,   8,   32,   32,       2,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8>,
-        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, Empty_Tuple,   GNHWC,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataDefault,  true,  true,             1,   128,   128,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 4>,                8>,
-        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, Empty_Tuple,   GNHWC,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataDefault,  true,  true,             1,   128,    64,   128,    32,   8,   8,   32,   32,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 8>,                8>,
-        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, Empty_Tuple,   GNHWC,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataDefault,  true,  true,             1,    64,    64,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                8>,
-        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, Empty_Tuple,   GNHWC,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataDefault,  true,  true,             1,   256,   128,    64,    32,   8,   8,   32,   32,       2,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8>,
-        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, Empty_Tuple,   GNHWC,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataDefault,  true,  true,             1,   256,    64,   128,    32,   8,   8,   32,   32,       1,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8>,
-        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, Empty_Tuple,   GNHWC,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataDefault,  true,  true,             1,   128,   128,    32,    32,   8,   8,   32,   32,       2,       1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 4>,                8>,
-        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, Empty_Tuple,   GNHWC,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataDefault,  true,  true,             1,   128,    32,   128,    32,   8,   8,   32,   32,       1,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 8>,                8>,
-        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, Empty_Tuple,   GNHWC,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataDefault,  true,  true,             1,    64,    64,    32,    32,   8,   8,   32,   32,       2,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                8>,
-        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, Empty_Tuple,   GNHWC,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataDefault,  true,  true,             1,    64,    32,    64,    32,   8,   8,   32,   32,       1,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                8>,
-
-        // 2. Filter1x1Stride1Pad0
-        // ##############################################|    NDim| ALayout| BLayout|    DsLayout| ELayout| AData| BData| AccData| CShuffle|      DsData| EData| AElementwise| BElementwise| CDEElementwise|              ConvolutionBackward| DoPad| DoPad|      NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer|    MXdl|    NXdl|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffleMXdl| CShuffleNXdl|   CDEBlockTransfer| CDEBlockTransfer|
-        // ##############################################| Spatial|        |        |            |        |  Type|  Type|    Type| DataType|        Type|  Type|    Operation|    Operation|      Operation|               DataSpecialization| GemmM| GemmN| PrefetchStage|  Size| Block| Block| Block|    |    |  XDL|  XDL| PerWave| PerWave|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|      PerWave|      PerWave|  _MBlock_MPerBlock|  ScalarPerVector|
-        // ##############################################|        |        |        |            |        |      |      |        |         |            |      |             |             |               |                                 |      |      |              |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          |   PerShuffle|   PerShuffle|  _NBlock_NPerBlock|       _NPerBlock|
-        // ##############################################|        |        |        |            |        |      |      |        |         |            |      |             |             |               |                                 |      |      |              |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |             |             |                   |                 |
-        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, Empty_Tuple,   GNHWC,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,  true,  true,             1,   256,   256,   128,    32,   8,   8,   32,   32,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8>,
-        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, Empty_Tuple,   GNHWC,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,  true,  true,             1,   256,   128,   256,    32,   8,   8,   32,   32,       2,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8>,
-        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, Empty_Tuple,   GNHWC,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,  true,  true,             1,   128,   128,   128,    32,   8,   8,   32,   32,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 8>,                8>,
-        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, Empty_Tuple,   GNHWC,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,  true,  true,             1,   256,   128,   128,    32,   8,   8,   32,   32,       2,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8>,
-        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, Empty_Tuple,   GNHWC,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,  true,  true,             1,   128,   128,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 4>,                8>,
-        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, Empty_Tuple,   GNHWC,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,  true,  true,             1,   128,    64,   128,    32,   8,   8,   32,   32,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 8>,                8>,
-        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, Empty_Tuple,   GNHWC,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,  true,  true,             1,    64,    64,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                8>,
-        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, Empty_Tuple,   GNHWC,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,  true,  true,             1,   256,   128,    64,    32,   8,   8,   32,   32,       2,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8>,
-        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, Empty_Tuple,   GNHWC,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,  true,  true,             1,   256,    64,   128,    32,   8,   8,   32,   32,       1,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8>,
-        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, Empty_Tuple,   GNHWC,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,  true,  true,             1,   128,   128,    32,    32,   8,   8,   32,   32,       2,       1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 4>,                8>,
-        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, Empty_Tuple,   GNHWC,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,  true,  true,             1,   128,    32,   128,    32,   8,   8,   32,   32,       1,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 8>,                8>,
-        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, Empty_Tuple,   GNHWC,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,  true,  true,             1,    64,    64,    32,    32,   8,   8,   32,   32,       2,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                8>,
-        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, Empty_Tuple,   GNHWC,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,  true,  true,             1,    64,    32,    64,    32,   8,   8,   32,   32,       1,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                8>
-    // clang-format on
-    >;
-
+// Compilation parameters for out[g, n, ho, wo, k] * wei[g, k, y, x, c] = in[g, n, hi, wi, c]
 void add_device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
                                                                   GNHWK,
@@ -89,8 +23,22 @@ void add_device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instances(
                                                                   PassThrough,
                                                                   PassThrough>>>& instances)
 {
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv2d_bwd_data_xdl_f16_instances<GNHWK,
+                                                         GKYXC,
+                                                         Empty_Tuple,
+                                                         GNHWC,
+                                                         ConvBwdDataDefault>{});
+    // 2. Filter1x1Stride1Pad0
     add_device_operation_instances(
-        instances, device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instances{});
+        instances,
+        device_grouped_conv2d_bwd_data_xdl_f16_instances<GNHWK,
+                                                         GKYXC,
+                                                         Empty_Tuple,
+                                                         GNHWC,
+                                                         ConvBwdDataFilter1x1Stride1Pad0>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
new file mode 100644
index 000000000..b671017f5
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "device_grouped_conv2d_bwd_data_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for out[g, n, ho, wo, k] * wei[g, k, y, x, c] = in[g, n, hi, wi, c]
+void add_device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  GNHWK,
+                                                                  GKYXC,
+                                                                  Empty_Tuple,
+                                                                  GNHWC,
+                                                                  F32,
+                                                                  F32,
+                                                                  Empty_Tuple,
+                                                                  F32,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    // 1. Append the F32 instance tuple built with the ConvBwdDataDefault specialization
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv2d_bwd_data_xdl_f32_instances<GNHWK,
+                                                         GKYXC,
+                                                         Empty_Tuple,
+                                                         GNHWC,
+                                                         ConvBwdDataDefault>{});
+    // 2. Append the F32 instance tuple built with ConvBwdDataFilter1x1Stride1Pad0
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv2d_bwd_data_xdl_f32_instances<GNHWK,
+                                                         GKYXC,
+                                                         Empty_Tuple,
+                                                         GNHWC,
+                                                         ConvBwdDataFilter1x1Stride1Pad0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/device_grouped_conv2d_bwd_data_xdl_instance.hpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/device_grouped_conv2d_bwd_data_xdl_instance.hpp
new file mode 100644
index 000000000..bfe6e47d1
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/device_grouped_conv2d_bwd_data_xdl_instance.hpp
@@ -0,0 +1,141 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using BF16 = ck::bhalf_t;
+using F16  = ck::half_t;
+using F32  = float;
+// Empty type list; passed as the DsData argument of every instance row below (no D tensor types).
+using Empty_Tuple = ck::Tuple<>;
+// Shorthand for compile-time integer sequences, e.g. S<4, 64, 1> in the instance tables below.
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+// Layout tags ending in C; GNHWC appears in the ELayout column of the disabled f16 rows below.
+using NHWGC = ck::tensor_layout::convolution::NHWGC;
+using GNHWC = ck::tensor_layout::convolution::GNHWC;
+// Layout tag that appears in the BLayout column of the disabled f16 rows below.
+using GKYXC = ck::tensor_layout::convolution::GKYXC;
+// Layout tags ending in K; GNHWK appears in the ALayout column of the disabled f16 rows below.
+using NHWGK = ck::tensor_layout::convolution::NHWGK;
+using GNHWK = ck::tensor_layout::convolution::GNHWK;
+// Element-wise op used in the AElementwise, BElementwise and CDEElementwise slots of every instance row below.
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+// Shorthands for two ConvolutionBackwardDataSpecialization enumerators, usable as the ConvSpec template argument below.
+static constexpr auto ConvBwdDataDefault = ConvolutionBackwardDataSpecialization::Default;
+
+static constexpr auto ConvBwdDataFilter1x1Stride1Pad0 =
+    ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0;
+
+// f16_f16_f32_f16: instance list for 2D grouped conv backward-data (XDL) with F16 A/B/E data, F32 accumulation and F16 CShuffle; layouts and ConvSpec come from the template parameters.
+template <typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionBackwardDataSpecialization ConvSpec>
+using device_grouped_conv2d_bwd_data_xdl_f16_instances = std::tuple<
+    // clang-format off
+        // ##############################################|    NDim| ALayout| BLayout|    DsLayout| ELayout| AData| BData| AccData| CShuffle|      DsData| EData| AElementwise| BElementwise| CDEElementwise| ConvolutionBackward| DoPad| DoPad|      NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer|    MXdl|    NXdl|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffleMXdl| CShuffleNXdl|   CDEBlockTransfer| CDEBlockTransfer|
+        // ##############################################| Spatial|        |        |            |        |  Type|  Type|    Type| DataType|        Type|  Type|    Operation|    Operation|      Operation|  DataSpecialization| GemmM| GemmN| PrefetchStage|  Size| Block| Block| Block|    |    |  XDL|  XDL| PerWave| PerWave|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|      PerWave|      PerWave|  _MBlock_MPerBlock|  ScalarPerVector|
+        // ##############################################|        |        |        |            |        |      |      |        |         |            |      |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          |   PerShuffle|   PerShuffle|  _NBlock_NPerBlock|       _NPerBlock|
+        // ##############################################|        |        |        |            |        |      |      |        |         |            |      |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |             |             |                   |                 |
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    256,   128,   256,    32,   8,   2,   32,   32,       2,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,            1,            1,     S<1, 32, 1, 8>,                8>
+        // NOTE: every row inside the #ifdef below is commented out, so this tuple holds exactly the one instance above whether or not CK_WORKAROUND_SWDEV_3318619 is defined. Before re-enabling: add a comma after the row above, drop the trailing comma of the last row, and replace the hard-coded GNHWK/GKYXC/Empty_Tuple/GNHWC/ConvBwdDataDefault with ALayout/BLayout/DsLayout/ELayout/ConvSpec.
+#ifdef CK_WORKAROUND_SWDEV_3318619
+        // DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, Empty_Tuple,   GNHWC,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataDefault,  true,  true,             1,   256,   256,   128,    32,   8,   8,   32,   32,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8>,
+        // DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, Empty_Tuple,   GNHWC,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataDefault,  true,  true,             1,   256,   128,   256,    32,   8,   8,   32,   32,       2,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8>,
+        // DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, Empty_Tuple,   GNHWC,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataDefault,  true,  true,             1,   128,   128,   128,    32,   8,   8,   32,   32,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 8>,                8>,
+        // DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, Empty_Tuple,   GNHWC,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataDefault,  true,  true,             1,   256,   128,   128,    32,   8,   8,   32,   32,       2,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8>,
+        // DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, Empty_Tuple,   GNHWC,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataDefault,  true,  true,             1,   128,   128,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 4>,                8>,
+        // DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, Empty_Tuple,   GNHWC,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataDefault,  true,  true,             1,   128,    64,   128,    32,   8,   8,   32,   32,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 8>,                8>,
+        // DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, Empty_Tuple,   GNHWC,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataDefault,  true,  true,             1,    64,    64,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                8>,
+        // DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, Empty_Tuple,   GNHWC,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataDefault,  true,  true,             1,   256,   128,    64,    32,   8,   8,   32,   32,       2,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8>,
+        // DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, Empty_Tuple,   GNHWC,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataDefault,  true,  true,             1,   256,    64,   128,    32,   8,   8,   32,   32,       1,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8>,
+        // DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, Empty_Tuple,   GNHWC,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataDefault,  true,  true,             1,   128,   128,    32,    32,   8,   8,   32,   32,       2,       1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 4>,                8>,
+        // DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, Empty_Tuple,   GNHWC,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataDefault,  true,  true,             1,   128,    32,   128,    32,   8,   8,   32,   32,       1,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 8>,                8>,
+        // DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, Empty_Tuple,   GNHWC,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataDefault,  true,  true,             1,    64,    64,    32,    32,   8,   8,   32,   32,       2,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                8>,
+        // DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, Empty_Tuple,   GNHWC,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataDefault,  true,  true,             1,    64,    32,    64,    32,   8,   8,   32,   32,       1,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                8>,
+#endif
+    // clang-format on
+    >;
+
+// bf16_bf16_f32_bf16: instance list for 2D grouped conv backward-data (XDL) with BF16 A/B/E data, F32 accumulation and BF16 CShuffle; layouts and ConvSpec come from the template parameters.
+template <typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionBackwardDataSpecialization ConvSpec>
+using device_grouped_conv2d_bwd_data_xdl_bf16_instances = std::tuple<
+    // clang-format off
+        // ##############################################|    NDim| ALayout| BLayout|    DsLayout| ELayout| AData| BData| AccData| CShuffle|      DsData| EData| AElementwise| BElementwise| CDEElementwise| ConvolutionBackward| DoPad| DoPad|      NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer|    MXdl|    NXdl|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffleMXdl| CShuffleNXdl|   CDEBlockTransfer| CDEBlockTransfer|
+        // ##############################################| Spatial|        |        |            |        |  Type|  Type|    Type| DataType|        Type|  Type|    Operation|    Operation|      Operation|  DataSpecialization| GemmM| GemmN| PrefetchStage|  Size| Block| Block| Block|    |    |  XDL|  XDL| PerWave| PerWave|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|      PerWave|      PerWave|  _MBlock_MPerBlock|  ScalarPerVector|
+        // ##############################################|        |        |        |            |        |      |      |        |         |            |      |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          |   PerShuffle|   PerShuffle|  _NBlock_NPerBlock|       _NPerBlock|
+        // ##############################################|        |        |        |            |        |      |      |        |         |            |      |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |             |             |                   |                 |
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2, ALayout, BLayout,    DsLayout, ELayout,   BF16,   BF16,     F32,      BF16, Empty_Tuple,   BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   128,   256,    32,   8,   2,   32,   32,       2,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,            1,            1,     S<1, 32, 1, 8>,                8>
+        // NOTE: every row inside the #ifdef below is commented out, so this tuple holds exactly the one instance above whether or not CK_WORKAROUND_SWDEV_3318619 is defined. Before re-enabling those rows, add a comma after the row above (it currently ends the template argument list).
+#ifdef CK_WORKAROUND_SWDEV_3318619
+        //     DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2, ALayout, BLayout,    DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   256,   128,    32,   8,   8,   32,   32,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8>,
+        //     DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2, ALayout, BLayout,    DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   128,   256,    32,   8,   8,   32,   32,       2,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8>,
+        //     DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2, ALayout, BLayout,    DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,   128,   128,    32,   8,   8,   32,   32,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 8>,                8>,
+        //     DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2, ALayout, BLayout,    DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   128,   128,    32,   8,   8,   32,   32,       2,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8>,
+        //     DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2, ALayout, BLayout,    DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,   128,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 4>,                8>,
+        //     DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2, ALayout, BLayout,    DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,    64,   128,    32,   8,   8,   32,   32,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 8>,                8>,
+        //     DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2, ALayout, BLayout,    DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                8>,
+        //     DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2, ALayout, BLayout,    DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   128,    64,    32,   8,   8,   32,   32,       2,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8>,
+        //     DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2, ALayout, BLayout,    DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,    64,   128,    32,   8,   8,   32,   32,       1,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8>,
+        //     DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2, ALayout, BLayout,    DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,   128,    32,    32,   8,   8,   32,   32,       2,       1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 4>,                8>,
+        //     DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2, ALayout, BLayout,    DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,    32,   128,    32,   8,   8,   32,   32,       1,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 8>,                8>,
+        //     DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2, ALayout, BLayout,    DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    32,    32,   8,   8,   32,   32,       2,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                8>,
+        //     DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2, ALayout, BLayout,    DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    32,    64,    32,   8,   8,   32,   32,       1,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                8>
+#endif
+    // clang-format on
+    >;
+
+// f32_f32_f32_f32
+template <typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionBackwardDataSpecialization ConvSpec>
+using device_grouped_conv2d_bwd_data_xdl_f32_instances = std::tuple<
+    // clang-format off
+        // ##############################################|    NDim| ALayout| BLayout|    DsLayout| ELayout| AData| BData| AccData| CShuffle|      DsData| EData| AElementwise| BElementwise| CDEElementwise| ConvolutionBackward| DoPad| DoPad|      NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer|    MXdl|    NXdl|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffleMXdl| CShuffleNXdl|   CDEBlockTransfer| CDEBlockTransfer|
+        // ##############################################| Spatial|        |        |            |        |  Type|  Type|    Type| DataType|        Type|  Type|    Operation|    Operation|      Operation|  DataSpecialization| GemmM| GemmN| PrefetchStage|  Size| Block| Block| Block|    |    |  XDL|  XDL| PerWave| PerWave|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|      PerWave|      PerWave|  _MBlock_MPerBlock|  ScalarPerVector|
+        // ##############################################|        |        |        |            |        |      |      |        |         |            |      |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          |   PerShuffle|   PerShuffle|  _NBlock_NPerBlock|       _NPerBlock|
+        // ##############################################|        |        |        |            |        |      |      |        |         |            |      |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |             |             |                   |                 |
+         DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2, ALayout, BLayout,    DsLayout, ELayout,  F32,  F32,      F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   128,   256,    32,   8,   2,   32,   32,       2,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,            1,            1,     S<1, 32, 1, 8>,                4>
+
+#ifdef CK_WORKAROUND_SWDEV_3318619
+        // DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   256,   128,    32,   8,   8,   32,   32,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              4,              4,         1,            1,            1,     S<1, 32, 1, 8>,                4>,
+        // DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   128,   256,    32,   8,   8,   32,   32,       2,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              4,              4,         1,            1,            1,     S<1, 32, 1, 8>,                4>,
+        // DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,   128,   128,    32,   8,   8,   32,   32,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              4,              4,         1,            1,            1,     S<1, 16, 1, 8>,                4>,
+        // DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   128,   128,    32,   8,   8,   32,   32,       2,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              4,              4,         1,            1,            1,     S<1, 32, 1, 8>,                4>,
+        // DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,   128,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              4,              4,         1,            1,            1,     S<1, 32, 1, 4>,                4>,
+        // DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,    64,   128,    32,   8,   8,   32,   32,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              4,              4,         1,            1,            1,     S<1, 16, 1, 8>,                4>,
+        // DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              4,              4,         1,            1,            1,     S<1, 16, 1, 4>,                4>,
+        // DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   128,    64,    32,   8,   8,   32,   32,       2,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              4,              4,         1,            1,            1,     S<1, 32, 1, 8>,                4>,
+        // DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,    64,   128,    32,   8,   8,   32,   32,       1,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              4,              4,         1,            1,            1,     S<1, 32, 1, 8>,                4>,
+        // DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,   128,    32,    32,   8,   8,   32,   32,       2,       1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              4,              4,         1,            1,            1,     S<1, 32, 1, 4>,                4>,
+        // DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,    32,   128,    32,   8,   8,   32,   32,       1,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              4,              4,         1,            1,            1,     S<1, 16, 1, 8>,                4>,
+        // DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    32,    32,   8,   8,   32,   32,       2,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              4,              4,         1,            1,            1,     S<1, 16, 1, 4>,                4>,
+        // DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    32,    64,    32,   8,   8,   32,   32,       1,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                8>,
+#endif
+    // clang-format on
+    >;
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
new file mode 100644
index 000000000..0b1fa63dd
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "device_grouped_conv2d_bwd_data_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for out[n, ho, wo, g, k] * wei[g, k, y, x, c] = in[n, hi, wi, g, c]
+void add_device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  NHWGK,
+                                                                  GKYXC,
+                                                                  Empty_Tuple,
+                                                                  NHWGC,
+                                                                  BF16,
+                                                                  BF16,
+                                                                  Empty_Tuple,
+                                                                  BF16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    // 1. Default: BF16 instance tuple specialized with ConvBwdDataDefault
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv2d_bwd_data_xdl_bf16_instances<NHWGK,
+                                                          GKYXC,
+                                                          Empty_Tuple,
+                                                          NHWGC,
+                                                          ConvBwdDataDefault>{});
+    // 2. Filter1x1Stride1Pad0: same tuple specialized with ConvBwdDataFilter1x1Stride1Pad0
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv2d_bwd_data_xdl_bf16_instances<NHWGK,
+                                                          GKYXC,
+                                                          Empty_Tuple,
+                                                          NHWGC,
+                                                          ConvBwdDataFilter1x1Stride1Pad0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
new file mode 100644
index 000000000..1c8fc57e1
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "device_grouped_conv2d_bwd_data_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for out[n, ho, wo, g, k] * wei[g, k, y, x, c] = in[n, hi, wi, g, c]
+void add_device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  NHWGK,
+                                                                  GKYXC,
+                                                                  Empty_Tuple,
+                                                                  NHWGC,
+                                                                  F16,
+                                                                  F16,
+                                                                  Empty_Tuple,
+                                                                  F16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    // 1. Default: F16 instance tuple specialized with ConvBwdDataDefault
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv2d_bwd_data_xdl_f16_instances<NHWGK,
+                                                         GKYXC,
+                                                         Empty_Tuple,
+                                                         NHWGC,
+                                                         ConvBwdDataDefault>{});
+    // 2. Filter1x1Stride1Pad0: same tuple specialized with ConvBwdDataFilter1x1Stride1Pad0
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv2d_bwd_data_xdl_f16_instances<NHWGK,
+                                                         GKYXC,
+                                                         Empty_Tuple,
+                                                         NHWGC,
+                                                         ConvBwdDataFilter1x1Stride1Pad0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
new file mode 100644
index 000000000..fcdfe8d42
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "device_grouped_conv2d_bwd_data_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for out[n, ho, wo, g, k] * wei[g, k, y, x, c] = in[n, hi, wi, g, c]
+void add_device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  NHWGK,
+                                                                  GKYXC,
+                                                                  Empty_Tuple,
+                                                                  NHWGC,
+                                                                  F32,
+                                                                  F32,
+                                                                  Empty_Tuple,
+                                                                  F32,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    // 1. Default: F32 instance tuple specialized with ConvBwdDataDefault
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv2d_bwd_data_xdl_f32_instances<NHWGK,
+                                                         GKYXC,
+                                                         Empty_Tuple,
+                                                         NHWGC,
+                                                         ConvBwdDataDefault>{});
+    // 2. Filter1x1Stride1Pad0: same tuple specialized with ConvBwdDataFilter1x1Stride1Pad0
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv2d_bwd_data_xdl_f32_instances<NHWGK,
+                                                         GKYXC,
+                                                         Empty_Tuple,
+                                                         NHWGC,
+                                                         ConvBwdDataFilter1x1Stride1Pad0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/profiler/README.md b/profiler/README.md
index 7a1fb2911..3ee039cb3 100644
--- a/profiler/README.md
+++ b/profiler/README.md
@@ -102,4 +102,44 @@ arg.b_grid_desc_k0_n0_n1_k1_{2048, 4096, 2}
 arg.e_grid_desc_m_n_{ 4096, 4096}
 ....
 Best Perf: 58.0306 ms, 37.8942 TFlops, 27.7545 GB/s
+```
+
+## Profile grouped convolution backward data kernels
+```bash
+# arg1: tensor operation (grouped_conv_bwd_data: Grouped Convolution Backward Data)
+# arg2: data type (0: Output fp32, Weight fp32, Input fp32
+#                  1: Output fp16, Weight fp16, Input fp16
+#                  2: Output bf16, Weight bf16, Input bf16)
+# arg3: tensor layout (0: Output[G, N, Ho, Wo, K], Weight[G, K, Y, X, C], Input[G, N, Hi, Wi, C]
+#                      1: Output[N, Ho, Wo, G, K], Weight[G, K, Y, X, C], Input[N, Hi, Wi, G, C])
+# arg4: verification (0: no, 1: yes)
+# arg5: initialization (0: no init, 1: integer value, 2: decimal value)
+# arg6: print tensor value (0: no; 1: yes)
+# arg7: time kernel (0: no, 1: yes)
+# Following arguments (depending on number of spatial dims):
+#  Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)
+#  G, N, K, C,
+#  <filter spatial dimensions>, (i.e. Y, X for 2D)
+#  <input image spatial dimensions>, (i.e. Hi, Wi for 2D)
+#  <strides>, (i.e. Sy, Sx for 2D)
+#  <dilations>, (i.e. Dy, Dx for 2D)
+#  <left padding>, (i.e. LeftPy, LeftPx for 2D)
+#  <right padding>, (i.e. RightPy, RightPx for 2D)
+
+ ################                   op   datatype  layout  verify  init  log  time  Ndims  G  N   K   C  Y  X  Hi  Wi  Sy  Sx  Dy  Dx  LeftPy  LeftPx  RightPy  RightPx
+./bin/ckProfiler grouped_conv_bwd_data          1       0       1     1    0     1      2 32  4 192 192  3  3  28  28   1   1   1   1       1       1        1        1
+
+ ```
+
+Result (MI100, FP16, GNHWC_GKYXC_GNHWK)
+```
+out: dim 5, lengths {32, 4, 192, 28, 28}, strides {602112, 150528, 1, 5376, 192}
+wei: dim 5, lengths {32, 192, 192, 3, 3}, strides {331776, 1728, 1, 576, 192}
+in: dim 5, lengths {32, 4, 192, 28, 28}, strides {602112, 150528, 1, 5376, 192}
+....
+Best configuration parameters:
+name: DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 2, Default, 32, 32, 2, 4, 8, 4, 1, 1>
+avg_time: 0.768321
+tflops: 86.6679
+GB/s: 127.947
 ```
diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp
new file mode 100644
index 000000000..93d3430bb
--- /dev/null
+++ b/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp
@@ -0,0 +1,257 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp"
+
+namespace ck {
+namespace profiler {
+// Runs every registered grouped conv bwd-data instance on conv_param; false if any check fails.
+template <ck::index_t NDimSpatial,
+          typename OutLayout,
+          typename WeiLayout,
+          typename InLayout,
+          typename OutDataType,
+          typename WeiDataType,
+          typename InDataType>
+bool profile_grouped_conv_bwd_data_impl(int do_verification,
+                                        int init_method,
+                                        bool do_log,
+                                        bool time_kernel,
+                                        const ck::utils::conv::ConvParam& conv_param)
+{
+    using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
+    using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
+    using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
+
+    const auto out_element_op = OutElementOp{};
+    const auto wei_element_op = WeiElementOp{};
+    const auto in_element_op  = InElementOp{};
+
+    const auto out_g_n_k_wos_desc =
+        ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
+
+    const auto wei_g_k_c_xs_desc =
+        ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
+
+    const auto in_g_n_c_wis_desc =
+        ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
+
+    Tensor<OutDataType> out(out_g_n_k_wos_desc);
+    Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
+    Tensor<InDataType> in_host(in_g_n_c_wis_desc);
+    Tensor<InDataType> in_device(in_g_n_c_wis_desc);
+
+    std::cout << "out: " << out.mDesc << std::endl;
+    std::cout << "wei: " << wei.mDesc << std::endl;
+    std::cout << "in: " << in_host.mDesc << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        out.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
+        wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
+        break;
+    case 2:
+        out.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
+        wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
+        break;
+    default:
+        out.GenerateTensorValue(GeneratorTensor_1<OutDataType>{1});
+        wei.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{1});
+    }
+
+    DeviceMem out_device_buf(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize());
+    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
+    DeviceMem in_device_buf(sizeof(InDataType) * in_device.mDesc.GetElementSpaceSize());
+
+    out_device_buf.ToDevice(out.mData.data());
+    wei_device_buf.ToDevice(wei.mData.data());
+
+    // start from a zeroed device `in` buffer (the tensor that is read back and verified below)
+    in_device_buf.SetZero();
+
+    if(do_verification)
+    {
+        auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdData<NDimSpatial,
+                                                                         InDataType,
+                                                                         WeiDataType,
+                                                                         OutDataType,
+                                                                         InElementOp,
+                                                                         WeiElementOp,
+                                                                         OutElementOp>();
+
+        auto ref_invoker = ref_conv.MakeInvoker();
+
+        in_host.SetZero();
+
+        auto ref_argument = ref_conv.MakeArgument(in_host,
+                                                  wei,
+                                                  out,
+                                                  conv_param.conv_filter_strides_,
+                                                  conv_param.conv_filter_dilations_,
+                                                  conv_param.input_left_pads_,
+                                                  conv_param.input_right_pads_,
+                                                  out_element_op,
+                                                  wei_element_op,
+                                                  in_element_op);
+
+        ref_invoker.Run(ref_argument);
+    }
+
+    std::string best_op_name;
+    float best_avg_time   = 0;
+    float best_tflops     = 0;
+    float best_gb_per_sec = 0;
+
+    // profile device op instances
+    bool pass = true;
+
+    auto run_impl = [&](auto& op_ptr, auto& argument_ptr) {
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            // re-zero the device `in` buffer so every kernel starts from the same state
+            in_device_buf.SetZero();
+
+            std::string op_name = op_ptr->GetTypeString();
+
+            auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+            float avg_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+
+            std::size_t flop      = conv_param.GetFlops();
+            std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
+            // avg_time is reported in ms below, so flop / 1e9 / ms gives TFlops
+            float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
+            // likewise bytes / 1e6 / ms gives GB/s
+            float gb_per_sec = num_btype / 1.E6 / avg_time;
+
+            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
+                      << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+            if(tflops > best_tflops)
+            {
+                best_op_name    = op_name;
+                best_tflops     = tflops;
+                best_avg_time   = avg_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+
+            if(do_verification)
+            {
+                in_device_buf.FromDevice(in_device.mData.data());
+
+                pass = pass & ck::utils::check_err(in_device, in_host);
+
+                if(do_log)
+                {
+                    LogRangeAsType<float>(std::cout << "output : ", out.mData, ",") << std::endl;
+                    LogRangeAsType<float>(std::cout << "weight: ", wei.mData, ",") << std::endl;
+                    LogRangeAsType<float>(std::cout << "in_host  : ", in_host.mData, ",")
+                        << std::endl;
+                    LogRangeAsType<float>(std::cout << "in_device: ", in_device.mData, ",")
+                        << std::endl;
+                }
+            }
+        }
+        else
+        {
+            std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
+        }
+    };
+
+    // device-op interface whose registered instances are profiled below
+    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD<NDimSpatial,
+                                                                                     OutLayout,
+                                                                                     WeiLayout,
+                                                                                     ck::Tuple<>,
+                                                                                     InLayout,
+                                                                                     OutDataType,
+                                                                                     WeiDataType,
+                                                                                     ck::Tuple<>,
+                                                                                     InDataType,
+                                                                                     OutElementOp,
+                                                                                     WeiElementOp,
+                                                                                     InElementOp>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::array<ck::index_t, NDimSpatial + 3> out_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> out_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> wei_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> wei_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> in_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> in_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
+    std::array<ck::index_t, NDimSpatial> input_left_pads{};
+    std::array<ck::index_t, NDimSpatial> input_right_pads{};
+    // copy descriptor/parameter ranges into the fixed-size arrays passed to MakeArgumentPointer
+    auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
+
+    copy(out_g_n_k_wos_desc.GetLengths(), out_lengths);
+    copy(out_g_n_k_wos_desc.GetStrides(), out_strides);
+    copy(wei_g_k_c_xs_desc.GetLengths(), wei_lengths);
+    copy(wei_g_k_c_xs_desc.GetStrides(), wei_strides);
+    copy(in_g_n_c_wis_desc.GetLengths(), in_lengths);
+    copy(in_g_n_c_wis_desc.GetStrides(), in_strides);
+    copy(conv_param.conv_filter_strides_, conv_filter_strides);
+    copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
+    copy(conv_param.input_left_pads_, input_left_pads);
+    copy(conv_param.input_right_pads_, input_right_pads);
+
+    for(auto& op_ptr : op_ptrs)
+    {
+        auto argument_ptr =
+            op_ptr->MakeArgumentPointer(static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+                                        static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
+                                        {},
+                                        static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
+                                        out_lengths,
+                                        out_strides,
+                                        wei_lengths,
+                                        wei_strides,
+                                        {},
+                                        {},
+                                        in_lengths,
+                                        in_strides,
+                                        conv_filter_strides,
+                                        conv_filter_dilations,
+                                        input_left_pads,
+                                        input_right_pads,
+                                        out_element_op,
+                                        wei_element_op,
+                                        in_element_op);
+
+        run_impl(op_ptr, argument_ptr);
+    }
+
+    std::cout << "Best configuration parameters:"
+              << "\nname: " << best_op_name << "\navg_time: " << best_avg_time
+              << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl;
+
+    return pass;
+}
+
+} // namespace profiler
+} // namespace ck
diff --git a/profiler/src/CMakeLists.txt b/profiler/src/CMakeLists.txt
index 6f768e0ae..3f19096f4 100644
--- a/profiler/src/CMakeLists.txt
+++ b/profiler/src/CMakeLists.txt
@@ -35,6 +35,7 @@ set(PROFILER_SOURCES
     profile_contraction_bilinear.cpp
     profile_contraction_scale.cpp
     profile_batched_gemm_multi_d.cpp
+    profile_grouped_conv_bwd_data.cpp
 )
 
 set(PROFILER_EXECUTABLE ckProfiler)
@@ -79,4 +80,5 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_scale_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool_fwd_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_multi_d_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_data_instance)
 rocm_install(TARGETS ${PROFILER_EXECUTABLE} COMPONENT profiler)
diff --git a/profiler/src/profile_grouped_conv_bwd_data.cpp b/profiler/src/profile_grouped_conv_bwd_data.cpp
new file mode 100644
index 000000000..351bb72c8
--- /dev/null
+++ b/profiler/src/profile_grouped_conv_bwd_data.cpp
@@ -0,0 +1,157 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "profiler/profile_grouped_conv_bwd_data_impl.hpp"
+#include "profiler_operation_registry.hpp"
+
+namespace {
+
+enum struct ConvLayout
+{
+    GNHWC_GKYXC_GNHWK, // 0: input GNHWC, weight GKYXC, output GNHWK
+    NHWGC_GKYXC_NHWGK, // 1: input NHWGC, weight GKYXC, output NHWGK
+};
+
+enum struct ConvDataType
+{
+    F32_F32_F32,    // 0: all three tensors fp32
+    F16_F16_F16,    // 1: all three tensors fp16
+    BF16_BF16_BF16, // 2: all three tensors bf16
+};
+
+#define OP_NAME "grouped_conv_bwd_data"
+#define OP_DESC "Grouped Convolution Backward Data"
+// Prints the command-line usage of this profiler operation to stdout.
+static void print_helper_msg()
+{
+    std::cout
+        // clang-format off
+        << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
+        << "arg2: data type (0: Output fp32, Weight fp32, Input fp32\n"
+        << "                 1: Output fp16, Weight fp16, Input fp16\n"
+        << "                 2: Output bf16, Weight bf16, Input bf16)\n"
+        << "arg3: tensor layout (0: Output[G, N, Ho, Wo, K], Weight[G, K, Y, X, C], Input[G, N, Hi, Wi, C]\n"
+        << "                     1: Output[N, Ho, Wo, G, K], Weight[G, K, Y, X, C], Input[N, Hi, Wi, G, C])\n"
+        << "arg4: verification (0: no, 1: yes)\n"
+        << "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n"
+        << "arg6: print tensor value (0: no; 1: yes)\n"
+        << "arg7: time kernel (0: no, 1: yes)\n"
+        << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
+    // clang-format on
+}
+
+} // namespace
+// Profiler entry point for OP_NAME: returns 0 when profiling passes, 1 on bad usage or failure.
+int profile_grouped_conv_bwd_data(int argc, char* argv[])
+{
+    // 8 for control, 1 for num_dim_spatial
+    if(argc < 9)
+    {
+        print_helper_msg();
+        return 1;
+    }
+
+    const auto data_type       = static_cast<ConvDataType>(std::stoi(argv[2]));
+    const auto layout          = static_cast<ConvLayout>(std::stoi(argv[3]));
+    const bool do_verification = std::stoi(argv[4]);
+    const int init_method      = std::stoi(argv[5]);
+    const bool do_log          = std::stoi(argv[6]);
+    const bool time_kernel     = std::stoi(argv[7]);
+    const int num_dim_spatial  = std::stoi(argv[8]);
+
+    // 8 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial
+    if(argc != 8 + 1 + 4 + 6 * num_dim_spatial)
+    {
+        print_helper_msg();
+        return 1;
+    }
+
+    const auto params = ck::utils::conv::parse_conv_param(num_dim_spatial, 9, argv);
+
+    using F32  = float;
+    using F16  = ck::half_t;
+    using BF16 = ck::bhalf_t;
+
+    using GNHWC = ck::tensor_layout::convolution::GNHWC;
+    using NHWGC = ck::tensor_layout::convolution::NHWGC;
+
+    using GKYXC = ck::tensor_layout::convolution::GKYXC;
+
+    using GNHWK = ck::tensor_layout::convolution::GNHWK;
+    using NHWGK = ck::tensor_layout::convolution::NHWGK;
+
+    constexpr auto I2 = ck::Number<2>{};
+    // tag dispatch: arguments are (ndim, out/wei/in layout tags, out/wei/in data-type tags)
+    auto profile = [&](auto num_dim_spatial_tmp,
+                       auto out_layout,
+                       auto wei_layout,
+                       auto in_layout,
+                       auto out_type,
+                       auto wei_type,
+                       auto in_type) {
+        constexpr ck::index_t NDimSpatial = num_dim_spatial_tmp.value;
+
+        using OutLayout = decltype(out_layout);
+        using WeiLayout = decltype(wei_layout);
+        using InLayout  = decltype(in_layout);
+
+        using OutDataType = decltype(out_type);
+        using WeiDataType = decltype(wei_type);
+        using InDataType  = decltype(in_type);
+
+        bool pass = ck::profiler::profile_grouped_conv_bwd_data_impl<NDimSpatial,
+                                                                     OutLayout,
+                                                                     WeiLayout,
+                                                                     InLayout,
+                                                                     OutDataType,
+                                                                     WeiDataType,
+                                                                     InDataType>(
+            do_verification, init_method, do_log, time_kernel, params);
+
+        return pass ? 0 : 1;
+    };
+
+    // GNHWC_GKYXC_GNHWK
+    if(num_dim_spatial == 2 && layout == ConvLayout::GNHWC_GKYXC_GNHWK)
+    {
+        if(data_type == ConvDataType::F32_F32_F32)
+        {
+            return profile(I2, GNHWK{}, GKYXC{}, GNHWC{}, F32{}, F32{}, F32{});
+        }
+        else if(data_type == ConvDataType::F16_F16_F16)
+        {
+            return profile(I2, GNHWK{}, GKYXC{}, GNHWC{}, F16{}, F16{}, F16{});
+        }
+        else if(data_type == ConvDataType::BF16_BF16_BF16)
+        {
+            return profile(I2, GNHWK{}, GKYXC{}, GNHWC{}, BF16{}, BF16{}, BF16{});
+        }
+    }
+    // NHWGC_GKYXC_NHWGK
+    else if(num_dim_spatial == 2 && layout == ConvLayout::NHWGC_GKYXC_NHWGK)
+    {
+        if(data_type == ConvDataType::F32_F32_F32)
+        {
+            return profile(I2, NHWGK{}, GKYXC{}, NHWGC{}, F32{}, F32{}, F32{});
+        }
+        else if(data_type == ConvDataType::F16_F16_F16)
+        {
+            return profile(I2, NHWGK{}, GKYXC{}, NHWGC{}, F16{}, F16{}, F16{});
+        }
+        else if(data_type == ConvDataType::BF16_BF16_BF16)
+        {
+            return profile(I2, NHWGK{}, GKYXC{}, NHWGC{}, BF16{}, BF16{}, BF16{});
+        }
+    }
+
+    std::cout << "this data_type & layout is not implemented" << std::endl;
+
+    return 1;
+}
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_grouped_conv_bwd_data);
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index e3385b9dd..ad08e9470 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -59,6 +59,7 @@ add_subdirectory(batchnorm)
 add_subdirectory(contraction)
 add_subdirectory(pool_fwd)
 add_subdirectory(batched_gemm_multi_d)
+add_subdirectory(grouped_convnd_bwd_data)
 if(GPU_TARGETS MATCHES "gfx1100")
     add_subdirectory(wmma_op)
 endif()
diff --git a/test/grouped_convnd_bwd_data/CMakeLists.txt b/test/grouped_convnd_bwd_data/CMakeLists.txt
new file mode 100644
index 000000000..d7c9f4655
--- /dev/null
+++ b/test/grouped_convnd_bwd_data/CMakeLists.txt
@@ -0,0 +1,6 @@
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+    add_gtest_executable(test_grouped_convnd_bwd_data test_grouped_convnd_bwd_data.cpp)
+    target_link_libraries(test_grouped_convnd_bwd_data PRIVATE utility device_grouped_conv2d_bwd_data_instance)
+    add_gtest_executable(test_grouped_convnd_bwd_data_interface test_grouped_convnd_bwd_data_interface.cpp)
+    target_link_libraries(test_grouped_convnd_bwd_data_interface PRIVATE utility device_grouped_conv2d_bwd_data_instance)
+endif() # gfx908 / gfx90a / gfx940
diff --git a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data.cpp b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data.cpp
new file mode 100644
index 000000000..94808669a
--- /dev/null
+++ b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data.cpp
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+#include <iostream>
+#include <initializer_list>
+#include <tuple>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "profiler/profile_grouped_conv_bwd_data_impl.hpp"
+
+template <typename Tuple>
+class TestGroupedConvndBwdData : public ::testing::Test
+{
+    protected:
+    using DataType  = std::tuple_element_t<0, Tuple>;
+    using OutLayout = std::tuple_element_t<1, Tuple>;
+    using WeiLayout = std::tuple_element_t<2, Tuple>;
+    using InLayout  = std::tuple_element_t<3, Tuple>;
+    // Problem descriptions to profile; the test body fills this before calling Run().
+    std::vector<ck::utils::conv::ConvParam> conv_params;
+    // Profiles conv_params in order with verification on; entries after a failure are skipped.
+    template <ck::index_t NDimSpatial>
+    void Run()
+    {
+        EXPECT_FALSE(conv_params.empty());
+        bool pass = true;
+        for(auto& param : conv_params)
+        {
+            pass = pass && ck::profiler::profile_grouped_conv_bwd_data_impl<NDimSpatial,
+                                                                            OutLayout,
+                                                                            WeiLayout,
+                                                                            InLayout,
+                                                                            DataType,
+                                                                            DataType,
+                                                                            DataType>(
+                               true,  // do_verification
+                               1,     // init_method: integer value
+                               false, // do_log
+                               false, // time_kernel
+                               param);
+        }
+        EXPECT_TRUE(pass);
+    }
+};
+
+using GNHWC = ck::tensor_layout::convolution::GNHWC;
+using NHWGC = ck::tensor_layout::convolution::NHWGC;
+
+using GKYXC = ck::tensor_layout::convolution::GKYXC;
+
+using GNHWK = ck::tensor_layout::convolution::GNHWK;
+using NHWGK = ck::tensor_layout::convolution::NHWGK;
+// Tuple order: data type, output layout, weight layout, input layout.
+using KernelTypes = ::testing::Types<std::tuple<float, GNHWK, GKYXC, GNHWC>,
+                                     std::tuple<ck::half_t, GNHWK, GKYXC, GNHWC>,
+                                     std::tuple<ck::bhalf_t, GNHWK, GKYXC, GNHWC>,
+                                     std::tuple<float, NHWGK, GKYXC, NHWGC>,
+                                     std::tuple<ck::half_t, NHWGK, GKYXC, NHWGC>,
+                                     std::tuple<ck::bhalf_t, NHWGK, GKYXC, NHWGC>>;
+TYPED_TEST_SUITE(TestGroupedConvndBwdData, KernelTypes);
+
+TYPED_TEST(TestGroupedConvndBwdData, Test2D)
+{
+    this->conv_params.clear();
+    // Presumably {ndim, G, N, K, C, filter, input, stride, dilation, lpad, rpad}; confirm.
+    this->conv_params.push_back(
+        {2, 2, 4, 192, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
+    this->conv_params.push_back(
+        {2, 2, 128, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
+    this->conv_params.push_back(
+        {2, 2, 128, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}});
+    this->conv_params.push_back(
+        {2, 2, 128, 128, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}});
+    this->template Run<2>();
+}
diff --git a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_interface.cpp b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_interface.cpp
new file mode 100644
index 000000000..bc592ba66
--- /dev/null
+++ b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_interface.cpp
@@ -0,0 +1,178 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+#include <iostream>
+#include <initializer_list>
+#include <tuple>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp"
+
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using DataType    = ck::half_t;
+using AccDataType = float;
+using Pass        = ck::tensor_operation::element_wise::PassThrough;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+using ConvBackwardDataSpecialization =
+    ck::tensor_operation::device::ConvolutionBackwardDataSpecialization;
+
+static constexpr auto ConvBwdDataDefault   = ConvBackwardDataSpecialization::Default;
+static constexpr auto Filter1x1Stride1Pad0 = ConvBackwardDataSpecialization::Filter1x1Stride1Pad0;
+
+template <typename Tuple, ConvBackwardDataSpecialization ConvSpec>
+class TestGroupedConvndBwdData : public ::testing::Test
+{
+    protected:
+    static constexpr ck::index_t NDimSpatial = 2;
+    // Tuple supplies the (output, weight, input) tensor layouts, in that order.
+    using OutLayout = std::tuple_element_t<0, Tuple>;
+    using WeiLayout = std::tuple_element_t<1, Tuple>;
+    using InLayout  = std::tuple_element_t<2, Tuple>;
+
+    // clang-format off
+    using GroupedConvBwdDataDeviceInstance = ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
+    // ######| NDimSpatial|   ALayout|   BLayout|   DsLayout|  ELayout|       AData|       BData|     AccData|         CShuffle|       DsData|      EData| AElementwise| BElementwise| CDEElementwise| ConvolutionBackward| DoPad| DoPad|      NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer|    MXdl|    NXdl|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffleMXdl| CShuffleNXdl|   CDEBlockTransfer| CDEBlockTransfer|
+    // ######|            |          |          |           |         |        Type|        Type|        Type|         DataType|         Type|       Type|    Operation|    Operation|      Operation|  DataSpecialization| GemmM| GemmN| PrefetchStage|  Size| Block| Block| Block|    |    |  XDL|  XDL| PerWave| PerWave|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|      PerWave|      PerWave|  _MBlock_MPerBlock|  ScalarPerVector|
+    // ######|            |          |          |           |         |            |            |            |                 |             |           |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          |   PerShuffle|   PerShuffle|  _NBlock_NPerBlock|       _NPerBlock|
+    // ######|            |          |          |           |         |            |            |            |                 |             |           |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |             |             |                   |                 |
+    < NDimSpatial, OutLayout, WeiLayout, ck::Tuple<>, InLayout, DataType, DataType, AccDataType, DataType,   ck::Tuple<>, DataType, Pass, Pass,    Pass,  ConvSpec,  true,  true,             1,   256,   128,   256,    32,   8,   2,   32,   32,       2,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,            1,            1,     S<1, 32, 1, 8>,                8>;
+    // clang-format on
+
+    ck::utils::conv::ConvParam conv_param;
+    // Returns whether the device instance accepts conv_param; NDim must equal NDimSpatial.
+    template <ck::index_t NDim>
+    bool Run()
+    {
+        static_assert(NDim == NDimSpatial, "Run<N> must match the fixture's NDimSpatial");
+        const auto out_g_n_k_wos_desc =
+            ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(
+                conv_param);
+
+        const auto wei_g_k_c_xs_desc =
+            ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(
+                conv_param);
+
+        const auto in_g_n_c_wis_desc =
+            ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(
+                conv_param);
+
+        std::array<ck::index_t, NDimSpatial + 3> out_lengths{};
+        std::array<ck::index_t, NDimSpatial + 3> out_strides{};
+        std::array<ck::index_t, NDimSpatial + 3> wei_lengths{};
+        std::array<ck::index_t, NDimSpatial + 3> wei_strides{};
+        std::array<ck::index_t, NDimSpatial + 3> in_lengths{};
+        std::array<ck::index_t, NDimSpatial + 3> in_strides{};
+        std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
+        std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
+        std::array<ck::index_t, NDimSpatial> input_left_pads{};
+        std::array<ck::index_t, NDimSpatial> input_right_pads{};
+
+        auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
+
+        copy(out_g_n_k_wos_desc.GetLengths(), out_lengths);
+        copy(out_g_n_k_wos_desc.GetStrides(), out_strides);
+        copy(wei_g_k_c_xs_desc.GetLengths(), wei_lengths);
+        copy(wei_g_k_c_xs_desc.GetStrides(), wei_strides);
+        copy(in_g_n_c_wis_desc.GetLengths(), in_lengths);
+        copy(in_g_n_c_wis_desc.GetStrides(), in_strides);
+        copy(conv_param.conv_filter_strides_, conv_filter_strides);
+        copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
+        copy(conv_param.input_left_pads_, input_left_pads);
+        copy(conv_param.input_right_pads_, input_right_pads);
+
+        auto conv = GroupedConvBwdDataDeviceInstance{};
+        // Data pointers are null: the argument is only handed to IsSupportedArgument() below.
+        auto argument = conv.MakeArgument(nullptr,
+                                          nullptr,
+                                          std::array<const void*, 0>{},
+                                          nullptr,
+                                          out_lengths,
+                                          out_strides,
+                                          wei_lengths,
+                                          wei_strides,
+                                          {},
+                                          {},
+                                          in_lengths,
+                                          in_strides,
+                                          conv_filter_strides,
+                                          conv_filter_dilations,
+                                          input_left_pads,
+                                          input_right_pads,
+                                          Pass{},
+                                          Pass{},
+                                          Pass{});
+        return conv.IsSupportedArgument(argument);
+    }
+};
+
+using GNHWC = ck::tensor_layout::convolution::GNHWC;
+using NHWGC = ck::tensor_layout::convolution::NHWGC;
+
+using GKYXC = ck::tensor_layout::convolution::GKYXC;
+
+using GNHWK = ck::tensor_layout::convolution::GNHWK;
+using NHWGK = ck::tensor_layout::convolution::NHWGK;
+// Tuple order: output layout, weight layout, input layout.
+using KernelTypes =
+    ::testing::Types<std::tuple<GNHWK, GKYXC, GNHWC>, std::tuple<NHWGK, GKYXC, NHWGC>>;
+// Fixture bound to ConvolutionBackwardDataSpecialization::Default.
+template <typename Tuple>
+class TestGroupedConvndBwdDataDefault : public TestGroupedConvndBwdData<Tuple, ConvBwdDataDefault>
+{
+};
+// Fixture bound to ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0.
+template <typename Tuple>
+class TestGroupedConvndBwdDataFilter1x1
+    : public TestGroupedConvndBwdData<Tuple, Filter1x1Stride1Pad0>
+{
+};
+
+TYPED_TEST_SUITE(TestGroupedConvndBwdDataDefault, KernelTypes);
+TYPED_TEST_SUITE(TestGroupedConvndBwdDataFilter1x1, KernelTypes);
+
+TYPED_TEST(TestGroupedConvndBwdDataFilter1x1, SpecializationCheck)
+{
+    // Filter 3,3 instead of 1,1: the Filter1x1Stride1Pad0 instance must reject it
+    this->conv_param  = {2, 2, 4, 192, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {0, 0}, {0, 0}};
+    bool is_supported = this->template Run<2>();
+    EXPECT_FALSE(is_supported);
+
+    // Strides 2,2 instead of 1,1: must be rejected
+    this->conv_param = {2, 2, 4, 192, 192, {1, 1}, {28, 28}, {2, 2}, {1, 1}, {0, 0}, {0, 0}};
+    is_supported     = this->template Run<2>();
+    EXPECT_FALSE(is_supported);
+
+    // Left/right pads 1,1 instead of 0,0: must be rejected
+    this->conv_param = {2, 2, 4, 192, 192, {1, 1}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, {1, 1}};
+    is_supported     = this->template Run<2>();
+    EXPECT_FALSE(is_supported);
+
+    // Filter 1,1 with strides 1,1 and pads 0,0: the only accepted combination in this test
+    this->conv_param = {2, 2, 4, 192, 192, {1, 1}, {28, 28}, {1, 1}, {1, 1}, {0, 0}, {0, 0}};
+    is_supported     = this->template Run<2>();
+    EXPECT_TRUE(is_supported);
+}
+
+TYPED_TEST(TestGroupedConvndBwdDataDefault, VectorLoadCheck)
+{
+    // vector load for A: 129 (presumably K - confirm) is not divisible by SrcScalarPerVector 8
+    this->conv_param  = {2, 2, 128, 129, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}};
+    bool is_supported = this->template Run<2>();
+    EXPECT_FALSE(is_supported);
+    // vector load for B, E, Ds: 257 (presumably C - confirm) is odd, so must be rejected too
+    this->conv_param = {2, 2, 128, 128, 257, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}};
+    is_supported     = this->template Run<2>();
+    EXPECT_FALSE(is_supported);
+}
-- 
GitLab


From 3b18f1e38cd7dc5a621d4fb4e1ac82304583db28 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Wed, 21 Jun 2023 10:47:35 -0700
Subject: [PATCH 71/71] do not build gfx941/942 targets during CI (#766)

---
 Jenkinsfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index ad2baa00e..fbff349fc 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -696,7 +696,7 @@ pipeline {
                     agent{ label rocmnode("gfx908 || gfx90a") }
                     environment{
                         setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a;gfx940" """
-                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ 
+                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a;gfx940" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ 
                     }
                     steps{
                         Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
-- 
GitLab