diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 8ccfe99c3cc73b643f8b92cb654005e54c0774bd..07d83688176bd771142a8698fda492c538c691b1 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -4,7 +4,7 @@ This is the list of developers and contributors to Composable Kernel library
 
 
 ## Developers
-[Chao Liu](https://github.com/asroy), [Jing Zhang](https://github.com/zjing14), 2018-2022
+[Chao Liu](https://github.com/asroy), [Jing Zhang](https://github.com/zjing14), 2018-2023
 
 [Letao Qin](https://github.com/ltqin), [Qianfeng Zhang](https://github.com/qianfengz), [Liang Huang](https://github.com/carlushuang), [Shaojie Wang](https://github.com/shaojiewang), 2019-2022
 
diff --git a/Dockerfile b/Dockerfile
index 8e6ddb1eba3dd934222fa4bd571729db6762dc69..710db05c0628600f4ae560eca2ec28652cd4d580 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,5 +1,5 @@
 FROM ubuntu:20.04
-
+ARG DEBIAN_FRONTEND=noninteractive
 ARG ROCMVERSION=5.6
 ARG compiler_version=""
 ARG compiler_commit=""
@@ -9,23 +9,30 @@ RUN set -xe
 ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/
 RUN useradd -rm -d /home/jenkins -s /bin/bash -u 1004 jenkins
 # Add rocm repository
+RUN chmod 1777 /tmp
 RUN apt-get update
-RUN apt-get install -y wget gnupg curl
-RUN --mount=type=ssh if [ "$ROCMVERSION" != "5.6"]; then \
-	wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
+RUN apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl
+RUN --mount=type=ssh if [ "$ROCMVERSION" != "5.6" ]; then \
+        wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
         sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list"; \
-    else sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amd-nonfree-radeon_20.04-1_all.deb" && \
+    elif [ "$ROCMVERSION" = "5.6" ] && [ "$compiler_version" = "" ]; then \
+         sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amd-nonfree-radeon_20.04-1_all.deb" && \
          apt update && apt-get install -y ./amd-nonfree-radeon_20.04-1_all.deb && \
-         amdgpu-repo --amdgpu-build=1567752 --rocm-build=compute-rocm-dkms-no-npi-hipclang/11914 && \
-         DEBIAN_FRONTEND=noninteractive amdgpu-install -y --usecase=rocm ; \
+         amdgpu-repo --amdgpu-build=1567752 --rocm-build=compute-rocm-dkms-no-npi-hipclang/11914; \
+    elif [ "$ROCMVERSION" = "5.6" ] && [ "$compiler_version" = "rc3" ]; then \
+         sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_5.6-20.04-1_all.deb" && \
+         apt update && apt-get install -y ./amdgpu-install-internal_5.6-20.04-1_all.deb && \
+         sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 5.6 rel-45  > /etc/apt/sources.list.d/rocm-build.list' && \
+         amdgpu-repo --amdgpu-build=1602498; \
     fi
+RUN amdgpu-install -y --usecase=rocm --no-dkms
+
 RUN wget --no-check-certificate -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add -
 RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
 RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
 
 # Install dependencies
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
-    apt-utils \
     build-essential \
     ccache \
     cmake \
@@ -38,16 +45,11 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
     libpthread-stubs0-dev \
     llvm-amdgpu \
     pkg-config \
-    python \
     python3 \
-    python-dev \
     python3-dev \
     python3-pip \
     sshpass \
     software-properties-common \
-    rocm-dev \
-    rocm-device-libs \
-    rocm-cmake \
     vim \
     nano \
     zlib1g-dev \
diff --git a/LICENSE b/LICENSE
index 2fe9a8455efaeda2eab474b2aa038ec2d9e76841..e03fddaf78080705d26ec277629cfb8010c077bf 100644
--- a/LICENSE
+++ b/LICENSE
@@ -7,7 +7,7 @@ Copyright (c) 2020     , Advanced Micro Devices, Inc. (Xiaoyan Zhou)
 Copyright (c) 2021-2022, Advanced Micro Devices, Inc. (Jianfeng Yan)
 
 SPDX-License-Identifier: MIT
-Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/client_example/01_gemm/gemm.cpp b/client_example/01_gemm/gemm.cpp
index ba7118ba3929e3e3bcdf02e40044748860bfeebe..c37f208db1cee9bfff1fff469fd79d059fb179f0 100644
--- a/client_example/01_gemm/gemm.cpp
+++ b/client_example/01_gemm/gemm.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <vector>
diff --git a/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp b/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp
index 08f297f58a8d522aec8c991a8d38484ccf8e420c..756889562e84c66efb5f972621bcb61edda3af82 100644
--- a/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp
+++ b/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <vector>
diff --git a/client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu.cpp b/client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu.cpp
index 658c1e9e8fcbeab1ec7be4115d5a7d62c5e77283..5965e9d1d63504316f65c0f6fe33d71382eeac15 100644
--- a/client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu.cpp
+++ b/client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <vector>
diff --git a/client_example/02_gemm_add_add_fastgelu/gemm_fastgelu.cpp b/client_example/02_gemm_add_add_fastgelu/gemm_fastgelu.cpp
index ea269545a5cc576f7cf98ea8496acf713b6aaea6..319fdb0b03438d9c506a3bc00b4846695bc1519a 100644
--- a/client_example/02_gemm_add_add_fastgelu/gemm_fastgelu.cpp
+++ b/client_example/02_gemm_add_add_fastgelu/gemm_fastgelu.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <vector>
diff --git a/client_example/03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp b/client_example/03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp
index caa6573788d201b9fda605e5388423d5966d41ec..1129dfa6b4d7111909e4c2b95b03d75f610def2f 100644
--- a/client_example/03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp
+++ b/client_example/03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <vector>
diff --git a/client_example/03_gemm_layernorm/gemm_add_relu_add_layernorm_welford.cpp b/client_example/03_gemm_layernorm/gemm_add_relu_add_layernorm_welford.cpp
index d4f0c2048ba81c3a1f98a22c5eb57654b0498806..3d5fb6004844af269f7786ae05de8c20cc720633 100644
--- a/client_example/03_gemm_layernorm/gemm_add_relu_add_layernorm_welford.cpp
+++ b/client_example/03_gemm_layernorm/gemm_add_relu_add_layernorm_welford.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <iostream>
diff --git a/client_example/04_contraction/contraction_bilinear_fp32.cpp b/client_example/04_contraction/contraction_bilinear_fp32.cpp
index 91dead41a4cac19db857b99a233839e9e6647c57..89f834b9824e134f8f0aeed8aa54f78a5c8824a3 100644
--- a/client_example/04_contraction/contraction_bilinear_fp32.cpp
+++ b/client_example/04_contraction/contraction_bilinear_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <numeric>
diff --git a/client_example/04_contraction/contraction_bilinear_fp64.cpp b/client_example/04_contraction/contraction_bilinear_fp64.cpp
index 9238e4cd80075a2caf47d8757a24f7cf82c4b8bd..1aa3ba7de597a0b97a295b0f7ee7ad21b1e9cd80 100644
--- a/client_example/04_contraction/contraction_bilinear_fp64.cpp
+++ b/client_example/04_contraction/contraction_bilinear_fp64.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <numeric>
diff --git a/client_example/04_contraction/contraction_g1m2n3k1_add_xdl_fp16.cpp b/client_example/04_contraction/contraction_g1m2n3k1_add_xdl_fp16.cpp
index 62be3377a2fb18bf388b857ecb758c0b7987871c..f8ea2258c2ba262e9db42f1b4dc92ff16cdc6286 100644
--- a/client_example/04_contraction/contraction_g1m2n3k1_add_xdl_fp16.cpp
+++ b/client_example/04_contraction/contraction_g1m2n3k1_add_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <numeric>
diff --git a/client_example/04_contraction/contraction_scale_fp32.cpp b/client_example/04_contraction/contraction_scale_fp32.cpp
index 4e08ee19cdb098b2dfb70a662d59c87008400123..ba7b0633c33aabeeb06547edfb08f506e637e599 100644
--- a/client_example/04_contraction/contraction_scale_fp32.cpp
+++ b/client_example/04_contraction/contraction_scale_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <numeric>
diff --git a/client_example/04_contraction/contraction_scale_fp64.cpp b/client_example/04_contraction/contraction_scale_fp64.cpp
index 3c36aa21eb6c34df75f765cce894d4f137d4f080..24e52eb5aa423339ff96ad0914dc479d715fe7b7 100644
--- a/client_example/04_contraction/contraction_scale_fp64.cpp
+++ b/client_example/04_contraction/contraction_scale_fp64.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <numeric>
diff --git a/client_example/05_layernorm/layernorm2d.cpp b/client_example/05_layernorm/layernorm2d.cpp
index 856a4cc21935f094bfd7040ea095fb04d78b7eb4..4af4d7abe8ee6c5120f6060c78141021052cb619 100644
--- a/client_example/05_layernorm/layernorm2d.cpp
+++ b/client_example/05_layernorm/layernorm2d.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <vector>
diff --git a/client_example/06_softmax/softmax4d.cpp b/client_example/06_softmax/softmax4d.cpp
index e939ce8dfedb10166b15388cae1d659d33e2154a..987ac95690adf870ae84f1511a519a0f62a70279 100644
--- a/client_example/06_softmax/softmax4d.cpp
+++ b/client_example/06_softmax/softmax4d.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <functional>
 #include <numeric>
diff --git a/client_example/07_grouped_convnd_fwd/grouped_conv1d_fwd.cpp b/client_example/07_grouped_convnd_fwd/grouped_conv1d_fwd.cpp
index 9fbdb83b1cf10082958730e5115f3bf54e12d415..70be0101c6d92512ab28a72797d2b8c46fb55281 100644
--- a/client_example/07_grouped_convnd_fwd/grouped_conv1d_fwd.cpp
+++ b/client_example/07_grouped_convnd_fwd/grouped_conv1d_fwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iomanip>
diff --git a/client_example/07_grouped_convnd_fwd/grouped_conv2d_fwd.cpp b/client_example/07_grouped_convnd_fwd/grouped_conv2d_fwd.cpp
index 0a798be270fe4148f904f35df3550be552e294ff..57a210fa1f5f0d3f0eca014e08ccd119d29e4fd6 100644
--- a/client_example/07_grouped_convnd_fwd/grouped_conv2d_fwd.cpp
+++ b/client_example/07_grouped_convnd_fwd/grouped_conv2d_fwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iomanip>
diff --git a/client_example/08_fused_attention/fused_attention.cpp b/client_example/08_fused_attention/fused_attention.cpp
index fe927da1248786a4b943f610ce38b75f0d88defd..df6bc11a70d32df221612466a8af0fbcd9cafb1c 100644
--- a/client_example/08_fused_attention/fused_attention.cpp
+++ b/client_example/08_fused_attention/fused_attention.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <vector>
diff --git a/client_example/08_fused_attention/fused_attention_bias.cpp b/client_example/08_fused_attention/fused_attention_bias.cpp
index 3113b7856025af74c610981db130a8d965c36a24..6c9f3bc8f6f5a3c06f339f1246b5b3985e11d2d8 100644
--- a/client_example/08_fused_attention/fused_attention_bias.cpp
+++ b/client_example/08_fused_attention/fused_attention_bias.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <vector>
diff --git a/client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp b/client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp
index 43a4779f5ff0db4558b384ced53f3bfa78ca9c4b..cd504e942e943b8f174442554eca379cd908ba6e 100644
--- a/client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <iostream>
diff --git a/client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp b/client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp
index 2ff91fe966ed39181b3611e2a5cf1840fc273121..f4aa3666b1c15eccd37bdef549f8402bcd1b252b 100644
--- a/client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <iostream>
diff --git a/client_example/09_quantization/conv2d_fwd_bias_tanh_perchannel_quantization.cpp b/client_example/09_quantization/conv2d_fwd_bias_tanh_perchannel_quantization.cpp
index 6ea5dd2231b905af6572ec7f7f5cc07ff3edb27e..ebdbbf52c0ca257dd8f672a48b32f80fa5bb8816 100644
--- a/client_example/09_quantization/conv2d_fwd_bias_tanh_perchannel_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_bias_tanh_perchannel_quantization.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <iostream>
diff --git a/client_example/09_quantization/conv2d_fwd_bias_tanh_perlayer_quantization.cpp b/client_example/09_quantization/conv2d_fwd_bias_tanh_perlayer_quantization.cpp
index 33407c9a1cef1f56591c11a135e7be816cf18d6c..9d60baee06fddb27451f1f51642bad738739c4bb 100644
--- a/client_example/09_quantization/conv2d_fwd_bias_tanh_perlayer_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_bias_tanh_perlayer_quantization.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <iostream>
diff --git a/client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp b/client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp
index 6f51124707f15fedef12797d96085f2659612ce4..dd81d9ee6b6dc0ee453cac61d7e795c79ba3c9fa 100644
--- a/client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <iostream>
diff --git a/client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp b/client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp
index 6a11f9fc26a7e6fd0f2fc7022058c4a72f6a6610..9c088a21d38e7f62c8eee35bf4c557a0bb2209bd 100644
--- a/client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <iostream>
diff --git a/client_example/09_quantization/gemm_quantization.cpp b/client_example/09_quantization/gemm_quantization.cpp
index 242504b44ff942dcefdbf6cf7ffdc7d6b5b2bbdb..b14e68fa082f8f9d05ab5f00471a2fa82d1d113b 100644
--- a/client_example/09_quantization/gemm_quantization.cpp
+++ b/client_example/09_quantization/gemm_quantization.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <iostream>
diff --git a/client_example/10_grouped_conv2d_bwd_data/grouped_conv2d_bwd_data.cpp b/client_example/10_grouped_conv2d_bwd_data/grouped_conv2d_bwd_data.cpp
index 55c789804230ccccf66d68be9244c5c4111451e6..1b2e8abc201c2aed2cd2eebccb68405a25033a43 100644
--- a/client_example/10_grouped_conv2d_bwd_data/grouped_conv2d_bwd_data.cpp
+++ b/client_example/10_grouped_conv2d_bwd_data/grouped_conv2d_bwd_data.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iomanip>
diff --git a/client_example/11_grouped_conv_bwd_weight/common.hpp b/client_example/11_grouped_conv_bwd_weight/common.hpp
index a906263333c8a15947e12fe8702e431d4acb7999..62eb7bcf553f2ddd17181ce6697dcda70ae9cad6 100644
--- a/client_example/11_grouped_conv_bwd_weight/common.hpp
+++ b/client_example/11_grouped_conv_bwd_weight/common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iomanip>
diff --git a/client_example/12_elementwise_normalization/elementwise_layernorm2d.cpp b/client_example/12_elementwise_normalization/elementwise_layernorm2d.cpp
index de68f46d398958917e49bf14178f66414590ed86..bc4a6fe0bfa9e118bbd6ba32ecc7dd68f3b8b2c3 100644
--- a/client_example/12_elementwise_normalization/elementwise_layernorm2d.cpp
+++ b/client_example/12_elementwise_normalization/elementwise_layernorm2d.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <vector>
diff --git a/client_example/13_batchnorm/batchnorm_bwd_nhwc.cpp b/client_example/13_batchnorm/batchnorm_bwd_nhwc.cpp
index 8ef21986a4d9a5f1eb25e21a1073c4cc341da88d..c0140f71c15993da2e0d4b629321a355c74f004a 100644
--- a/client_example/13_batchnorm/batchnorm_bwd_nhwc.cpp
+++ b/client_example/13_batchnorm/batchnorm_bwd_nhwc.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <functional>
 #include <numeric>
diff --git a/client_example/13_batchnorm/batchnorm_fwd_nhwc.cpp b/client_example/13_batchnorm/batchnorm_fwd_nhwc.cpp
index 322667a46bacae8d0c681939c3890ef9ff476b0e..365373343668409f130f642d8e217cfd48a9afac 100644
--- a/client_example/13_batchnorm/batchnorm_fwd_nhwc.cpp
+++ b/client_example/13_batchnorm/batchnorm_fwd_nhwc.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <functional>
 #include <numeric>
diff --git a/client_example/13_batchnorm/batchnorm_infer_nhwc.cpp b/client_example/13_batchnorm/batchnorm_infer_nhwc.cpp
index 3117d162db71a0a4d02a15decac20e1f0f50d56e..5e6627ce14d113224c7b0acb4fea69cb36c1f369 100644
--- a/client_example/13_batchnorm/batchnorm_infer_nhwc.cpp
+++ b/client_example/13_batchnorm/batchnorm_infer_nhwc.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <functional>
 #include <numeric>
diff --git a/client_example/14_instance_id/batchnorm_fwd_instance_id.cpp b/client_example/14_instance_id/batchnorm_fwd_instance_id.cpp
index 9cfeee1cfe106e69f83dc0184f3956d6751a2947..d45782d8e0ff37027a204c6820447286581f1138 100644
--- a/client_example/14_instance_id/batchnorm_fwd_instance_id.cpp
+++ b/client_example/14_instance_id/batchnorm_fwd_instance_id.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <functional>
 #include <numeric>
diff --git a/client_example/15_gemm_add_multiply/gemm_add_multiply.cpp b/client_example/15_gemm_add_multiply/gemm_add_multiply.cpp
index 28524a9eee9b87db4223484497f8e2181ed366ea..c74d7c6bd8cf9cec54168792a8718768827d7b33 100644
--- a/client_example/15_gemm_add_multiply/gemm_add_multiply.cpp
+++ b/client_example/15_gemm_add_multiply/gemm_add_multiply.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <vector>
diff --git a/client_example/15_reduce/reduce_nhwc_c.cpp b/client_example/15_reduce/reduce_nhwc_c.cpp
index 2275158bcb26d36d871c9e7086b45d4539584785..b45b72f0de0199daa88e9e42be320e2669398dfe 100644
--- a/client_example/15_reduce/reduce_nhwc_c.cpp
+++ b/client_example/15_reduce/reduce_nhwc_c.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <functional>
 #include <numeric>
diff --git a/client_example/17_grouped_gemm_fastgelu/grouped_gemm_fastgelu.cpp b/client_example/17_grouped_gemm_fastgelu/grouped_gemm_fastgelu.cpp
index 223ed29be9ac66e8805d8d5b3e696bbac4f5c922..7ba3224fc3244ab3bd472b98864efa42ae132fde 100644
--- a/client_example/17_grouped_gemm_fastgelu/grouped_gemm_fastgelu.cpp
+++ b/client_example/17_grouped_gemm_fastgelu/grouped_gemm_fastgelu.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <iostream>
diff --git a/client_example/18_groupnorm/groupnorm_swish.cpp b/client_example/18_groupnorm/groupnorm_swish.cpp
index 84f62ceac437cb1cf950388e17a2af4487dda18f..308061a324956f11b0c10dc77ff70d1d20b47787 100644
--- a/client_example/18_groupnorm/groupnorm_swish.cpp
+++ b/client_example/18_groupnorm/groupnorm_swish.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <vector>
diff --git a/client_example/19_pool_fwd/avg_pool3d_fwd.cpp b/client_example/19_pool_fwd/avg_pool3d_fwd.cpp
index 47bd7738f69189b95541a0722c7c1b43914fd98b..2edaf474b5644a0a91fb51e675e1c37286841967 100644
--- a/client_example/19_pool_fwd/avg_pool3d_fwd.cpp
+++ b/client_example/19_pool_fwd/avg_pool3d_fwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <vector>
diff --git a/client_example/19_pool_fwd/max_pool2d_fwd.cpp b/client_example/19_pool_fwd/max_pool2d_fwd.cpp
index 12ee6192012c8b6114ecade29c0580731e0cb90d..c776dc12da391de21daa2cf464d2bde39873db9f 100644
--- a/client_example/19_pool_fwd/max_pool2d_fwd.cpp
+++ b/client_example/19_pool_fwd/max_pool2d_fwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iomanip>
 #include <vector>
diff --git a/example/01_gemm/common.hpp b/example/01_gemm/common.hpp
index 495a8159623beb22d6002bb2e1667ef459b74139..144c9aaccd7f8b1e20e30e671be4be2525ee1d03 100644
--- a/example/01_gemm/common.hpp
+++ b/example/01_gemm/common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/01_gemm/gemm_dl_fp16.cpp b/example/01_gemm/gemm_dl_fp16.cpp
index cf585a8c51cbb9b5b0218228e4aa706189598804..b5fecb97521bd2d4213a9435b01dafe45a43be6e 100644
--- a/example/01_gemm/gemm_dl_fp16.cpp
+++ b/example/01_gemm/gemm_dl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/01_gemm/gemm_dl_fp32.cpp b/example/01_gemm/gemm_dl_fp32.cpp
index 93f085cdee53667a5b906cfed3b037d57d00bf5f..212b72f2a6a060a602c5c640709da77684c7c55e 100644
--- a/example/01_gemm/gemm_dl_fp32.cpp
+++ b/example/01_gemm/gemm_dl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/01_gemm/gemm_dl_int4.cpp b/example/01_gemm/gemm_dl_int4.cpp
index e392c490f29a48da3a6424f46876e220306f2907..e55ae140130c0779c1e81e0bdc84b6724c6bac86 100644
--- a/example/01_gemm/gemm_dl_int4.cpp
+++ b/example/01_gemm/gemm_dl_int4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
 #error Should compile this file with ck::int4_t support
diff --git a/example/01_gemm/gemm_dl_int8.cpp b/example/01_gemm/gemm_dl_int8.cpp
index be9e387718f120fb1ba708d374f0ff9a09fc806d..1840390aa9e02e85c88328baf358546db362ab24 100644
--- a/example/01_gemm/gemm_dl_int8.cpp
+++ b/example/01_gemm/gemm_dl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/01_gemm/gemm_wmma_fp16.cpp b/example/01_gemm/gemm_wmma_fp16.cpp
index 58f965be8817bb5761dd677a52966d21c1bc1024..b11fe76ab2ce031708be7addd7f84be0f15adc6b 100644
--- a/example/01_gemm/gemm_wmma_fp16.cpp
+++ b/example/01_gemm/gemm_wmma_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/01_gemm/gemm_xdl_bf16.cpp b/example/01_gemm/gemm_xdl_bf16.cpp
index 9aaae6ade9564aa54bc4ea0c7f2c96aac89ff505..3cac55ef4702856dd08dc223c02ea25113dbbf32 100644
--- a/example/01_gemm/gemm_xdl_bf16.cpp
+++ b/example/01_gemm/gemm_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/01_gemm/gemm_xdl_fp16.cpp b/example/01_gemm/gemm_xdl_fp16.cpp
index 50d35fd9ac98de821fe4861b2dbf34907d3592eb..54fbd9cdd4983c1a810f3c8a1ac1521b363af690 100644
--- a/example/01_gemm/gemm_xdl_fp16.cpp
+++ b/example/01_gemm/gemm_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/01_gemm/gemm_xdl_fp64.cpp b/example/01_gemm/gemm_xdl_fp64.cpp
index 99253b743d58707707c9765c18e5933cb09e4220..8361576299c3da3727cc467387e9717fceb93ce3 100644
--- a/example/01_gemm/gemm_xdl_fp64.cpp
+++ b/example/01_gemm/gemm_xdl_fp64.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/01_gemm/gemm_xdl_int4.cpp b/example/01_gemm/gemm_xdl_int4.cpp
index 7f1283a47b36c94be9645abd9fbd63094293f46e..f6238c7aa5040d8e409a139fe8c144835282cdc9 100644
--- a/example/01_gemm/gemm_xdl_int4.cpp
+++ b/example/01_gemm/gemm_xdl_int4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
 #error Should compile this file with ck::int4_t support
diff --git a/example/01_gemm/gemm_xdl_int8.cpp b/example/01_gemm/gemm_xdl_int8.cpp
index e67594c5bcbd601ae4747ac7720dab66b605ba2c..cc03200b9d153e8e22e13e6c77a52ad76ed79174 100644
--- a/example/01_gemm/gemm_xdl_int8.cpp
+++ b/example/01_gemm/gemm_xdl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp b/example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp
index 12a69925977b7eef2e5aa178016a7d433e9c0781..3afd0ebdb955dbcf949cfa2467a85cdf325c5f4f 100644
--- a/example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp
+++ b/example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/01_gemm/gemm_xdl_wavelet_fp16.cpp b/example/01_gemm/gemm_xdl_wavelet_fp16.cpp
index 3a0ddd90b700f7086b44a8b0f4e5b9d7385d0ad2..d7176f75daa65da438565fc6c7c3923873485a6d 100644
--- a/example/01_gemm/gemm_xdl_wavelet_fp16.cpp
+++ b/example/01_gemm/gemm_xdl_wavelet_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/01_gemm/run_gemm_example.inc b/example/01_gemm/run_gemm_example.inc
index 4e2cedb52adc8c7030167ec7f171d91f0796f234..38c72afc60c693ff25fbf8d6679ea7f22ebfdb13 100644
--- a/example/01_gemm/run_gemm_example.inc
+++ b/example/01_gemm/run_gemm_example.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp b/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp
index ff99bf4641115e466775995de2761ba52897783d..877792d7409f48ee562cded6629f7e1bbde650d9 100644
--- a/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp
+++ b/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp b/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp
index 917b6b1c3142ec8cf7d1c852ea59494fea1842a9..c3e6ef7d5df2e12de69e2f9465f8f3e3709c0859 100644
--- a/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp
+++ b/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp b/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp
index aee51d05de58b6386a5dd267d45a7e5c12d98276..dffeff23374b6f868fcce952ef8991ca289f59b7 100644
--- a/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp
+++ b/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/04_gemm_add_add_fastgelu/common.hpp b/example/04_gemm_add_add_fastgelu/common.hpp
index 839587c148956cf186e1ba5ac286a1b73797fade..91d17df95fc470ea23011999d0a929b9331d0cc2 100644
--- a/example/04_gemm_add_add_fastgelu/common.hpp
+++ b/example/04_gemm_add_add_fastgelu/common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp
index ba0476b9b9e3bc031cb0ea61eda66970c2544f39..e630f6783713517977f63a9a2a1ef20836af2c61 100644
--- a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp
index b940bfd89737ff4f460ef2b672a99a6696d57ce3..71f6677bae95d1779c76219f6c8b5bc05c3c1d31 100644
--- a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp32.cpp b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp32.cpp
index fa651a34ea86298a60dbea6158c356b9e50fdfce..4665c3932f419c837125ea3153437302cb9dc022 100644
--- a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp32.cpp
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp32.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int4.cpp b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int4.cpp
index 9f9c423de278b8687816dac1d518f843eb2eb34a..f206bbeb411bb0de0144015d70a892b5587d6f0e 100644
--- a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int4.cpp
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
 #error Should compile this file with ck::int4_t support
diff --git a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int8.cpp b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int8.cpp
index fadc4ef5ee47737bb33e19f1bab3264f99de5ea2..e46483ab38ac34cb5d4f97602474887d78779327 100644
--- a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int8.cpp
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/09_convnd_fwd/convnd_fwd_common.hpp b/example/09_convnd_fwd/convnd_fwd_common.hpp
index 4c594ccdf817a069918b348d7b23c6cc533bde28..109b8f9ee34096601f0dd485db6840e0674c1937 100644
--- a/example/09_convnd_fwd/convnd_fwd_common.hpp
+++ b/example/09_convnd_fwd/convnd_fwd_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
diff --git a/example/09_convnd_fwd/convnd_fwd_dl_common.hpp b/example/09_convnd_fwd/convnd_fwd_dl_common.hpp
index 855710b9d9a30ec47c74a6f31d1b03ade688e28e..aeddd4fc59fc521a196487bf7c0ea4e88bf6e1c8 100644
--- a/example/09_convnd_fwd/convnd_fwd_dl_common.hpp
+++ b/example/09_convnd_fwd/convnd_fwd_dl_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
diff --git a/example/09_convnd_fwd/convnd_fwd_dl_fp16.cpp b/example/09_convnd_fwd/convnd_fwd_dl_fp16.cpp
index db5a7f0bc3351c9f1670993942a54668c191e56f..7b6f18f46a19c9f0d7c99093fa180370acfc39c0 100644
--- a/example/09_convnd_fwd/convnd_fwd_dl_fp16.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_dl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "convnd_fwd_dl_common.hpp"
 
diff --git a/example/09_convnd_fwd/convnd_fwd_dl_fp32.cpp b/example/09_convnd_fwd/convnd_fwd_dl_fp32.cpp
index 964d784c8592b2e94bc38cc8eab6b7a645d88fc1..551655b17739b6d7250ddc447482c4141948f6fd 100644
--- a/example/09_convnd_fwd/convnd_fwd_dl_fp32.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_dl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "convnd_fwd_dl_common.hpp"
 
diff --git a/example/09_convnd_fwd/convnd_fwd_dl_int8.cpp b/example/09_convnd_fwd/convnd_fwd_dl_int8.cpp
index b0cd88f214c8550c018ad9f4d268370dbd816c7f..27a3f2e2a9eacc7fadfce307d44f3ee9fd856585 100644
--- a/example/09_convnd_fwd/convnd_fwd_dl_int8.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_dl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "convnd_fwd_dl_common.hpp"
 
diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp
index d55d3154916b0f4ba2fee195b7957904ae3d3139..74cf91d16017bd24096dac6348e5a8815856a51a 100644
--- a/example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "convnd_fwd_common.hpp"
 
diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp
index d84afba6426b1abe4b10a3b4da1fefe7c2e9272a..f6d69bafd48a4b8bff968c2fb8600793b1310294 100644
--- a/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "convnd_fwd_common.hpp"
 
diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp
index f5acc540cf98b8876caa0d5c0812436dd200deb8..6c3171f6157ac4a83a488af12089a97accfd2c7c 100644
--- a/example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "convnd_fwd_common.hpp"
 
diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp
index 8d697976abd43e5d07c49ec4e0819cbdf15ac02b..9977a496d229876c083e0afe99e17ff0fa5fca05 100644
--- a/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "convnd_fwd_common.hpp"
 
diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp
index 99f7f2565c748a950841dffa12a1d343df0051ab..bf084b3cc0b6ceab326b2b21e963e0ce2d4bdcad 100644
--- a/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "convnd_fwd_common.hpp"
 
diff --git a/example/09_convnd_fwd/run_convnd_fwd_dl_example.inc b/example/09_convnd_fwd/run_convnd_fwd_dl_example.inc
index 697ada14ba960f7231db3aa0d5b2482ca38578b8..6474df1c355bc059af5429f96616b2192f46f292 100644
--- a/example/09_convnd_fwd/run_convnd_fwd_dl_example.inc
+++ b/example/09_convnd_fwd/run_convnd_fwd_dl_example.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/09_convnd_fwd/run_convnd_fwd_example.inc b/example/09_convnd_fwd/run_convnd_fwd_example.inc
index 36a68056f1d7a34b4309a06941f0ff1f477eaba8..49852ff6678f7c3fbbfce4ac8b1303876092cf40 100644
--- a/example/09_convnd_fwd/run_convnd_fwd_example.inc
+++ b/example/09_convnd_fwd/run_convnd_fwd_example.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp b/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp
index 00e370f2968df0409155d5407066e863574613ce..137b0d1ff0fb0a7cb7159d52f54b4cd3496629ae 100644
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <algorithm>
 #include <cassert>
diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_bf16.cpp b/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_bf16.cpp
index 6ff29b4b0ff0f11d6c9d3becbd3aaafdccf3020e..4ccacb0bcee93e72eb1fd23203e9404f1c23c478 100644
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_bf16.cpp
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp16.cpp b/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp16.cpp
index 02c19c2b63bf4f9ee663140b414c89f6221cff51..bf495725e8e77a37019d1d858bf3fe3b69e828d1 100644
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp16.cpp
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp32.cpp b/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp32.cpp
index 679bb5c0c45a1e2e74fc56d21ee6f7f73cae76ad..5848785673c340149d622fd782671d5fb174edab 100644
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp32.cpp
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_int4.cpp b/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_int4.cpp
index abdbdaf74d5c0bd2b87da17d315e8712cd896d52..bf7127502faac94e6858d37c389f316b7ead35a6 100644
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_int4.cpp
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_int4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
 #error Should compile this file with ck::int4_t support
diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_int8.cpp b/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_int8.cpp
index cf86afa8e94957c01fc92acb5ed2286fcb52466c..3e1694cbe8cae679ef249b6654efc40f8b1fc1ab 100644
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_int8.cpp
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc b/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc
index b3a3891781766acd824bc1974b88f57fdd85b711..cebfeb51d63eac30243d8e3b0468b821f6fc1eb3 100644
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/12_reduce/reduce_blockwise.cpp b/example/12_reduce/reduce_blockwise.cpp
index a7ee9990c1941a73b632f3cc1d32b14a00897cc2..9a736d4cfac9d1062c568e795592711e30d18586 100644
--- a/example/12_reduce/reduce_blockwise.cpp
+++ b/example/12_reduce/reduce_blockwise.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <initializer_list>
diff --git a/example/12_reduce/reduce_blockwise_impl.hpp b/example/12_reduce/reduce_blockwise_impl.hpp
index e6e3cc8d52bc9fe6568d98194e644cf75ad1950a..7f8394a7301cc3da8271e1e8c41482d0b06a2859 100644
--- a/example/12_reduce/reduce_blockwise_impl.hpp
+++ b/example/12_reduce/reduce_blockwise_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/12_reduce/reduce_blockwise_two_call.cpp b/example/12_reduce/reduce_blockwise_two_call.cpp
index dbb18a0d83f62d27750c7797842af9bea0eb1383..eb8b5c76d31a1a0fc7a1a7e085416f80be9abf7f 100644
--- a/example/12_reduce/reduce_blockwise_two_call.cpp
+++ b/example/12_reduce/reduce_blockwise_two_call.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/12_reduce/reduce_example_common.hpp b/example/12_reduce/reduce_example_common.hpp
index 05f0a0edb25ba520d475522bd2963bc050ccc010..5f9a48804a7268cef5a4a065d0c875565bdb3e9e 100644
--- a/example/12_reduce/reduce_example_common.hpp
+++ b/example/12_reduce/reduce_example_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/12_reduce/reduce_multiblock_atomic_add.cpp b/example/12_reduce/reduce_multiblock_atomic_add.cpp
index c4d63a3add8b0d8016b11b8834a1385dcca5beb5..120e3f05957fdb9f937b290dfe09458db248f9c0 100644
--- a/example/12_reduce/reduce_multiblock_atomic_add.cpp
+++ b/example/12_reduce/reduce_multiblock_atomic_add.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <initializer_list>
diff --git a/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp b/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp
index 905242fb6b5bca3c0dd7209a5d2827546036695e..fed62186448d58ce58c2a13c6dd184e205801185 100644
--- a/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp
+++ b/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/13_pool2d_fwd/pool2d_fwd_common.hpp b/example/13_pool2d_fwd/pool2d_fwd_common.hpp
index 9abc98671389a3ebbadab0924d4334a6926feef5..1157ccd38705b92c759a6de1181fe2b135425091 100644
--- a/example/13_pool2d_fwd/pool2d_fwd_common.hpp
+++ b/example/13_pool2d_fwd/pool2d_fwd_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp b/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp
index 20c3e4701756d61ee7332f9c6d5c4b0a2be9b8f6..daf8540d4936806904b8d2630f60fa9c094b1b79 100644
--- a/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp
+++ b/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 
diff --git a/example/13_pool2d_fwd/pool2d_fwd_fp32.cpp b/example/13_pool2d_fwd/pool2d_fwd_fp32.cpp
index 34ff6f435ee0d9d5b80b7900ee065320e2a6d989..323e3f61f8490e1d1278a025dfb878ea515b2635 100644
--- a/example/13_pool2d_fwd/pool2d_fwd_fp32.cpp
+++ b/example/13_pool2d_fwd/pool2d_fwd_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 
diff --git a/example/14_gemm_quantization/gemm_dl_quantization_int8.cpp b/example/14_gemm_quantization/gemm_dl_quantization_int8.cpp
index 044f3c166a3f062760782167f3b7b4177f1693bd..2585072dfed88f569f983cc3e563c9deef8036be 100644
--- a/example/14_gemm_quantization/gemm_dl_quantization_int8.cpp
+++ b/example/14_gemm_quantization/gemm_dl_quantization_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/14_gemm_quantization/gemm_xdl_bias_relu_quantization_int8.cpp b/example/14_gemm_quantization/gemm_xdl_bias_relu_quantization_int8.cpp
index d5f4e6f62c3eb8e58285e81446a5e1332c37c533..aa3e0116954e603ed5cd759820ea49184bb4cb8a 100644
--- a/example/14_gemm_quantization/gemm_xdl_bias_relu_quantization_int8.cpp
+++ b/example/14_gemm_quantization/gemm_xdl_bias_relu_quantization_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/14_gemm_quantization/gemm_xdl_quantization_int8.cpp b/example/14_gemm_quantization/gemm_xdl_quantization_int8.cpp
index 2371737382447d666d55633fbf3ac976c3017cf7..4b207df5c628976c54d8b9c0f53ea140d51c98e5 100644
--- a/example/14_gemm_quantization/gemm_xdl_quantization_int8.cpp
+++ b/example/14_gemm_quantization/gemm_xdl_quantization_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/15_grouped_gemm/grouped_gemm_multiple_d_dl_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_multiple_d_dl_fp16.cpp
index a5c51ceb0cfbc1a19215c8a2313b12a3b4d5ec51..3e1f7f089371b17fb02ebca53b27b0860c7b9c72 100644
--- a/example/15_grouped_gemm/grouped_gemm_multiple_d_dl_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_multiple_d_dl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <algorithm>
 #include <cstddef>
diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_bfp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_bfp16.cpp
index 05d572a1f532446bdc0ec43e59b1eeadfe4bd054..680cee1f814db7cef177e87e4d9c03ba745b8e26 100644
--- a/example/15_grouped_gemm/grouped_gemm_xdl_bfp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_bfp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
index 3f78dafa8977b1b57592500138dd8f034b294efd..90a12bc1ddbf27e04a55de23e3fc6845868dac43 100644
--- a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp
index fd93bb5f87d49a96da8973c3f862b8b48bdeb180..28b0fcd0cea6d6aed46ffeabc35c4c5ebc72b864 100644
--- a/example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_int4.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_int4.cpp
index faf41bbf0bba8e06fee9982a2f158706401ca45d..60c4a71a35146c31d9b79703faf5d3422942d817 100644
--- a/example/15_grouped_gemm/grouped_gemm_xdl_int4.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_int4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_int8.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_int8.cpp
index 7cb09778c521c9dfaed38c39ee5cb7fcff7bbc26..0c96ef56d3c144f19dcb4c7101a8d481d8960ee6 100644
--- a/example/15_grouped_gemm/grouped_gemm_xdl_int8.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_splitk_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_splitk_fp16.cpp
index a89937b2e2e95c6c8f03587e429bd72129825a48..743ab96be6291e617cb012957865a6897bd61a91 100644
--- a/example/15_grouped_gemm/grouped_gemm_xdl_splitk_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_splitk_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp
index eb3832a668ff9829ee76e87e6ea8988b40e6980b..2f6533d4481a2f339417a19f0f290f7fd306ee9c 100644
--- a/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp
index e1248002f751abce96072ee64283ca00022a8713..b28e7f85d3137915a2fe438b16aef5447b2d4d24 100644
--- a/example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gemm_reduce_xdl_common.hpp"
 
diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp
index c2feffeb8953df030ad2e30cb942bec529b2cac1..b30ce2c48ad32d99dc5c8a4aaf7a3ee9ea774baf 100644
--- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gemm_reduce_xdl_common.hpp"
 
diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp
index 363390add3e99c96ef3891bec995cda5f8c3e24b..31e2efd6f635e30e569d97f73b4a5967b4909b82 100644
--- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gemm_reduce_xdl_common.hpp"
 
diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp32.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp32.cpp
index de6b7eb480b30d4347018da94ec518ff29fdbeec..d3c7c1d99c06a222bcefae42d94cdbb6fc8e1239 100644
--- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp32.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gemm_reduce_xdl_common.hpp"
 
diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int4.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int4.cpp
index 9666fc6622cdf3ab7cbab7071ebffd69d18b7668..9a4a6bc6e11a657b9d1eeaa35dd92552e397130d 100644
--- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int4.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gemm_reduce_xdl_common.hpp"
 
diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int8.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int8.cpp
index 00e0b767a45dd979868b81635a79600d4695c779..1a8457a8bf87cdc82e687246014d2cba73065b8f 100644
--- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int8.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gemm_reduce_xdl_common.hpp"
 
diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_bf16.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_bf16.cpp
index 652c0e6ea6d2bbe3321de2deefb2c186d7252023..5c2706c79ace02e72f2956270c322c4b00782ebf 100644
--- a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_bf16.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gemm_reduce_xdl_common.hpp"
 
diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp16.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp16.cpp
index 7eee24fed83988698e0e6d46f830f3a9dfdb0257..c119e243702a27df9bba2d6cf3f791f523c98d7d 100644
--- a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp16.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gemm_reduce_xdl_common.hpp"
 
diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp32.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp32.cpp
index c250b996928dd406bde9253ccbf40e6cd9d77347..0f5e588383abf30064d27e452131ceef1a7ad828 100644
--- a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp32.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gemm_reduce_xdl_common.hpp"
 
diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_reduce_xdl_common.hpp b/example/16_gemm_multi_d_multi_reduces/gemm_reduce_xdl_common.hpp
index 62992de59765d3e05d935ac69894c4550a7ef472..1bea1bcf3e0fc1494e07952020db175a1d91bfbf 100644
--- a/example/16_gemm_multi_d_multi_reduces/gemm_reduce_xdl_common.hpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_reduce_xdl_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <numeric>
 #include <initializer_list>
diff --git a/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp b/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp
index 26fa9e9821fc050c9ab69fe1a170b4c72a98dcd3..4a9d16c5c303e43989c1f32e51c2cbce6f279e5d 100644
--- a/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp
+++ b/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -81,32 +81,33 @@ int run_conv_bwd_data(bool do_verification,
     in_device_buf.SetZero();
 
     // do GEMM
-    auto conv     = DeviceConvNdBwdDataInstance{};
-    auto invoker  = conv.MakeInvoker();
-    auto argument = conv.MakeArgument(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
-                                      static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
-                                      static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-                                      conv_param.N_,
-                                      conv_param.K_,
-                                      conv_param.C_,
-                                      conv_param.input_spatial_lengths_,
-                                      conv_param.filter_spatial_lengths_,
-                                      conv_param.GetOutputSpatialLengths(),
-                                      conv_param.conv_filter_strides_,
-                                      conv_param.conv_filter_dilations_,
-                                      conv_param.input_left_pads_,
-                                      conv_param.input_right_pads_,
-                                      in_element_op,
-                                      wei_element_op,
-                                      out_element_op);
-
-    if(!conv.IsSupportedArgument(argument))
+    auto conv    = DeviceConvNdBwdDataInstance{};
+    auto invoker = conv.MakeInvoker();
+    auto argument =
+        conv.MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
+                                 static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
+                                 static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+                                 conv_param.N_,
+                                 conv_param.K_,
+                                 conv_param.C_,
+                                 conv_param.input_spatial_lengths_,
+                                 conv_param.filter_spatial_lengths_,
+                                 conv_param.GetOutputSpatialLengths(),
+                                 conv_param.conv_filter_strides_,
+                                 conv_param.conv_filter_dilations_,
+                                 conv_param.input_left_pads_,
+                                 conv_param.input_right_pads_,
+                                 in_element_op,
+                                 wei_element_op,
+                                 out_element_op);
+
+    if(!conv.IsSupportedArgument(argument.get()))
     {
         std::cout << "Not support,please check parameters or device";
         return 0;
     }
 
-    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+    float ave_time = invoker.Run(argument.get(), StreamConfig{nullptr, time_kernel});
 
     std::size_t flop      = conv_param.GetFlops();
     std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
diff --git a/example/17_convnd_bwd_data/convnd_bwd_data_dl_fp16.cpp b/example/17_convnd_bwd_data/convnd_bwd_data_dl_fp16.cpp
index f0896e977144867421b857afd0a58a239716712b..6b84eaba471a966c89b62d9f3be32d9b0e0834d6 100644
--- a/example/17_convnd_bwd_data/convnd_bwd_data_dl_fp16.cpp
+++ b/example/17_convnd_bwd_data/convnd_bwd_data_dl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "convnd_bwd_data_common.hpp"
 
diff --git a/example/17_convnd_bwd_data/convnd_bwd_data_xdl_fp16.cpp b/example/17_convnd_bwd_data/convnd_bwd_data_xdl_fp16.cpp
index c4f2c1f02bb9010ab3f5d97166819732e911b68a..c9989c60ac2c74e6821bdf6dd40e703998accf01 100644
--- a/example/17_convnd_bwd_data/convnd_bwd_data_xdl_fp16.cpp
+++ b/example/17_convnd_bwd_data/convnd_bwd_data_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "convnd_bwd_data_common.hpp"
 
diff --git a/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp b/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
index c2e3602a7bb22e3f31c66bb6045171229584fa14..e363dc5c12dd84a36a67e969b06d29a179679f94 100644
--- a/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
+++ b/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp b/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp
index bee5dea546f6da3cc23aa539583915c9b8a3b21e..24c8d82f674d97afae8cb1b3dc0274c1c13daf35 100644
--- a/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp
+++ b/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <cstdlib>
diff --git a/example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp b/example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp
index 6fc63b899e5c6abc91cad26c92c7d7733f0e8f5a..3c04c561403d0ecfb0819aa569bdf24511053442 100644
--- a/example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp
+++ b/example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <cstdlib>
diff --git a/example/19_binary_elementwise/elementwise_add_1d.cpp b/example/19_binary_elementwise/elementwise_add_1d.cpp
index a5a6bc0a8bedee0a16906381c91a9c93b070e572..1ac09641a1e20a6be8bf44f4f1ff0f3be045dedb 100644
--- a/example/19_binary_elementwise/elementwise_add_1d.cpp
+++ b/example/19_binary_elementwise/elementwise_add_1d.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <cstdlib>
diff --git a/example/19_binary_elementwise/elementwise_add_4d.cpp b/example/19_binary_elementwise/elementwise_add_4d.cpp
index cc209b12e3df12413beabfd11961671f4e303b41..e571aa8468008a5f4e7097eee74763a583faf9af 100644
--- a/example/19_binary_elementwise/elementwise_add_4d.cpp
+++ b/example/19_binary_elementwise/elementwise_add_4d.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <cstdlib>
diff --git a/example/20_grouped_conv_bwd_weight/common.hpp b/example/20_grouped_conv_bwd_weight/common.hpp
index 3f4818d2e3336799c3dfb5cca4fb14d9428ab58a..15727495f0f1f8dfedcb24a0ffa02d3d7aea67e0 100644
--- a/example/20_grouped_conv_bwd_weight/common.hpp
+++ b/example/20_grouped_conv_bwd_weight/common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_bf16.cpp b/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_bf16.cpp
index aed6d22b023aafc35e60fac29b6375a8602b87d5..3cd70d0f36fb12a33e05a48545fe0aaa5f0f34bb 100644
--- a/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_bf16.cpp
+++ b/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_fp16.cpp b/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_fp16.cpp
index 4a2a6195d9574c26e3526d570bfddff067bd3e37..966b5867672b2a641210fb275688f2f19c70bb8e 100644
--- a/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_fp16.cpp
+++ b/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc b/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc
index d46b37476697cec7121d26bdafdd4fb8bb4bb64e..39b9100bf8b34f26055798d745cb3cf422e53d70 100644
--- a/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc
+++ b/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 template <ck::index_t NDimSpatial>
 using HostConvBwdWeightInstance = ck::tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial,
diff --git a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp
index 192fe87b626ff721ca106b9c633c109483f5772a..96d04dcb37798fb6db3334bd2d802e435fdc93fd 100644
--- a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp
index 3f01e6947728ecbef57734190b3896ff53899b02..fc58ca19f8673aa0b2205df21350f7ddf0b92196 100644
--- a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp b/example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp
index 4da6da65f7ab21b6ffae367747ccf5f0c71232dc..bd1d6932aceb72890dcb0f7c35004cc0dccd93af 100644
--- a/example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp b/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp
index e7d857c4a0fa53a262243b759ddf9b33c691e26c..90d80f9f034b391f75c498f3a34232edf64f5260 100644
--- a/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/22_cgemm/cgemm_xdl_bf16.cpp b/example/22_cgemm/cgemm_xdl_bf16.cpp
index 92ed90ce4ab3ec09da2e053ceebe4d4ed6bcf36d..fa4482a984f20d203bd8bee68614c8e71ecbf83c 100644
--- a/example/22_cgemm/cgemm_xdl_bf16.cpp
+++ b/example/22_cgemm/cgemm_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 
diff --git a/example/22_cgemm/cgemm_xdl_common.hpp b/example/22_cgemm/cgemm_xdl_common.hpp
index 6aa06b7c32cb476b61a88235e13a12a3d1a15db5..26137a7c2e50d0d5010639592c0a614e2eca607b 100644
--- a/example/22_cgemm/cgemm_xdl_common.hpp
+++ b/example/22_cgemm/cgemm_xdl_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <numeric>
 #include <initializer_list>
diff --git a/example/22_cgemm/cgemm_xdl_fp16.cpp b/example/22_cgemm/cgemm_xdl_fp16.cpp
index 11373736ee8b37efb5f4082253d46f02074f011f..89a581e865a56b231711352f9733403ed2945aea 100644
--- a/example/22_cgemm/cgemm_xdl_fp16.cpp
+++ b/example/22_cgemm/cgemm_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 
diff --git a/example/22_cgemm/cgemm_xdl_fp32.cpp b/example/22_cgemm/cgemm_xdl_fp32.cpp
index 0f45c18c4818726f179adda70dc12b5ea7c45b9d..cf9659959990823f10ef0cb30fa0eef928c32adc 100644
--- a/example/22_cgemm/cgemm_xdl_fp32.cpp
+++ b/example/22_cgemm/cgemm_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 
diff --git a/example/22_cgemm/cgemm_xdl_int4.cpp b/example/22_cgemm/cgemm_xdl_int4.cpp
index c26a83baafd375b477845fbdd793c3918d7b7dc5..f69cc2b3cc4f98e94ddbf0b37e8ac7324515a8d6 100644
--- a/example/22_cgemm/cgemm_xdl_int4.cpp
+++ b/example/22_cgemm/cgemm_xdl_int4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 
diff --git a/example/22_cgemm/cgemm_xdl_int8.cpp b/example/22_cgemm/cgemm_xdl_int8.cpp
index 2f24189861d8468848d28b9c0480da7c3f2c4fdd..c4835b853ee75496f963a8e14d7a0190eb4cb1cf 100644
--- a/example/22_cgemm/cgemm_xdl_int8.cpp
+++ b/example/22_cgemm/cgemm_xdl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 
diff --git a/example/23_softmax/softmax_blockwise.cpp b/example/23_softmax/softmax_blockwise.cpp
index 41afd72f5ac9ddc3dc030a914e1a26da60c27356..d09e434bcfbb262a83c5167429d41d1ca54391ef 100644
--- a/example/23_softmax/softmax_blockwise.cpp
+++ b/example/23_softmax/softmax_blockwise.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp b/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp
index c934d35019602e634da6f8b6d49c1c3b133c6800..420a7cf74f3186ac62d5dc37346a202178d8d273 100644
--- a/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp
+++ b/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp b/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp
index 98835f98fa6ecea9c38f1155f5dd03d0787fa0d2..9d606db205dde86d544971b4b7fc4830ba73c568 100644
--- a/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp
+++ b/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/26_contraction/contraction_bilinear_xdl_fp32.cpp b/example/26_contraction/contraction_bilinear_xdl_fp32.cpp
index 6004db6e0e1b0c28ea195066dbefcfd4253fadff..78522160c85a1024ec510197e5ab1069e6f077cf 100644
--- a/example/26_contraction/contraction_bilinear_xdl_fp32.cpp
+++ b/example/26_contraction/contraction_bilinear_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/26_contraction/contraction_bilinear_xdl_fp64.cpp b/example/26_contraction/contraction_bilinear_xdl_fp64.cpp
index 9576ce3f2c3d47ccf576c191e8eca1b0bd97ccd5..6cceed5bc11ed8eb79568a0397bbf7e1b93fd33d 100644
--- a/example/26_contraction/contraction_bilinear_xdl_fp64.cpp
+++ b/example/26_contraction/contraction_bilinear_xdl_fp64.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/26_contraction/contraction_scale_xdl_fp32.cpp b/example/26_contraction/contraction_scale_xdl_fp32.cpp
index 3aa2a7ba93858fb950eb785bac740cdfafe2b52a..1574f5d18fb7f5f193cd4adb1c1a88616d7b1437 100644
--- a/example/26_contraction/contraction_scale_xdl_fp32.cpp
+++ b/example/26_contraction/contraction_scale_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/26_contraction/contraction_scale_xdl_fp64.cpp b/example/26_contraction/contraction_scale_xdl_fp64.cpp
index cccf6505c6473567c0ee2f096e3a038e23385bca..3dacc708877d838dd5fb0113b5d560885b0551ae 100644
--- a/example/26_contraction/contraction_scale_xdl_fp64.cpp
+++ b/example/26_contraction/contraction_scale_xdl_fp64.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/27_layernorm/common.hpp b/example/27_layernorm/common.hpp
index 8d833a3ae9e765d96ba03459aa8b1ba18e4823ba..62a71713df84351fea902cdcf3275786b60f5a23 100644
--- a/example/27_layernorm/common.hpp
+++ b/example/27_layernorm/common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/27_layernorm/layernorm_fp16.cpp b/example/27_layernorm/layernorm_fp16.cpp
index c15ffabf509415a9bb0a702997819a7da3bb8fb4..bb8b954f0acaf2e9d748104a05561f9cb02fc9a0 100644
--- a/example/27_layernorm/layernorm_fp16.cpp
+++ b/example/27_layernorm/layernorm_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/27_layernorm/layernorm_splitk_fp16.cpp b/example/27_layernorm/layernorm_splitk_fp16.cpp
index 01ee7161eb05164b37b914d936488cca0145f851..e0378d028b343a6a70335cb9163370d1586ebf0a 100644
--- a/example/27_layernorm/layernorm_splitk_fp16.cpp
+++ b/example/27_layernorm/layernorm_splitk_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/27_layernorm/run_layernorm_example.inc b/example/27_layernorm/run_layernorm_example.inc
index 678d8df281684f19f92b892766292358e7c00310..95200b540aa9f9704c8fad785b352bdf779c4094 100644
--- a/example/27_layernorm/run_layernorm_example.inc
+++ b/example/27_layernorm/run_layernorm_example.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp b/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp
index f8e6501eadaee4e5b4156309afebdf16e11f3e8a..24e9b1d9b7d569adf20caf1bd0f0cddafdbd256b 100644
--- a/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp
+++ b/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp
index 30ad38a566309dcfacd52f100099d8a3bf02f77f..62233e535151e6b647e938de04b6225fe487b6c4 100644
--- a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp
+++ b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_xdl_fp16.cpp b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_xdl_fp16.cpp
index 25d815b9cdfdddc29c699a1bc51dd5191a5246eb..08158bfc250642e032d20fad940bb8853c71d446 100644
--- a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_xdl_fp16.cpp
+++ b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt b/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
index 04729968581398f71d333b02e9608e6fcc232949..9780a64cc28ce8985e9a83cdf6c55bac87f592fc 100644
--- a/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
+++ b/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
@@ -20,4 +20,5 @@ if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS M
 endif()
 if(GPU_TARGETS MATCHES "gfx1100" OR GPU_TARGETS MATCHES "gfx1101" OR GPU_TARGETS MATCHES "gfx1102")
   add_example_executable(example_grouped_conv_fwd_bias_relu_add_wmma_fp16 grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp)
+  add_example_executable(example_grouped_conv_fwd_bias_relu_add_wmma_int8 grouped_conv_fwd_bias_relu_add_wmma_int8.cpp)
 endif()
diff --git a/example/30_grouped_conv_fwd_multiple_d/common.hpp b/example/30_grouped_conv_fwd_multiple_d/common.hpp
index e7c6ed9b939abb0e8f593d54d5b45c499db5f2c0..e60ebee6e4fe2e08db432816b75bbbd4c388946c 100644
--- a/example/30_grouped_conv_fwd_multiple_d/common.hpp
+++ b/example/30_grouped_conv_fwd_multiple_d/common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp b/example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp
index eb6975a6d812aef66f1caf493dfe82549a0e4ae9..ae769ff1d38306f4feed8d3cb00dde79b3f561eb 100644
--- a/example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp
+++ b/example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp
index 9d1d257a2889dcdaefb697faa6bcf6c172c55839..039d25029921491e7e67808e554b8cb3e6eb4745 100644
--- a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp
+++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common_wmma.hpp"
 
diff --git a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_wmma_int8.cpp b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_wmma_int8.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..793324970e700748ac4b18e9fc429a24f70c007d
--- /dev/null
+++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_wmma_int8.cpp
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common_wmma.hpp"
+
+// kernel data types
+using InKernelDataType       = I8;
+using WeiKernelDataType      = I8;
+using AccDataType            = I32;
+using CShuffleDataType       = I8;
+using BiasKernelDataType     = I8;
+using ResidualKernelDataType = I8;
+using OutKernelDataType      = I8;
+
+// tensor data types
+using InUserDataType  = InKernelDataType;
+using WeiUserDataType = WeiKernelDataType;
+using OutUserDataType = OutKernelDataType;
+
+using InElementOp  = PassThrough;
+using WeiElementOp = PassThrough;
+using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
+
+#include "run_grouped_conv_fwd_bias_relu_add_wmma_example.inc"
+
+int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv); }
diff --git a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp
index ee300d073a28b44cf8177f64a7278d5e13ef3130..43c0d57dc2adcae69ecde327137eb204b71fa44d 100644
--- a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp
+++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp
index 5a9df0b1e880c8f3bc954c07cd5de3639644c4e1..40b4132b358d8259146c2987472dc7e9d6c71091 100644
--- a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp
+++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp
index c2906cc9dd1dbe22c8d420ce0929783fb0be0fa6..e05d384f26b40273a6bfe77222b805289a99e631 100644
--- a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp
+++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int4.cpp b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int4.cpp
index 3d5a243e6b9282f9d362c82c0758776341df646f..5494563fdd568d1f51ec9ee9042d094096e8fc8a 100644
--- a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int4.cpp
+++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
 #error Should compile this file with ck::int4_t support
diff --git a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int8.cpp b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int8.cpp
index eaf680fa438a14ee402e3805c18692ffdcf78c7e..6bf2e8d963c1524bfec0680409580b47e5394bc5 100644
--- a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int8.cpp
+++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_example.inc b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_example.inc
index 4561156e0bd3a935c49751802e8c3037d8735228..eb242203eaa65ba7b8fca45c5f1e3e1b9d96c409 100644
--- a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_example.inc
+++ b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_example.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 template <typename BiasLay, typename ResidualLay>
 struct LayoutSetting
diff --git a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc
index a6888649c0ed0adc6f2084c428b2c847cd952f77..360b2c8947b371b1502afb3af73c469f0d8c3636 100644
--- a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc
+++ b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 template <typename BiasLay, typename ResidualLay>
 struct LayoutSetting
diff --git a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_example.inc b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_example.inc
index d087c31af5def501635ef173769851e111dec3cb..58ed69182e0622751dc3fa2823fcb7cfa5b279a8 100644
--- a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_example.inc
+++ b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_example.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 template <ck::index_t NDimSpatial>
 using DeviceConvFwdInstance =
diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_bf16.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_bf16.cpp
index 74e0e07e62a99e0700213e80a066d7e15128e516..7605d9c4f8368e369e4c29167f90620984b5a6c8 100644
--- a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_bf16.cpp
+++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 /*
 Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o
diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp16.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp16.cpp
index d5fadb8081e86a83784f6fad7e7405a7808b65b9..33ed04fb3068b1bf83eb128825ccad355c6de4f9 100644
--- a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp16.cpp
+++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 /*
 Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o
diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp32.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp32.cpp
index 0dd4e0914f479eefae604ceba8115432ce1d3803..e0eb193ad0484c62ac6fa5695ac43aa17c509227 100644
--- a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp32.cpp
+++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 /*
 Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o
diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int4.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int4.cpp
index 1fd93622a1b14a72d8159517a84b9f19d2959723..d166214c3376cd90afd11796a4cb85ea28421861 100644
--- a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int4.cpp
+++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 /*
 Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o
diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int8.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int8.cpp
index 15d98abab7dfa0f1168fd411eeae481054053000..40f87d1f554c4ef2a73ceb32263c3499f733b40a 100644
--- a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int8.cpp
+++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 /*
 Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o
diff --git a/example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc b/example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc
index 7e5f1614bcf6c6133d88f9d8dad63ec4be47c9ac..f329146728dd8df3c38af3d2520ac91d43d12d00 100644
--- a/example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc
+++ b/example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp
index 0eb15653306f0179b1d132ddfcd130af6ba13fd6..1d1566d57561afaa84be2de34b11416b5da0571c 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 /*
 Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g_k_n) * B1_g_n_o
diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_bf16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_bf16.cpp
index 8f1db577c604418b7b1d546b5de251f5a20054a6..bae88d4b8e6f786219e8d68bd0a34f6036dd0d72 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_bf16.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 /*
 Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g_k_n) * B1_g_n_o
diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp
index 2ce91a8c6023314d61b354614c81ca74c5f95727..a098ce6675e05308fd9de223e807a0e14504a7f5 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 /*
 Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g_k_n) * B1_g_n_o
diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_bf16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_bf16.cpp
index 1fd2bf69306f5b35dd4ea2ae484c677c6b3c63fc..ce8caf758842225098baac92876ea6996321ff21 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_bf16.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 /*
 Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g_k_n) * B1_g_n_o
diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_fp16.cpp
index f4a8589052f0f149902553884a58cc4c6c79d030..138db14963809cd49294d1cfc2171b038dd10c60 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_fp16.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 /*
 Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g_k_n) * B1_g_n_o
diff --git a/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp
index e4a71b04313446f75f4cbc73456a66bdbb62c47e..d0eb8fcc341818950bd39b7f91d95ca4f5eb4e03 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 /*
 Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g_k_n) * B1_g_n_o
diff --git a/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp
index 38b5badc6e4f270e409ed98c492035c5bebe00c3..1d97474d205160e4a34d683f40f86f5e71b0a429 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 /*
 Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g_k_n) * B1_g_n_o
diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm.inc b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm.inc
index 4e43dbdd8fc5a83486cd052ee3e789e05152861f..27602e2313f7aa197e88e1fabeb39245e2fdf5eb 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm.inc
+++ b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 int run(int argc, char* argv[])
 {
diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc
index 0b876af952f65f89e9ae55bea344bfa6f61afd5d..fa76faea84e4551ddf8d0617c132dbe7a6045fb3 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc
+++ b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 int run(int argc, char* argv[])
 {
diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_grouped_gemm_scale_softmax_gemm_permute.inc b/example/32_batched_gemm_scale_softmax_gemm/run_grouped_gemm_scale_softmax_gemm_permute.inc
index ef2acf61f55fa7f108e174d4586c13ae3dc7419d..ea1e2734a684b61a363eb93ef0e2ff933f900ea5 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/run_grouped_gemm_scale_softmax_gemm_permute.inc
+++ b/example/32_batched_gemm_scale_softmax_gemm/run_grouped_gemm_scale_softmax_gemm_permute.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 int run(int argc, char* argv[])
 {
diff --git a/example/33_multiple_reduce/dual_reduce_common.hpp b/example/33_multiple_reduce/dual_reduce_common.hpp
index 326606752b25394d504cb054cf3026173c66e970..cd21790be6548abd0f8097613ee650d4407fc400 100644
--- a/example/33_multiple_reduce/dual_reduce_common.hpp
+++ b/example/33_multiple_reduce/dual_reduce_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <cstdlib>
diff --git a/example/33_multiple_reduce/dual_reduce_multiblock.cpp b/example/33_multiple_reduce/dual_reduce_multiblock.cpp
index 9360599ed9e8a7517bc244e2942d6949c1f71e61..198931749b19b8e8d91bf16c1a733801cb27aeba 100644
--- a/example/33_multiple_reduce/dual_reduce_multiblock.cpp
+++ b/example/33_multiple_reduce/dual_reduce_multiblock.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <cstdlib>
diff --git a/example/33_multiple_reduce/dual_reduce_threadwise.cpp b/example/33_multiple_reduce/dual_reduce_threadwise.cpp
index 56255839e567fa0386ba09b80738f5d0d2abcc23..7609edad3527e8219c2ef6351381d6e53455a67b 100644
--- a/example/33_multiple_reduce/dual_reduce_threadwise.cpp
+++ b/example/33_multiple_reduce/dual_reduce_threadwise.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <cstdlib>
diff --git a/example/34_batchnorm/batchnorm_backward_nhwc.cpp b/example/34_batchnorm/batchnorm_backward_nhwc.cpp
index a6ca9d150bd918966bae06a63ee4eb9da6be5f3f..3756310fd7dd47b111b9fde743cac844ae2c7669 100644
--- a/example/34_batchnorm/batchnorm_backward_nhwc.cpp
+++ b/example/34_batchnorm/batchnorm_backward_nhwc.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <limits>
 #include <iostream>
diff --git a/example/34_batchnorm/batchnorm_common.hpp b/example/34_batchnorm/batchnorm_common.hpp
index bdbc8ea8b88f40de2f25a2ec8c5a74ab5e38fd74..a1b8d253bf061c02511945b98eac9343c0d1a236 100644
--- a/example/34_batchnorm/batchnorm_common.hpp
+++ b/example/34_batchnorm/batchnorm_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/34_batchnorm/batchnorm_forward_inferring_nhwc.cpp b/example/34_batchnorm/batchnorm_forward_inferring_nhwc.cpp
index dc2984851a02e0020c8c4cfa6d59e61bfe49c593..6a8002025a60b277c587190a95855cc262c33e17 100644
--- a/example/34_batchnorm/batchnorm_forward_inferring_nhwc.cpp
+++ b/example/34_batchnorm/batchnorm_forward_inferring_nhwc.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <limits>
 #include <iostream>
diff --git a/example/34_batchnorm/batchnorm_forward_training_nhwc.cpp b/example/34_batchnorm/batchnorm_forward_training_nhwc.cpp
index da36d65a2954ee1ac3a94c617a219e6a7c2baf44..d6808181570d424b2d9584110c4bbf6c65f8c5a6 100644
--- a/example/34_batchnorm/batchnorm_forward_training_nhwc.cpp
+++ b/example/34_batchnorm/batchnorm_forward_training_nhwc.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <limits>
 #include <iostream>
diff --git a/example/34_batchnorm/batchnorm_infer_impl.hpp b/example/34_batchnorm/batchnorm_infer_impl.hpp
index 15170586b636825325c796cdc20eb70996a24bb7..d0b545b2a31d2fc095eece12ce273db823a5d2ac 100644
--- a/example/34_batchnorm/batchnorm_infer_impl.hpp
+++ b/example/34_batchnorm/batchnorm_infer_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/35_splitK_gemm/splitK_gemm_xdl_bfp16.cpp b/example/35_splitK_gemm/splitK_gemm_xdl_bfp16.cpp
index 7191ecf50ab5252b37855072b4e03f1194df52f6..1dc21a6c23ea69d60f0c51fd70b544899496afe8 100644
--- a/example/35_splitK_gemm/splitK_gemm_xdl_bfp16.cpp
+++ b/example/35_splitK_gemm/splitK_gemm_xdl_bfp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/35_splitK_gemm/splitK_gemm_xdl_fp16.cpp b/example/35_splitK_gemm/splitK_gemm_xdl_fp16.cpp
index efdb315b4e5576aef8d992f6d57c231ac1f83a81..74fb16e15b019e145cdf13549254be1b09fbd8ca 100644
--- a/example/35_splitK_gemm/splitK_gemm_xdl_fp16.cpp
+++ b/example/35_splitK_gemm/splitK_gemm_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/35_splitK_gemm/splitK_gemm_xdl_fp32.cpp b/example/35_splitK_gemm/splitK_gemm_xdl_fp32.cpp
index bc2e3d1d52b668a2ea15c9f4ea1db44c4faeb19e..7506f694204b3c3aaa564704eda4f358baf17861 100644
--- a/example/35_splitK_gemm/splitK_gemm_xdl_fp32.cpp
+++ b/example/35_splitK_gemm/splitK_gemm_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/35_splitK_gemm/splitK_gemm_xdl_int4.cpp b/example/35_splitK_gemm/splitK_gemm_xdl_int4.cpp
index 4eb27824628d8f193ca6108a589794041fc2b2fb..7ebf9144082de53d64f3662e579dd6d246266f1d 100644
--- a/example/35_splitK_gemm/splitK_gemm_xdl_int4.cpp
+++ b/example/35_splitK_gemm/splitK_gemm_xdl_int4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/35_splitK_gemm/splitK_gemm_xdl_int8.cpp b/example/35_splitK_gemm/splitK_gemm_xdl_int8.cpp
index eefdbca6b1ae3e49cc221b3037e46888b4a9a9bd..0fc7a5cc29ccefdee6eef3f229715ee781e65223 100644
--- a/example/35_splitK_gemm/splitK_gemm_xdl_int8.cpp
+++ b/example/35_splitK_gemm/splitK_gemm_xdl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp b/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp
index f0a0cdf6f13df7081d15ab0099122542ccf6118e..d2337dcda5d30eb57e0e5f764f047cf21314a190 100644
--- a/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp
+++ b/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp b/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp
index 071e8a7431c057a32ea0061655d81a202774b923..b3d0ab6bf77aa85fa6b48fa503a3f3f0dde1bf12 100644
--- a/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp
+++ b/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 /*
 Computes C_m_o = Relu(A0[m, k] * B0[n, k] + D00[m, n] + D01[mn]) * B1[n, o] + D1[m, o]
diff --git a/example/38_grouped_conv_bwd_data_multiple_d/common.hpp b/example/38_grouped_conv_bwd_data_multiple_d/common.hpp
index d07ee7bdc1c4b999f61a84727bb10fe7ea6f4c15..ca824b1075ced161760892fe6de53767b1079ccc 100644
--- a/example/38_grouped_conv_bwd_data_multiple_d/common.hpp
+++ b/example/38_grouped_conv_bwd_data_multiple_d/common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_bias_relu_fp16.cpp b/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_bias_relu_fp16.cpp
index 55ea8c3a3109469532d0e5c0566da3eb11e7353d..a3533bb4cc4b4874f0e7ae9f333347913d981ba0 100644
--- a/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_bias_relu_fp16.cpp
+++ b/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_bias_relu_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_fp16.cpp b/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_fp16.cpp
index ddf82ec512c61c72a1a2e2323a6645f39f15ce6d..fb688b6f3f15383c2920a12e8902c2d4cf05a109 100644
--- a/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_fp16.cpp
+++ b/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_bias_relu_example.inc b/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_bias_relu_example.inc
index 0afd8bd70da849085db6e19583e933c3d1b91968..0f0b120cbcc578b2fa7f650c37c9a28148dd834a 100644
--- a/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_bias_relu_example.inc
+++ b/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_bias_relu_example.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 bool run_conv_bwd_data_bias_relu(const ExecutionConfig& config,
                                  const ck::utils::conv::ConvParam& conv_params,
diff --git a/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_example.inc b/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_example.inc
index e50c98bbe844f6ec438c781b7d5d8e254e57d98f..25678491ce90cdf1ab9455f14e091bd5fcd67431 100644
--- a/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_example.inc
+++ b/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_example.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 bool run_conv_bwd_data(const ExecutionConfig& config,
                        const ck::utils::conv::ConvParam& conv_params,
diff --git a/example/39_permute/common.hpp b/example/39_permute/common.hpp
index ab612cea1794c422c737a02e5bbe7a6728904c40..54f3a788097bebc6e293389cecd03d8ae68d8494 100644
--- a/example/39_permute/common.hpp
+++ b/example/39_permute/common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/39_permute/permute_1xHxW_fp16.cpp b/example/39_permute/permute_1xHxW_fp16.cpp
index d7f9b80544a452ca7ea062d7d6135b92a2bf19ea..7336c3b631bcadd7d23a2d6b8e0b3d3a31787516 100644
--- a/example/39_permute/permute_1xHxW_fp16.cpp
+++ b/example/39_permute/permute_1xHxW_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/39_permute/permute_HxWx4_fp16.cpp b/example/39_permute/permute_HxWx4_fp16.cpp
index 342aa134ec5570c84447b137e5be165c9a5f697c..6c24919ded6a0c65556e305455861ca137203b3e 100644
--- a/example/39_permute/permute_HxWx4_fp16.cpp
+++ b/example/39_permute/permute_HxWx4_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/39_permute/permute_NxHxW_fp16.cpp b/example/39_permute/permute_NxHxW_fp16.cpp
index b53975eb2c8632583f61afde50a03eba32c17c9b..3551d2a7c8decf31a48ca628d99a53639f4fa1fe 100644
--- a/example/39_permute/permute_NxHxW_fp16.cpp
+++ b/example/39_permute/permute_NxHxW_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/39_permute/run_permute_bundle_example.inc b/example/39_permute/run_permute_bundle_example.inc
index 70406d63f91b95f4d1cce025051a74a0ee3d114e..2c198729226243fb1fb97db6beefa1fd1f62f625 100644
--- a/example/39_permute/run_permute_bundle_example.inc
+++ b/example/39_permute/run_permute_bundle_example.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/39_permute/run_permute_element_example.inc b/example/39_permute/run_permute_element_example.inc
index bc6235303039c7cb33a4057446137a9f18ddb186..35871344567ddf62a4fb3c2bd6a2e77cb94a1998 100644
--- a/example/39_permute/run_permute_element_example.inc
+++ b/example/39_permute/run_permute_element_example.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/40_conv2d_fwd_quantization/common.hpp b/example/40_conv2d_fwd_quantization/common.hpp
index 6ee14d750ef6c9f8308ff7b17b86626fc70ca2b8..266b09145c84c456aedb2182554ced7815457410 100644
--- a/example/40_conv2d_fwd_quantization/common.hpp
+++ b/example/40_conv2d_fwd_quantization/common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp
index 5c445d9c50b6ea3235f045a7e569aaf87fb048ea..40b33852b5778f46e43ac9f527dfaec19c403697 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 #include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp
index 0ff85f008fa6fd7101436ec55c02c1e1fbae79cf..fc081ddc543a43b364f3538af8f3aa89edc2c164 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 #include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp
index f8f996d17e82948076cbb0fa52ff3c378f7cd7cb..c390f016a97c39d9c0e400f0e4ac826b41a18d69 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 #include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp
index 3b25fec0c4a1bcbf89aacd2c1c506358a09000be..10b131a5271bf7347054b2bb428ddf0311fef349 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 #include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp
index a98a1e240bcbf390cfaed0ec9230f10d88ed90ce..e59d0d075959446b8164e50db7119e8bd1d1cf54 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 #include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp
index 262594d58b3d3e591ab38209d86c3b683bd7680e..aee5fe9e60b0a3dccb3c3209a297001c55bd8ff1 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 #include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp
index 6b22055053d3f45ce6f22e03bff25e5b5b7484fa..06c839e4e226e082e9754f12ce39cececbd36269 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp
index 1ac8679743777db2c5d99370b8b22ea8004bc8d6..7a9b42d39f971ab1c36476f4c0ead172cb6d2606 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp
index f28abe5ebc92d179fd65367b028e76bd4de0cbe4..3495636297d5d83fb3ccc7d236e6135de9b715fd 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp
index f468e8adcde28b7fa0099405b76dca7b9d29ce64..2611337254212b326f6eef540c6a4eca5cd19261 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
diff --git a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perchannel_quantization_example.inc b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perchannel_quantization_example.inc
index 5675db77fae1a88a7de156a75c86f50e76f1fde0..e5b924ad5114a6e31a1ba2875118085329d5e8ce 100644
--- a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perchannel_quantization_example.inc
+++ b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perchannel_quantization_example.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 template <ck::index_t NDimSpatial,
diff --git a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perlayer_quantization_example.inc b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perlayer_quantization_example.inc
index 9fd19c1c4e1a96ddae4544ae794954296b700354..9f3a769dcf0979d407169610f5cdbe15dbccb26e 100644
--- a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perlayer_quantization_example.inc
+++ b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perlayer_quantization_example.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc
index cacedfdadd20553c19d507193ce87480660b5b28..9b08fc690d8298b87ce3ebf89f462f3739734815 100644
--- a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc
+++ b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc
index 77332cb6d990f3a91987ea42163f7a414f26dd09..267c737e00fc5046e06551ca3e366d01dc78d230 100644
--- a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc
+++ b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_bf16.cpp b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_bf16.cpp
index 2aea08c40004fac22ea0e45521ad09fd806751d8..e37d41369599819bdf1b9836c1a0587be2e31a0f 100644
--- a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_bf16.cpp
+++ b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
diff --git a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp16.cpp b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp16.cpp
index b7f80e76d6ce5a690beef4f48361e0a65402cb47..496e676a402f4d12a325c2afef56ec422cb4d721 100644
--- a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp16.cpp
+++ b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
diff --git a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp32.cpp b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp32.cpp
index 15e460948ef528b697a4e3bad7d40ca332f85c15..35d50721dcea27b2bfb42eea6e39da08709dfc7d 100644
--- a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp32.cpp
+++ b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
diff --git a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int4.cpp b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int4.cpp
index 2cc4c07c0d89fc83b233b50967cd595dc0e53ac7..80f6e9ae05712b1df4b05b1bb75fbff62d9ca70a 100644
--- a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int4.cpp
+++ b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
 #error Should compile this file with ck::int4_t support
diff --git a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int8.cpp b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int8.cpp
index 40ff0f69cc14cd644361f3391eea8ed52380adc0..3ade6c811ae5583bbb8a2a3b277c6f4e7f265a87 100644
--- a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int8.cpp
+++ b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
diff --git a/example/41_grouped_conv_conv_fwd/run_grouped_conv_conv_fwd_example.inc b/example/41_grouped_conv_conv_fwd/run_grouped_conv_conv_fwd_example.inc
index a2c97f4d421f9382aa279b257a49299c4016e2c3..0722d497d8df0ebf0acf0cf798215bebcdee6e3a 100644
--- a/example/41_grouped_conv_conv_fwd/run_grouped_conv_conv_fwd_example.inc
+++ b/example/41_grouped_conv_conv_fwd/run_grouped_conv_conv_fwd_example.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/42_groupnorm/common.hpp b/example/42_groupnorm/common.hpp
index 780154b26cb0287ed50ff4091aed733d59ee12dd..c8f91eb53b5b943bfbdd264b842cf7762a176b82 100644
--- a/example/42_groupnorm/common.hpp
+++ b/example/42_groupnorm/common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp b/example/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp
index b07a26c4c937b7e26420d824b5d00ea7d85b4f36..cc107b63dcd87735ba40ebbe7633c61993406166 100644
--- a/example/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp
+++ b/example/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/42_groupnorm/groupnorm_splitk_fp16.cpp b/example/42_groupnorm/groupnorm_splitk_fp16.cpp
index fd4bfe38072082cce83be08e2efac101024408db..057b240a63fc38217c095b6f3a058e1d4db6ea27 100644
--- a/example/42_groupnorm/groupnorm_splitk_fp16.cpp
+++ b/example/42_groupnorm/groupnorm_splitk_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/42_groupnorm/groupnorm_swish_fp16.cpp b/example/42_groupnorm/groupnorm_swish_fp16.cpp
index c52243bfb0c4724df5fe418dc741362f8a5c92d5..363f22ed4c0015c9a96aa98a8b6b5302978e3d25 100644
--- a/example/42_groupnorm/groupnorm_swish_fp16.cpp
+++ b/example/42_groupnorm/groupnorm_swish_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
diff --git a/example/42_groupnorm/run_groupnorm_example.inc b/example/42_groupnorm/run_groupnorm_example.inc
index d1016a3b12dbd95d4230678c4a34fcc78b105bbf..16065c8d46a42433416e21e9607dc3fcb708d817 100644
--- a/example/42_groupnorm/run_groupnorm_example.inc
+++ b/example/42_groupnorm/run_groupnorm_example.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp16.cpp b/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp16.cpp
index 7ac4b68272e2b0a1a5cbc09fa3734e190d6897da..b6d9b29a5b757f024ca09c52d9b8ce3f5430e555 100644
--- a/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp16.cpp
+++ b/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp32.cpp b/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp32.cpp
index 764e55ef558bd977fca8f25f16b2bba4c111114b..60a0e01fe3e3db9f490304bcd4bbe322a04d3c0b 100644
--- a/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp32.cpp
+++ b/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp b/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp
index 7d6ff12eeafe8de6f45e93567959d86d748c783c..76361f87a5b58071b149cab505f7c0485abfadab 100644
--- a/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp
+++ b/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/46_gemm_add_multiply/common.hpp b/example/46_gemm_add_multiply/common.hpp
index 3ba78dfe47ba4cac2f647bf5cb609561e6989275..2c656cf44159a594f0b8ba3b40b7d8d936d57cb2 100644
--- a/example/46_gemm_add_multiply/common.hpp
+++ b/example/46_gemm_add_multiply/common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/46_gemm_add_multiply/gemm_add_multiply_dl_fp16.cpp b/example/46_gemm_add_multiply/gemm_add_multiply_dl_fp16.cpp
index 28c3939fa611d536600bdce9c7609705e0373da1..58a399f226c1b59073dd000e382e673badb5e1fb 100644
--- a/example/46_gemm_add_multiply/gemm_add_multiply_dl_fp16.cpp
+++ b/example/46_gemm_add_multiply/gemm_add_multiply_dl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp"
diff --git a/example/46_gemm_add_multiply/gemm_add_multiply_xdl_fp16.cpp b/example/46_gemm_add_multiply/gemm_add_multiply_xdl_fp16.cpp
index d5aa41f1b6d5a97c5c1b614e2765a86fde7343e1..56417b101d9528674535ab93f6fdc7ab36d47c3d 100644
--- a/example/46_gemm_add_multiply/gemm_add_multiply_xdl_fp16.cpp
+++ b/example/46_gemm_add_multiply/gemm_add_multiply_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp"
diff --git a/example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute.cpp b/example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute.cpp
index 30c98e534a99fa05ff6ebec044c5b70df7419254..cfb42c6e17a5e002cdf43a4d675584da2aa2facc 100644
--- a/example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute.cpp
+++ b/example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <vector>
diff --git a/example/48_pool3d_fwd/pool3d_fwd_common.hpp b/example/48_pool3d_fwd/pool3d_fwd_common.hpp
index 5706deb6d190faa4d9addf7b018f98befdac9ab2..565bb94e47e13af73b4d49361f319c50252e84f0 100644
--- a/example/48_pool3d_fwd/pool3d_fwd_common.hpp
+++ b/example/48_pool3d_fwd/pool3d_fwd_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 
 #include <iostream>
diff --git a/example/48_pool3d_fwd/pool3d_fwd_fp16.cpp b/example/48_pool3d_fwd/pool3d_fwd_fp16.cpp
index 4d3686bcb0c1f991f29da38ec5b99700e4c8155a..9afb51201deebe47b71932ab8fe1b9ffb50edcd6 100644
--- a/example/48_pool3d_fwd/pool3d_fwd_fp16.cpp
+++ b/example/48_pool3d_fwd/pool3d_fwd_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 
diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp
index 1626597ed2164311535e1e570fcdeb9c44cc3fd1..314e6a81349f91cb7ad9d46bf75e525cd7df4314 100644
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/host_utility/device_prop.hpp b/include/ck/host_utility/device_prop.hpp
index e2cbdb733272d37aa5dbc7a746e86911d6b8644f..bd02d5d88a2b0a6f882ac58bfada67649e210bc0 100644
--- a/include/ck/host_utility/device_prop.hpp
+++ b/include/ck/host_utility/device_prop.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/host_utility/hip_check_error.hpp b/include/ck/host_utility/hip_check_error.hpp
index d3dc8eaf1eb8b87207256ea4a521e23d5a49ca9c..af7bebd9d6afbf59f1852156b1708ed843d9a9f6 100644
--- a/include/ck/host_utility/hip_check_error.hpp
+++ b/include/ck/host_utility/hip_check_error.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/host_utility/io.hpp b/include/ck/host_utility/io.hpp
index ac8719592db8b48996bfbc65dc656d6d96bde545..55734bab2e469a12cd02c417cbb25d9c8a49727c 100644
--- a/include/ck/host_utility/io.hpp
+++ b/include/ck/host_utility/io.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/host_utility/kernel_launch.hpp b/include/ck/host_utility/kernel_launch.hpp
index 24f2121674c7d371a7d13c0b1c83e7f0a5e059be..58740b43517b9e75bdab59d2e8134d5c6dd97854 100644
--- a/include/ck/host_utility/kernel_launch.hpp
+++ b/include/ck/host_utility/kernel_launch.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/problem_transform/transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk.hpp b/include/ck/problem_transform/transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk.hpp
index 41267536551ea298e111f6d93b8e6f3f8f9ed475..6b118e972e67897f2c9e0cad3a8f959760f82f6c 100644
--- a/include/ck/problem_transform/transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk.hpp
+++ b/include/ck/problem_transform/transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_TRANSFORM_FORWARD_CONVOLUTION3D_INTO_GEMM_V4R4R4_NHWC_KYXC_NHWK_HPP
 #define CK_TRANSFORM_FORWARD_CONVOLUTION3D_INTO_GEMM_V4R4R4_NHWC_KYXC_NHWK_HPP
diff --git a/include/ck/stream_config.hpp b/include/ck/stream_config.hpp
index 70ca34555a01436c79ba244ac03572bb4e9520b4..505a602b240428bcd1f0f81018fef0c4716c80b7 100644
--- a/include/ck/stream_config.hpp
+++ b/include/ck/stream_config.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor/static_tensor.hpp b/include/ck/tensor/static_tensor.hpp
index fee679f91060aca623ab498362de78fd8118fee2..d719ef9760d79297600d7524167eba78cd137831 100644
--- a/include/ck/tensor/static_tensor.hpp
+++ b/include/ck/tensor/static_tensor.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_STATIC_TENSOR_HPP
 #define CK_STATIC_TENSOR_HPP
diff --git a/include/ck/tensor_description/cluster_descriptor.hpp b/include/ck/tensor_description/cluster_descriptor.hpp
index 0c9ea2ff2a0d73b793008a954eaf7293b33ade08..2dfcad8e042e548d15b4bd963fe615e39c04eff1 100644
--- a/include/ck/tensor_description/cluster_descriptor.hpp
+++ b/include/ck/tensor_description/cluster_descriptor.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_description/multi_index_transform.hpp b/include/ck/tensor_description/multi_index_transform.hpp
index 4e4d7593e9083f321f1cabe65faaee3c14259666..6854226dd4da975a8df4e5d516232f69476f515f 100644
--- a/include/ck/tensor_description/multi_index_transform.hpp
+++ b/include/ck/tensor_description/multi_index_transform.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_description/multi_index_transform_helper.hpp b/include/ck/tensor_description/multi_index_transform_helper.hpp
index 044a90370095eb53b94b5c2fba81abdbeae82c00..af0a8a34d0e48bde494dacfff5bbb17eda5eb479 100644
--- a/include/ck/tensor_description/multi_index_transform_helper.hpp
+++ b/include/ck/tensor_description/multi_index_transform_helper.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_description/tensor_adaptor.hpp b/include/ck/tensor_description/tensor_adaptor.hpp
index d42e0a6ff08f60eb1cf6e0f241f93e528bf3514b..3ffac32469a8d97d678bc6b5fe62a7cc5e0b24ab 100644
--- a/include/ck/tensor_description/tensor_adaptor.hpp
+++ b/include/ck/tensor_description/tensor_adaptor.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_description/tensor_descriptor.hpp b/include/ck/tensor_description/tensor_descriptor.hpp
index f07d5b1733d9cc96dfac9f18cbbe30510760cabc..f1df2eedd466c81e4b7938c4808075869e6309fd 100644
--- a/include/ck/tensor_description/tensor_descriptor.hpp
+++ b/include/ck/tensor_description/tensor_descriptor.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_description/tensor_descriptor_helper.hpp b/include/ck/tensor_description/tensor_descriptor_helper.hpp
index 461aae72cf7b1879c90e473adb3814e1c4875b52..f3ac041bf9b8dc03802d170fbd7bf08ce6ab9cb1 100644
--- a/include/ck/tensor_description/tensor_descriptor_helper.hpp
+++ b/include/ck/tensor_description/tensor_descriptor_helper.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_description/tensor_space_filling_curve.hpp b/include/ck/tensor_description/tensor_space_filling_curve.hpp
index 17c9100b9fd76418d562c5c175ad24062d8c3415..9a326092d2e0fd8392ec42a8c0a82b4167076373 100644
--- a/include/ck/tensor_description/tensor_space_filling_curve.hpp
+++ b/include/ck/tensor_description/tensor_space_filling_curve.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_dl_v2r3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_dl_v2r3.hpp
index 8b1b7be11ef7b3cd19f11bd400b8a96146921bf7..b3caa3214a80893ad1acbf7b5f385bcbc513670b 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_dl_v2r3.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_dl_v2r3.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v2r2.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v2r2.hpp
index 33120bd86ff01da47f7b80c593bc966785cb3711..b0143366c1de5f8e6ec52cceefd3c20e4e77cbb7 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v2r2.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v2r2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_BLOCKWISE_GEMM_DLOPS_V2R2_HPP
 #define CK_BLOCKWISE_GEMM_DLOPS_V2R2_HPP
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v3.hpp
index f45655721fe4de11b50b91e0dc5d22790ff73bea..0d092da5168c5d5629eecc2aac7cbdd53e210277 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v3.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v3.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_BLOCKWISE_GEMM_DLOPS_V3_HPP
 #define CK_BLOCKWISE_GEMM_DLOPS_V3_HPP
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp
index d75f37d7b3915408b05c1bc77b478ab3e41dc8b0..5ec964bd3fdbda9319d1c6eab0ccb207775a6137 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
index 5328dfde9bc09039af03892467d94545002c56fd..d5a64d7aa6fe6c96f384d3f00af642231b6b2c53 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops_skip_b_lds.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops_skip_b_lds.hpp
index aa814ab00939d0c36866af01b6cf2057dcbd5121..8ae1ba3f34c10d8359b470ed1172f1b70a7fa8b5 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops_skip_b_lds.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops_skip_b_lds.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_softmax.hpp b/include/ck/tensor_operation/gpu/block/blockwise_softmax.hpp
index 7e62a822a8f042d238905295b7f88c1d9bdd88f8..82bcff694757e2a4e1c0c8e9fee02c6bba51ca3e 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_softmax.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_softmax.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp b/include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp
index 03e4d42d3a1f9eb1b180c95368b905e619e67110..d8da134a3415a6976b27d0a4fdd7f13798d0245b 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_welford.hpp b/include/ck/tensor_operation/gpu/block/blockwise_welford.hpp
index 316508651e4bd4dad2e47edf020a4145176fa753..a3813ea248608252bbbfe49e52c96e66534bf290 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_welford.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_welford.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp b/include/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp
index 2163ad32383d4f940ab4cf3b041ab2d5c61456cc..6c13513cfb01a2be9b6a8acb161a843ab4c09b7d 100644
--- a/include/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp
+++ b/include/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp
index 04ad75bd7de4a9657c2aaaf21b40205c3ff9c7d8..c8690e5f68765622222ac3a8cff7c59e6fc06e91 100644
--- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp
+++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp
index 5c47a49b38b6a30eebc9190cc338d4dbc0bc8524..905a59f56e3b42bfb4b2bdf8652a3e81b2872076 100644
--- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp
+++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r2.hpp b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r2.hpp
index aa33fc083f15cfd8904c8fa086a866dbc7817e7c..17110c8358dbaf7e602e7f9822968e08f80e14ef 100644
--- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r2.hpp
+++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r3.hpp b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r3.hpp
index eb5f589a4ada976c9be4a5001fb6fc288c7e9a43..9a5317dd126a818bc88043408427f0f105cb7f6e 100644
--- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r3.hpp
+++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r3.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7.hpp b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7.hpp
index 3bd7806389b59542b191e61f5369aebf7300fc6a..993d90e356def70ba54e357889277188b4f21971 100644
--- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7.hpp
+++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp b/include/ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp
index a4a29f5d5edc3b840fc7489d5f706231990cad70..f3263c7216a61528bed5408d0a950f8ae40983e1 100644
--- a/include/ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp
+++ b/include/ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp b/include/ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp
index 20b2a152b9d276b0bd0f6e1aa5798b4817b013a9..01bb806789c3d9e50f706d7c88982b4a6624ff57 100644
--- a/include/ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp
+++ b/include/ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp b/include/ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp
index 953ff1e06ed07b24968c7f5c0842161ac66643ed..adfa1689c66509c8c194985365356740d4c90473 100644
--- a/include/ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp
+++ b/include/ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_base.hpp b/include/ck/tensor_operation/gpu/device/device_base.hpp
index 5946daf21ec169612c3207282007e05671882f09..198169011107fb0f236d4657c399ff0534ce2c98 100644
--- a/include/ck/tensor_operation/gpu/device/device_base.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_base.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_batched_contraction_multiple_d.hpp b/include/ck/tensor_operation/gpu/device/device_batched_contraction_multiple_d.hpp
index 9fcd893c7a8c38d31f480b13eb8e7bfc97ec7fef..ee7af0117d17acc561e5943ac59611e188b6ba4c 100644
--- a/include/ck/tensor_operation/gpu/device/device_batched_contraction_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_contraction_multiple_d.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm.hpp
index e755913280f73e325daf1352324bab8d3df8a3a9..6cc2c7bb2f6c176f2d84fdda4be2140db5564360 100644
--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_gemm.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm_gemm.hpp
index af681127f30404f182475039add4443e46213398..91b4b6b91b6b6be5d08d88e5ca68c9b62fb02aea 100644
--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp
index 116e62c00907e0f72031552f46b8a3ea975a18f5..f18dc3290600e63edb869d93dcb1a903206e32ab 100644
--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_multiple_d_gemm_multiple_d.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm_multiple_d_gemm_multiple_d.hpp
index eacc5976d3ea3885a8294f1ee74a249f0a3d52a7..8234e29486b1af8ae0dd43e58d4be9b63b5ed34f 100644
--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_multiple_d_gemm_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_multiple_d_gemm_multiple_d.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp
index c1f85e575ce4b2e0d70c05010bfca6e9ed9b84a8..09259224e75cc67e092eeeaab1af793b050cc1f0 100644
--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp
index bde71806daa0543c41813f2afb0dd34f16bfb76e..be8105c96726dfb3d8931b376a668c3296040706 100644
--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_batchnorm_backward.hpp b/include/ck/tensor_operation/gpu/device/device_batchnorm_backward.hpp
index d39f3b7cbcfc9ccaccca5f755be747cc4896dc1b..2c0da692570b455da42540bb72793b9e629cb823 100644
--- a/include/ck/tensor_operation/gpu/device/device_batchnorm_backward.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batchnorm_backward.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp b/include/ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp
index aa93dd9c19d77fbee7ccdafd1e41e4c4f64735cd..e3962e177ee824981c35fbe2ba04664e4124315b 100644
--- a/include/ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp b/include/ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp
index 8a00fd9db3327a7c536b1ada0ca090bfa847a256..69103b6f44297ed65ed4233e3d458ac6927bcd16 100644
--- a/include/ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_cgemm.hpp b/include/ck/tensor_operation/gpu/device/device_cgemm.hpp
index aedae53800b03cb78e6af5b15c8c6dfcd9792eea..8484212118c796f68d1bf066eab7b8ddbb231797 100644
--- a/include/ck/tensor_operation/gpu/device/device_cgemm.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_cgemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 #include "device_base.hpp"
diff --git a/include/ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp b/include/ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp
index dbc525c099bca293d106ac6d3c37ac72b2d749cc..118ade8978643f10dcfafbefb552fe485d6697e4 100644
--- a/include/ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp b/include/ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp
index 82054a3c9423f819d17a003b8c22844f0a2273fe..eb1b85ec822aa96822b5a288d55a5d3a0898f1e9 100644
--- a/include/ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_conv_fwd.hpp b/include/ck/tensor_operation/gpu/device/device_conv_fwd.hpp
index 4b9881088dde07a62e92c5f77a19edc9c4b7670f..4dc11dbefd73619c39febe31a91a953b25049f05 100644
--- a/include/ck/tensor_operation/gpu/device/device_conv_fwd.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv_fwd.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation.hpp b/include/ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation.hpp
index 5a627deeb2221f5e271532d45bc8a544c8cceeba..7d3845666cf2289f34b186a5aaa8ee616812f30a 100644
--- a/include/ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation_add.hpp b/include/ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation_add.hpp
index cc139303c929ae999d182ad09e9a4ef33f5209e1..3a49ac632e7de745e78ad5670dc57f425ab9eb33 100644
--- a/include/ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation_add.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_elementwise.hpp b/include/ck/tensor_operation/gpu/device/device_elementwise.hpp
index f9f913a7c1f1a6cdfdb41f075f2609e8e2407d07..db0e4bd83f46ad001492876ff8a712895a873c22 100644
--- a/include/ck/tensor_operation/gpu/device/device_elementwise.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_elementwise.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp b/include/ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp
index 9491a92247c71e3938d0f7d81cc0d82741b693de..c56a947ec9e11a0a5dce2cd39ce86d8e6a3efb8c 100644
--- a/include/ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm.hpp b/include/ck/tensor_operation/gpu/device/device_gemm.hpp
index c0af6f80faf606a22678e516be994a75c1d56eca..adf909821dbbdde12526513a7b6218e8ebbd219d 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_bias_e_permute.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_bias_e_permute.hpp
index 4c2161eaed5b9c7d7913686f44c360c044ca3b07..a7f42c3b35e237cb4ad9edcbcd137f453af59051 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_bias_e_permute.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_bias_e_permute.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp
index 9113bb7b7454443cce4871bf263433d4f907bfba..a44356dc2405bfe6ed448a8193d55e86bd29415c 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_layernorm.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_layernorm.hpp
index a67a09b87416bf11cc285ab1b1d9dc684b17103f..0258858fe50e2d433e5ddfe007e8d06227b47c0c 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_layernorm.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_layernorm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r.hpp
index f4881e32f620be9c31649085813a257e9e84a598..539e83f7cb8a5dfe99b2ecb3da491f9661330265 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_reduce.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_reduce.hpp
index fcc088ca43d1b8c6224c0015e3eb7434038af4b2..eaa7671c6424b4ecf5ad87c4e790a9c478645b9b 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_reduce.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_reduce.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_splitk.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_splitk.hpp
index c701bff57f8eb7db155e9abdfd3ab7210e7eeffd..6407aa7e09b81a4dc09dcb1bdca5701aa86d9e14 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_splitk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_splitk.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_waveletmodel_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_waveletmodel_cshuffle.hpp
index af38f142549c6475a7fa983585f0a845286aa16e..d00e19b440a68682a16ff111e0469dc255057b1a 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_waveletmodel_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_waveletmodel_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_contraction_multiple_d.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_contraction_multiple_d.hpp
index 173c613a325d8c594298ff6751060fecf2f8d453..ba81948440acd892a9877a7615f95642628b6347 100644
--- a/include/ck/tensor_operation/gpu/device/device_grouped_contraction_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_contraction_multiple_d.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp
index 3350aec8d3174010cf0d926c3a91802ce3b8e7e7..7e4bca2bd66a55c70b6a9365e2f1debcc22bb119 100644
--- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight.hpp
index 1258aed71c502c659f065d08e8b1a84922c26c83..de54f9be20bd05d46bce8f49e007b253f02e78cf 100644
--- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd.hpp
index 644c7ee9a9107718dc609836e6cd8abe7f2dad21..025c43e75cc204b5bb8b3bdc1083d30aaf760bbc 100644
--- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
index 6180333350ad414d5bcdaac78e7892546757d605..f26974ccb9f68990e3ea72561828da6812707230 100644
--- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp
index 80c864c8366af105421d3aadf46f620f9cde17e9..9080936589531fb7c8afbcc84dabcace97bfc812 100644
--- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp
index 4b1106c122cbbef1629fff0ffc5ea23ddf7fec3f..1e03405536a20a9f050a7943243f1c406594b836 100644
--- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp
@@ -1,4 +1,8 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
 #pragma once
+
 #include <iostream>
 #include <vector>
 
diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute.hpp
index b066a4458518b378313225e24817253922962d29..fae65097407cdb26df206308442874c0fc326fe6 100644
--- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
index 0b1db284652f4c542c24bf95436ec92323a66dbc..30e29cc8e2b76d44096722d79c932a6c2b796851 100644
--- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -611,10 +611,12 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
                 some_has_main_k_block_loop |= y;
             }
 
-            hipGetErrorString(hipMemcpy(arg.p_workspace_,
-                                        arg.group_kernel_args_.data(),
-                                        arg.group_kernel_args_.size() * sizeof(GroupKernelArg),
-                                        hipMemcpyHostToDevice));
+            hipGetErrorString(
+                hipMemcpyWithStream(arg.p_workspace_,
+                                    arg.group_kernel_args_.data(),
+                                    arg.group_kernel_args_.size() * sizeof(GroupKernelArg),
+                                    hipMemcpyHostToDevice,
+                                    stream_config.stream_id_));
 
             float ave_time = 0;
 
diff --git a/include/ck/tensor_operation/gpu/device/device_multiple_reduce.hpp b/include/ck/tensor_operation/gpu/device/device_multiple_reduce.hpp
index ee4b53e2fcc43f44aba29c3409963a1b94834cbc..f68022ca04de757fee1240ced4b504601b183d91 100644
--- a/include/ck/tensor_operation/gpu/device/device_multiple_reduce.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_multiple_reduce.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_normalization.hpp b/include/ck/tensor_operation/gpu/device/device_normalization.hpp
index 03601ce8312ccea06d60236779aa78a9ea306e01..1f178f9fcb65ffdd7ab09146d94131c1d5c993f2 100644
--- a/include/ck/tensor_operation/gpu/device/device_normalization.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_normalization.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_permute.hpp b/include/ck/tensor_operation/gpu/device/device_permute.hpp
index 9daa2be37338c8309bd2131f263d324fe15b7ab2..c994cf02c6aab5b85fcd2fcc29f2716de28b861d 100644
--- a/include/ck/tensor_operation/gpu/device/device_permute.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_permute.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_pool_fwd.hpp b/include/ck/tensor_operation/gpu/device/device_pool_fwd.hpp
index e801e98a25e90b98de81f4b032df4c5140d86c57..8b227fdfbe62798f32c98561f5235ef0c5263de4 100644
--- a/include/ck/tensor_operation/gpu/device/device_pool_fwd.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_pool_fwd.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_reduce.hpp b/include/ck/tensor_operation/gpu/device/device_reduce.hpp
index c9209f2d7d681b44c8a27dd06626eb8a4b998a39..c2721b18455c6a46e5973fad32767ae43247f59b 100644
--- a/include/ck/tensor_operation/gpu/device/device_reduce.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_reduce.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_softmax.hpp b/include/ck/tensor_operation/gpu/device/device_softmax.hpp
index 94f788e5177cd78d0ea917b0869c576ba8fe7bfb..a96ba89e24943739c8177fb3439d812a3f70a0d2 100644
--- a/include/ck/tensor_operation/gpu/device/device_softmax.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_softmax.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d.hpp b/include/ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d.hpp
index f59e6093e2ae024ccbf8827082c755eba8cee88f..eeccd977ccbf4440a349845826d45d5a3274ad37 100644
--- a/include/ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d_xdl_cshuffle.hpp
index 70990e795cc7ea594a94aee24737fc3dfc6ba664..95517b107452101a6229d7071689cb0cb9973e46 100644
--- a/include/ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/gemm_specialization.hpp b/include/ck/tensor_operation/gpu/device/gemm_specialization.hpp
index fc913e9ba03611dfbabea020bd4054c0b953726d..0bb45b18c3e19b2ec5f9347c1e811d8734ee45a9 100644
--- a/include/ck/tensor_operation/gpu/device/gemm_specialization.hpp
+++ b/include/ck/tensor_operation/gpu/device/gemm_specialization.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp
index 493822aeb27215406a8f7fad56021aca8e4246a7..4d599e8017991b4f1b1905de9acd23a219ac67f9 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp
index 9bf8f5ccd9ccecf14e1e0fdd005c331124f32547..bc86e78b6268a50b7d2f438f17a39f0913b7b475 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp
index 20e9920d935c12f54dd7ea89af3279b829892fe7..09220813bf0bc55b4d7487aeeba97388a90de5cf 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp
index 0df34609410a2d25f8f5eb2dfe7dda9831297512..2d91c620c6f2fd565c9576f36d750f12653c05e3 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp
index 196dc86da15411e95a17ea05eb484f716561c383..e39d8f0696074661d57e3594eb0a4c90e92f4e85 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp
index ef9b90ba7cde65bf17336325be3c76e33c758e68..eff503f65029ce3eb7ef228d0dd5f909a22470d9 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
index 0c6c0ef7ad2ded496c0f03618a64dac8a2783b81..e54c013cf9345abe376a0698e3622c73f1849042 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp
index 84edde63eceb141198ed249879db48131ff14d5c..9310d0752aa2f5e4b2599d735342ffdeb9845067 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp
index d35f194171e710b490a70906ed2cf2079ab7b77a..87c58f371af671a2c1440c3af2d27961faa1357d 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -45,75 +45,46 @@ namespace device {
  * realize BatchedGemm and GroupedGemm (and the corresponding GEMM fusion).
  *
  */
-template <typename GridwiseGemm,
-          typename FloatAB,
-          typename FloatC,
-          typename AGridDesc_K0_M_K1,
-          typename BGridDesc_K0_N_K1,
-          typename CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2,
-          typename AElementwiseOperation,
-          typename BElementwiseOperation,
-          typename CElementwiseOperation,
-          typename ComputePtrOffsetOfBatch,
-          typename Block2CTileMap,
-          bool HasMainKBlockLoop>
+template <typename DeviceOp, typename GridwiseGemm, bool HasMainKBlockLoop>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
     __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
-        kernel_batched_gemm_xdlops_v2r3(
-            const FloatAB* __restrict__ p_a_grid,
-            const FloatAB* __restrict__ p_b_grid,
-            FloatC* __restrict__ p_c_grid,
-            const index_t batch_count,
-            const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1,
-            const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1,
-            const CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2,
-            const AElementwiseOperation a_element_op,
-            const BElementwiseOperation b_element_op,
-            const CElementwiseOperation c_element_op,
-            const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
-            const Block2CTileMap block_2_ctile_map)
+        kernel_batched_gemm_xdlops_v2r3(const typename DeviceOp::Argument karg)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
     defined(__gfx940__))
     const index_t num_blocks_per_batch =
-        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
+        __builtin_amdgcn_readfirstlane(get_grid_size() / karg.Batch);
     const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
 
     const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
+        static_cast<long_index_t>(karg.compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
     const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
+        static_cast<long_index_t>(karg.compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
     const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane(
-        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetCPtrOffset(g_idx)));
+        static_cast<long_index_t>(karg.compute_ptr_offset_of_batch.GetCPtrOffset(g_idx)));
 
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset,
-                                                  p_b_grid + b_batch_offset,
-                                                  p_c_grid + c_batch_offset,
+    const auto a_grid_desc_k0_m_k1 =
+        amd_wave_read_first_lane(GridwiseGemm::MakeAGridDescriptor_K0_M_K1(
+            karg.M, karg.MPadded, karg.K, karg.K0, karg.StrideA));
+    const auto b_grid_desc_k0_n_k1 =
+        amd_wave_read_first_lane(GridwiseGemm::MakeBGridDescriptor_K0_N_K1(
+            karg.K, karg.N, karg.NPadded, karg.K0, karg.StrideB));
+    const auto c_grid_desc_m_n = amd_wave_read_first_lane(GridwiseGemm::MakeCGridDescriptor_M_N(
+        karg.M, karg.MPadded, karg.N, karg.NPadded, karg.StrideC));
+
+    GridwiseGemm::template Run<HasMainKBlockLoop>(karg.p_a_grid + a_batch_offset,
+                                                  karg.p_b_grid + b_batch_offset,
+                                                  karg.p_c_grid + c_batch_offset,
                                                   p_shared,
                                                   a_grid_desc_k0_m_k1,
                                                   b_grid_desc_k0_n_k1,
-                                                  c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2,
-                                                  a_element_op,
-                                                  b_element_op,
-                                                  c_element_op,
-                                                  block_2_ctile_map);
+                                                  c_grid_desc_m_n);
 #else
-    ignore = p_a_grid;
-    ignore = p_b_grid;
-    ignore = p_c_grid;
-    ignore = batch_count;
-    ignore = a_grid_desc_k0_m_k1;
-    ignore = b_grid_desc_k0_n_k1;
-    ignore = c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2;
-    ignore = a_element_op;
-    ignore = b_element_op;
-    ignore = c_element_op;
-    ignore = compute_ptr_offset_of_batch;
-    ignore = block_2_ctile_map;
+    ignore = karg;
 #endif
 }
 
@@ -171,93 +142,6 @@ struct DeviceBatchedGemmXdl : public DeviceBatchedGemm<ALayout,
 
     static constexpr auto K1Number = Number<K1>{};
 
-    static auto MakeAGridDescriptor_K0_M_K1(index_t M, index_t K, index_t StrideA)
-    {
-        assert(K % K1 == 0);
-
-        const index_t K0 = K / K1;
-
-        const auto a_grid_desc_m_k = [&]() {
-            if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1));
-            }
-            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, ALayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA));
-            }
-        }();
-
-        const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock;
-
-        const auto a_grid_desc_k0_mp_k1 =
-            transform_tensor_descriptor(a_grid_desc_m_k,
-                                        make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
-                                                   make_right_pad_transform(M, PadM)),
-                                        make_tuple(Sequence<1>{}, Sequence<0>{}),
-                                        make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-        return a_grid_desc_k0_mp_k1;
-    }
-
-    static auto MakeBGridDescriptor_K0_N_K1(index_t K, index_t N, index_t StrideB)
-    {
-        assert(K % K1 == 0);
-
-        const index_t K0 = K / K1;
-
-        const auto b_grid_desc_k_n = [&]() {
-            if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(StrideB, I1));
-            }
-            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(I1, StrideB));
-            }
-        }();
-
-        const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock;
-
-        const auto b_grid_desc_k0_np_k1 =
-            transform_tensor_descriptor(b_grid_desc_k_n,
-                                        make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
-                                                   make_right_pad_transform(N, PadN)),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                        make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-        return b_grid_desc_k0_np_k1;
-    }
-
-    static auto MakeCGridDescriptor_M_N(index_t M, index_t N, index_t StrideC)
-    {
-        const auto c_grid_desc_m_n = [&]() {
-            if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1));
-            }
-            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, CLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideC));
-            }
-        }();
-
-        const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock;
-        const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock;
-
-        const auto c_grid_desc_mp_np = transform_tensor_descriptor(
-            c_grid_desc_m_n,
-            make_tuple(make_right_pad_transform(M, PadM), make_right_pad_transform(N, PadN)),
-            make_tuple(Sequence<0>{}, Sequence<1>{}),
-            make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-        return c_grid_desc_mp_np;
-    }
-
-    using AGridDesc_K0_M_K1 = decltype(MakeAGridDescriptor_K0_M_K1(1, 1, 1));
-    using BGridDesc_K0_N_K1 = decltype(MakeBGridDescriptor_K0_N_K1(1, 1, 1));
-    using CGridDesc_M_N     = decltype(MakeCGridDescriptor_M_N(1, 1, 1));
-
     struct ComputePtrOffsetOfStridedBatch
     {
         ComputePtrOffsetOfStridedBatch(index_t BatchStrideA,
@@ -289,121 +173,82 @@ struct DeviceBatchedGemmXdl : public DeviceBatchedGemm<ALayout,
     };
 
     // GridwiseGemm
-    using GridwiseGemm =
-        GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<BlockSize,
-                                                ADataType, // TODO: distinguish A/B datatype
-                                                AccDataType,
-                                                CDataType,
-                                                InMemoryDataOperationEnum::Set,
-                                                AGridDesc_K0_M_K1,
-                                                BGridDesc_K0_N_K1,
-                                                CGridDesc_M_N,
-                                                AElementwiseOperation,
-                                                BElementwiseOperation,
-                                                CElementwiseOperation,
-                                                MPerBlock,
-                                                NPerBlock,
-                                                K0PerBlock,
-                                                MPerXDL,
-                                                NPerXDL,
-                                                K1,
-                                                MXdlPerWave,
-                                                NXdlPerWave,
-                                                ABlockTransferThreadClusterLengths_K0_M_K1,
-                                                ABlockTransferThreadClusterArrangeOrder,
-                                                ABlockTransferSrcAccessOrder,
-                                                ABlockTransferSrcVectorDim,
-                                                ABlockTransferSrcScalarPerVector,
-                                                ABlockTransferDstScalarPerVector_K1,
-                                                false, // AThreadTransferSrcResetCoordinateAfterRun,
-                                                ABlockLdsAddExtraM,
-                                                BBlockTransferThreadClusterLengths_K0_N_K1,
-                                                BBlockTransferThreadClusterArrangeOrder,
-                                                BBlockTransferSrcAccessOrder,
-                                                BBlockTransferSrcVectorDim,
-                                                BBlockTransferSrcScalarPerVector,
-                                                BBlockTransferDstScalarPerVector_K1,
-                                                false, // BThreadTransferSrcResetCoordinateAfterRun,
-                                                BBlockLdsAddExtraN,
-                                                Sequence<2, 3, 0, 1, 7, 5, 4, 6>,
-                                                CThreadTransferSrcDstVectorDim,
-                                                CThreadTransferDstScalarPerVector,
-                                                NumGemmKPrefetchStage,
-                                                LoopSched,
-                                                PipelineVer>;
-
-    using CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 =
-        decltype(GridwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(CGridDesc_M_N{}));
-    using Block2CTileMap = typename GridwiseGemm::DefaultBlock2CTileMap;
+    using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3_ext<
+        BlockSize,
+        ADataType, // TODO: distinguish A/B datatype
+        AccDataType,
+        CDataType,
+        InMemoryDataOperationEnum::Set,
+        ALayout,
+        BLayout,
+        CLayout,
+        AElementwiseOperation,
+        BElementwiseOperation,
+        CElementwiseOperation,
+        GemmSpecialization::MNPadding,
+        MPerBlock,
+        NPerBlock,
+        K0PerBlock,
+        MPerXDL,
+        NPerXDL,
+        K1,
+        MXdlPerWave,
+        NXdlPerWave,
+        ABlockTransferThreadClusterLengths_K0_M_K1,
+        ABlockTransferThreadClusterArrangeOrder,
+        ABlockTransferSrcAccessOrder,
+        ABlockTransferSrcVectorDim,
+        ABlockTransferSrcScalarPerVector,
+        ABlockTransferDstScalarPerVector_K1,
+        false, // AThreadTransferSrcResetCoordinateAfterRun,
+        ABlockLdsAddExtraM,
+        BBlockTransferThreadClusterLengths_K0_N_K1,
+        BBlockTransferThreadClusterArrangeOrder,
+        BBlockTransferSrcAccessOrder,
+        BBlockTransferSrcVectorDim,
+        BBlockTransferSrcScalarPerVector,
+        BBlockTransferDstScalarPerVector_K1,
+        false, // BThreadTransferSrcResetCoordinateAfterRun,
+        BBlockLdsAddExtraN,
+        Sequence<2, 3, 0, 1, 7, 5, 4, 6>,
+        CThreadTransferSrcDstVectorDim,
+        CThreadTransferDstScalarPerVector,
+        NumGemmKPrefetchStage,
+        LoopSched,
+        PipelineVer>;
+
+    using Problem = typename GridwiseGemm::Problem;
 
     // Argument
-    struct Argument : public BaseArgument
+    struct Argument : public Problem, public BaseArgument
     {
-        Argument(const ADataType* p_a_grid,
-                 const BDataType* p_b_grid,
-                 CDataType* p_c_grid,
-                 index_t M,
-                 index_t N,
-                 index_t K,
-                 index_t StrideA,
-                 index_t StrideB,
-                 index_t StrideC,
+        Argument(const ADataType* p_a_grid_,
+                 const BDataType* p_b_grid_,
+                 CDataType* p_c_grid_,
+                 index_t M_,
+                 index_t N_,
+                 index_t K_,
+                 index_t StrideA_,
+                 index_t StrideB_,
+                 index_t StrideC_,
                  index_t BatchStrideA,
                  index_t BatchStrideB,
                  index_t BatchStrideC,
-                 index_t Batch,
-                 index_t M01,
-                 index_t N01,
-                 AElementwiseOperation a_element_op,
-                 BElementwiseOperation b_element_op,
-                 CElementwiseOperation c_element_op)
-            : p_a_grid_{p_a_grid},
-              p_b_grid_{p_b_grid},
-              p_c_grid_{p_c_grid},
-              Batch_(Batch),
-              a_grid_desc_k0_m_k1_{
-                  DeviceBatchedGemmXdl::MakeAGridDescriptor_K0_M_K1(M, K, StrideA)},
-              b_grid_desc_k0_n_k1_{
-                  DeviceBatchedGemmXdl::MakeBGridDescriptor_K0_N_K1(K, N, StrideB)},
-              c_grid_desc_m_n_{DeviceBatchedGemmXdl::MakeCGridDescriptor_M_N(M, N, StrideC)},
-              c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_{},
-              compute_ptr_offset_of_batch_{BatchStrideA, BatchStrideB, BatchStrideC},
-              block_2_ctile_map_{
-                  GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_, M01, N01)},
-              M01_{M01},
-              N01_{N01},
-              a_element_op_{a_element_op},
-              b_element_op_{b_element_op},
-              c_element_op_{c_element_op},
-              kraw_{K}
+                 index_t Batch_)
+            : Problem{M_, N_, K_, StrideA_, StrideB_, StrideC_},
+              p_a_grid{p_a_grid_},
+              p_b_grid{p_b_grid_},
+              p_c_grid{p_c_grid_},
+              Batch(Batch_),
+              compute_ptr_offset_of_batch{BatchStrideA, BatchStrideB, BatchStrideC}
         {
-            if(GridwiseGemm::CheckValidity(a_grid_desc_k0_m_k1_,
-                                           b_grid_desc_k0_n_k1_,
-                                           c_grid_desc_m_n_,
-                                           block_2_ctile_map_))
-            {
-                c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_ =
-                    GridwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_grid_desc_m_n_);
-            }
         }
 
-        //  private:
-        const ADataType* p_a_grid_;
-        const BDataType* p_b_grid_;
-        CDataType* p_c_grid_;
-        index_t Batch_;
-        AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1_;
-        BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1_;
-        CGridDesc_M_N c_grid_desc_m_n_;
-        CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_;
-        ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch_;
-        Block2CTileMap block_2_ctile_map_;
-        index_t M01_;
-        index_t N01_;
-        AElementwiseOperation a_element_op_;
-        BElementwiseOperation b_element_op_;
-        CElementwiseOperation c_element_op_;
-        index_t kraw_;
+        const ADataType* p_a_grid;
+        const BDataType* p_b_grid;
+        CDataType* p_c_grid;
+        index_t Batch;
+        ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch;
     };
 
     // Invoker
@@ -411,107 +256,39 @@ struct DeviceBatchedGemmXdl : public DeviceBatchedGemm<ALayout,
     {
         using Argument = DeviceBatchedGemmXdl::Argument;
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        float Run(const Argument& karg, const StreamConfig& stream_config = StreamConfig{})
         {
-#if DEBUG_LOG
+            if(stream_config.log_level_ > 0)
             {
-                std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0)
-                          << ", " << arg.a_grid_desc_k0_m_k1_.GetLength(I1) << ", "
-                          << arg.a_grid_desc_k0_m_k1_.GetLength(I2) << "}" << std::endl;
-
-                std::cout << "arg.b_grid_desc_k0_n_k1_{" << arg.b_grid_desc_k0_n_k1_.GetLength(I0)
-                          << ", " << arg.b_grid_desc_k0_n_k1_.GetLength(I1) << ", "
-                          << arg.b_grid_desc_k0_n_k1_.GetLength(I2) << "}" << std::endl;
-
-                std::cout << "arg.c_grid_desc_m_n_{" << arg.c_grid_desc_m_n_.GetLength(I0) << ", "
-                          << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
+                karg.Print();
             }
-#endif
 
-            if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_,
-                                            arg.b_grid_desc_k0_n_k1_,
-                                            arg.c_grid_desc_m_n_,
-                                            arg.block_2_ctile_map_))
+            if(!GridwiseGemm::CheckValidity(karg))
             {
                 throw std::runtime_error(
-                    "wrong! GridwiseBatchedGemm_km_kn_m0m1n0n1_xdlops_v2r3 has invalid setting");
+                    "wrong! GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3_ext has invalid setting");
             }
 
-            const index_t grid_size =
-                arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_) * arg.Batch_;
-
-            const auto K =
-                arg.a_grid_desc_k0_m_k1_.GetLength(I0) * arg.a_grid_desc_k0_m_k1_.GetLength(I2);
+            auto [gdx, gdy, gdz] = GridwiseGemm::CalculateGridSize(karg.M, karg.N);
+            gdx *= karg.Batch;
 
             float ave_time = 0;
 
-            if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
+            if(GridwiseGemm::CalculateHasMainKBlockLoop(karg.K))
             {
-                const auto kernel = kernel_batched_gemm_xdlops_v2r3<
-                    GridwiseGemm,
-                    ADataType, // TODO: distiguish A/B datatype
-                    CDataType,
-                    remove_reference_t<DeviceBatchedGemmXdl::AGridDesc_K0_M_K1>,
-                    remove_reference_t<DeviceBatchedGemmXdl::BGridDesc_K0_N_K1>,
-                    remove_reference_t<typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
-                    AElementwiseOperation,
-                    BElementwiseOperation,
-                    CElementwiseOperation,
-                    ComputePtrOffsetOfStridedBatch,
-                    remove_reference_t<Block2CTileMap>,
-                    true>;
-
-                ave_time = launch_and_time_kernel(stream_config,
-                                                  kernel,
-                                                  dim3(grid_size),
-                                                  dim3(BlockSize),
-                                                  0,
-                                                  arg.p_a_grid_,
-                                                  arg.p_b_grid_,
-                                                  arg.p_c_grid_,
-                                                  arg.Batch_,
-                                                  arg.a_grid_desc_k0_m_k1_,
-                                                  arg.b_grid_desc_k0_n_k1_,
-                                                  arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_,
-                                                  arg.a_element_op_,
-                                                  arg.b_element_op_,
-                                                  arg.c_element_op_,
-                                                  arg.compute_ptr_offset_of_batch_,
-                                                  arg.block_2_ctile_map_);
+                const auto kernel =
+                    kernel_batched_gemm_xdlops_v2r3<DeviceBatchedGemmXdl, GridwiseGemm, true>;
+
+                ave_time = launch_and_time_kernel(
+                    stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, karg);
             }
             else
             {
-                const auto kernel = kernel_batched_gemm_xdlops_v2r3<
-                    GridwiseGemm,
-                    ADataType, // TODO: distiguish A/B datatype
-                    CDataType,
-                    remove_reference_t<DeviceBatchedGemmXdl::AGridDesc_K0_M_K1>,
-                    remove_reference_t<DeviceBatchedGemmXdl::BGridDesc_K0_N_K1>,
-                    remove_reference_t<typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
-                    AElementwiseOperation,
-                    BElementwiseOperation,
-                    CElementwiseOperation,
-                    ComputePtrOffsetOfStridedBatch,
-                    remove_reference_t<Block2CTileMap>,
-                    false>;
-
-                ave_time = launch_and_time_kernel(stream_config,
-                                                  kernel,
-                                                  dim3(grid_size),
-                                                  dim3(BlockSize),
-                                                  0,
-                                                  arg.p_a_grid_,
-                                                  arg.p_b_grid_,
-                                                  arg.p_c_grid_,
-                                                  arg.Batch_,
-                                                  arg.a_grid_desc_k0_m_k1_,
-                                                  arg.b_grid_desc_k0_n_k1_,
-                                                  arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_,
-                                                  arg.a_element_op_,
-                                                  arg.b_element_op_,
-                                                  arg.c_element_op_,
-                                                  arg.compute_ptr_offset_of_batch_,
-                                                  arg.block_2_ctile_map_);
+                const auto kernel =
+                    kernel_batched_gemm_xdlops_v2r3<DeviceBatchedGemmXdl, GridwiseGemm, false>;
+
+                ave_time = launch_and_time_kernel(
+                    stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, karg);
             }
 
             return ave_time;
@@ -531,17 +308,14 @@ struct DeviceBatchedGemmXdl : public DeviceBatchedGemm<ALayout,
         return true;
     }
 
-    static bool IsSupportedArgument(const Argument& arg)
+    static bool IsSupportedArgument(const Problem& problem)
     {
-        if(arg.kraw_ % K1 != 0)
+        if(problem.K % K1 != 0)
         {
             return false;
         }
 
-        return GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_,
-                                           arg.b_grid_desc_k0_n_k1_,
-                                           arg.c_grid_desc_m_n_,
-                                           arg.block_2_ctile_map_);
+        return GridwiseGemm::CheckValidity(problem);
     }
 
     // polymorphic
@@ -562,10 +336,7 @@ struct DeviceBatchedGemmXdl : public DeviceBatchedGemm<ALayout,
                              index_t BatchStrideA,
                              index_t BatchStrideB,
                              index_t BatchStrideC,
-                             index_t Batch,
-                             AElementwiseOperation a_element_op,
-                             BElementwiseOperation b_element_op,
-                             CElementwiseOperation c_element_op)
+                             index_t Batch)
     {
         return Argument{p_a,
                         p_b,
@@ -579,12 +350,7 @@ struct DeviceBatchedGemmXdl : public DeviceBatchedGemm<ALayout,
                         BatchStrideA,
                         BatchStrideB,
                         BatchStrideC,
-                        Batch,
-                        1,
-                        1,
-                        a_element_op,
-                        b_element_op,
-                        c_element_op};
+                        Batch};
     }
 
     static auto MakeInvoker() { return Invoker{}; }
@@ -603,9 +369,9 @@ struct DeviceBatchedGemmXdl : public DeviceBatchedGemm<ALayout,
                                                       index_t BatchStrideB,
                                                       index_t BatchStrideC,
                                                       index_t Batch,
-                                                      AElementwiseOperation a_element_op,
-                                                      BElementwiseOperation b_element_op,
-                                                      CElementwiseOperation c_element_op) override
+                                                      AElementwiseOperation,
+                                                      BElementwiseOperation,
+                                                      CElementwiseOperation) override
     {
         return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
                                           static_cast<const BDataType*>(p_b),
@@ -619,12 +385,7 @@ struct DeviceBatchedGemmXdl : public DeviceBatchedGemm<ALayout,
                                           BatchStrideA,
                                           BatchStrideB,
                                           BatchStrideC,
-                                          Batch,
-                                          1,
-                                          1,
-                                          a_element_op,
-                                          b_element_op,
-                                          c_element_op);
+                                          Batch);
     }
 
     // polymorphic
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_backward_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_backward_impl.hpp
index ab16a757f699bb4e4ba781873bbf87538633d9c1..f46237e005642d433c29f5ab0f027a224043dbef 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_backward_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_backward_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl.hpp
index 5a16ff765bbc5c0bc1b68d80262c88a0c8f84416..ce0320b2d7898f4e20dddc16bdf3984121e1d2b0 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp
index 29978458bb202297868af555b3ffecb617e48fb7..9ee6c6f46bad3940515fef1c0cd391667672952d 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -118,277 +118,11 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
         return PadDescriptor_M_1d(desc_m, gridSize, blockSize);
     }
 
-    static auto MakeAGridDescriptor_AK0_M_AK1(index_t MRaw, index_t KRaw, index_t StrideA)
-    {
-        const auto a_grid_desc_mraw_kraw = [&]() {
-            if constexpr(is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
-            {
-                return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
-                                                    make_tuple(StrideA, I1));
-            }
-            else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
-            {
-                return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
-                                                    make_tuple(I1, StrideA));
-            }
-        }();
-
-        const auto M = math::integer_divide_ceil(MRaw, MPerBlock) * MPerBlock;
-        const auto K = math::integer_divide_ceil(KRaw, KPerBlock) * KPerBlock;
-
-        const auto MPad = M - MRaw;
-        const auto KPad = K - KRaw;
-
-        if constexpr(GemmSpec == GemmSpecialization::MKPadding ||
-                     GemmSpec == GemmSpecialization::MNKPadding)
-        {
-            // pad both M and K
-            assert(K % AK1 == 0);
-
-            const auto AK0 = K / AK1;
-
-            const auto a_grid_desc_m_k =
-                transform_tensor_descriptor(a_grid_desc_mraw_kraw,
-                                            make_tuple(make_right_pad_transform(MRaw, MPad),
-                                                       make_right_pad_transform(KRaw, KPad)),
-                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-            const auto a_grid_desc_ak0_m_ak1 =
-                transform_tensor_descriptor(a_grid_desc_m_k,
-                                            make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
-                                                       make_pass_through_transform(M)),
-                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
-                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-            return a_grid_desc_ak0_m_ak1;
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::MPadding ||
-                          GemmSpec == GemmSpecialization::MNPadding)
-        {
-            // pad M, but not K
-            assert(KRaw % AK1 == 0);
-
-            const auto AK0 = KRaw / AK1;
-
-            const auto a_grid_desc_ak0_m_ak1 =
-                transform_tensor_descriptor(a_grid_desc_mraw_kraw,
-                                            make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
-                                                       make_right_pad_transform(MRaw, MPad)),
-                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
-                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-            return a_grid_desc_ak0_m_ak1;
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::KPadding ||
-                          GemmSpec == GemmSpecialization::NKPadding)
-        {
-            // pad K, but not M
-            assert(K % AK1 == 0);
-
-            const auto AK0 = K / AK1;
-
-            const auto a_grid_desc_m_k = transform_tensor_descriptor(
-                a_grid_desc_mraw_kraw,
-                make_tuple(make_pass_through_transform(MRaw), make_right_pad_transform(KRaw, KPad)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-            const auto a_grid_desc_ak0_m_ak1 =
-                transform_tensor_descriptor(a_grid_desc_m_k,
-                                            make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
-                                                       make_pass_through_transform(MRaw)),
-                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
-                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-            return a_grid_desc_ak0_m_ak1;
-        }
-        else
-        {
-            // not pad M or K
-            assert(KRaw % AK1 == 0);
-
-            const auto AK0 = KRaw / AK1;
-
-            const auto a_grid_desc_ak0_m_ak1 =
-                transform_tensor_descriptor(a_grid_desc_mraw_kraw,
-                                            make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
-                                                       make_pass_through_transform(MRaw)),
-                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
-                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-            return a_grid_desc_ak0_m_ak1;
-        }
-    }
-
-    static auto MakeBGridDescriptor_BK0_N_BK1(index_t KRaw, index_t NRaw, index_t StrideB)
-    {
-        const auto b_grid_desc_nraw_kraw = [&]() {
-            if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
-                                                    make_tuple(I1, StrideB));
-            }
-            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
-                                                    make_tuple(StrideB, I1));
-            }
-        }();
-
-        const auto N = math::integer_divide_ceil(NRaw, NPerBlock) * NPerBlock;
-        const auto K = math::integer_divide_ceil(KRaw, KPerBlock) * KPerBlock;
-
-        const auto NPad = N - NRaw;
-        const auto KPad = K - KRaw;
-
-        if constexpr(GemmSpec == GemmSpecialization::NKPadding ||
-                     GemmSpec == GemmSpecialization::MNKPadding)
-        {
-            // pad both N and K
-            assert(K % BK1 == 0);
-
-            const auto BK0 = K / BK1;
-
-            const auto b_grid_desc_n_k =
-                transform_tensor_descriptor(b_grid_desc_nraw_kraw,
-                                            make_tuple(make_right_pad_transform(NRaw, NPad),
-                                                       make_right_pad_transform(KRaw, KPad)),
-                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-            const auto b_grid_desc_bk0_n_bk1 =
-                transform_tensor_descriptor(b_grid_desc_n_k,
-                                            make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
-                                                       make_pass_through_transform(N)),
-                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
-                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-            return b_grid_desc_bk0_n_bk1;
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::NPadding ||
-                          GemmSpec == GemmSpecialization::MNPadding)
-        {
-            // pad N, but not K
-            assert(KRaw % BK1 == 0);
-
-            const auto BK0 = KRaw / BK1;
-
-            const auto b_grid_desc_bk0_n_bk1 =
-                transform_tensor_descriptor(b_grid_desc_nraw_kraw,
-                                            make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
-                                                       make_right_pad_transform(NRaw, NPad)),
-                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
-                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-            return b_grid_desc_bk0_n_bk1;
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::KPadding ||
-                          GemmSpec == GemmSpecialization::MKPadding)
-        {
-            // pad K, but not N
-            assert(K % BK1 == 0);
-
-            const auto BK0 = K / BK1;
-
-            const auto b_grid_desc_n_k = transform_tensor_descriptor(
-                b_grid_desc_nraw_kraw,
-                make_tuple(make_pass_through_transform(NRaw), make_right_pad_transform(KRaw, KPad)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-            const auto b_grid_desc_bk0_n_bk1 =
-                transform_tensor_descriptor(b_grid_desc_n_k,
-                                            make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
-                                                       make_pass_through_transform(NRaw)),
-                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
-                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-            return b_grid_desc_bk0_n_bk1;
-        }
-        else
-        {
-            // not pad N or K
-            assert(KRaw % BK1 == 0);
-
-            const auto BK0 = KRaw / BK1;
-
-            const auto b_grid_desc_bk0_n_bk1 =
-                transform_tensor_descriptor(b_grid_desc_nraw_kraw,
-                                            make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
-                                                       make_pass_through_transform(NRaw)),
-                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
-                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-            return b_grid_desc_bk0_n_bk1;
-        }
-    }
-
-    static auto MakeCGridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t StrideC)
-    {
-        const auto c_grid_desc_mraw_nraw = [&]() {
-            if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw),
-                                                    make_tuple(StrideC, I1));
-            }
-            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, CLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw),
-                                                    make_tuple(I1, StrideC));
-            }
-        }();
-
-        const auto M = math::integer_divide_ceil(MRaw, MPerBlock) * MPerBlock;
-        const auto N = math::integer_divide_ceil(NRaw, NPerBlock) * NPerBlock;
-
-        const auto MPad = M - MRaw;
-        const auto NPad = N - NRaw;
-
-        if constexpr(GemmSpec == GemmSpecialization::MNPadding ||
-                     GemmSpec == GemmSpecialization::MNKPadding)
-        {
-            // pad M and N
-            return transform_tensor_descriptor(c_grid_desc_mraw_nraw,
-                                               make_tuple(make_right_pad_transform(MRaw, MPad),
-                                                          make_right_pad_transform(NRaw, NPad)),
-                                               make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                               make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::MPadding ||
-                          GemmSpec == GemmSpecialization::MKPadding)
-        {
-            // pad M, but not N
-            return transform_tensor_descriptor(
-                c_grid_desc_mraw_nraw,
-                make_tuple(make_right_pad_transform(MRaw, MPad), make_pass_through_transform(NRaw)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::NPadding ||
-                          GemmSpec == GemmSpecialization::NKPadding)
-        {
-            // pad N, but not M
-            return transform_tensor_descriptor(
-                c_grid_desc_mraw_nraw,
-                make_tuple(make_pass_through_transform(MRaw), make_right_pad_transform(NRaw, NPad)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else
-        {
-            // not pad M or N
-            return c_grid_desc_mraw_nraw;
-        }
-    }
-
-    using AGridDesc_AK0_M_AK1 = decltype(MakeAGridDescriptor_AK0_M_AK1(1, 1, 1));
-    using BGridDesc_BK0_N_BK1 = decltype(MakeBGridDescriptor_BK0_N_BK1(1, 1, 1));
-    using CGridDesc_M_N       = decltype(MakeCGridDescriptor_M_N(1, 1, 1));
-    using CGridDesc_M         = decltype(MakeDescriptor_M({1, 1}, {1, 1}, 1, 1));
-
     // GridwiseGemm
     using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1<
+        ALayout,
+        BLayout,
+        CLayout,
         ADataType, // TODO: distinguish A/B datatype
         GemmAccDataType,
         CShuffleDataType,
@@ -396,10 +130,8 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
         AElementwiseOperation,
         BElementwiseOperation,
         CElementwiseOperation,
+        GemmSpec,
         InMemoryDataOperationEnum::Set,
-        AGridDesc_AK0_M_AK1,
-        BGridDesc_BK0_N_BK1,
-        CGridDesc_M_N,
         NumGemmKPrefetchStage,
         BlockSize,
         MPerBlock,
@@ -433,108 +165,82 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
         CShuffleBlockTransferScalarPerVector_NPerBlock,
         LoopSched>;
 
+    using CGridDesc_M = decltype(MakeDescriptor_M({1, 1}, {1, 1}, 1, 1));
+
     // Argument
-    struct Argument : public BaseArgument
+    struct Argument : public tensor_operation::device::BaseArgument, public GridwiseGemm::Problem
     {
-        Argument(const ADataType* p_a_grid_real,
-                 const ADataType* p_a_grid_imag,
-                 const BDataType* p_b_grid_real,
-                 const BDataType* p_b_grid_imag,
-                 CDataType* p_c_grid_real,
-                 CDataType* p_c_grid_imag,
+        using Problem = typename GridwiseGemm::Problem;
+
+        Argument(const ADataType* p_a_grid_real_,
+                 const ADataType* p_a_grid_imag_,
+                 const BDataType* p_b_grid_real_,
+                 const BDataType* p_b_grid_imag_,
+                 CDataType* p_c_grid_real_,
+                 CDataType* p_c_grid_imag_,
                  CDataType* p_workspace,
-                 index_t MRaw,
-                 index_t NRaw,
-                 index_t KRaw,
-                 index_t StrideA,
-                 index_t StrideB,
-                 index_t StrideC,
-                 AElementwiseOperation a_element_op,
-                 BElementwiseOperation b_element_op,
-                 CElementwiseOperation c_element_op)
-            : p_a_grid_real_{p_a_grid_real},
-              p_a_grid_imag_{p_a_grid_imag},
-              p_b_grid_real_{p_b_grid_real},
-              p_b_grid_imag_{p_b_grid_imag},
-              p_c_grid_real_{p_c_grid_real},
-              p_c_grid_imag_{p_c_grid_imag},
-              p_aux_grid_{p_workspace},
-              a_grid_desc_ak0_m_ak1_{DeviceOp::MakeAGridDescriptor_AK0_M_AK1(MRaw, KRaw, StrideA)},
-              b_grid_desc_bk0_n_bk1_{DeviceOp::MakeBGridDescriptor_BK0_N_BK1(KRaw, NRaw, StrideB)},
-              c_grid_desc_m_n_{DeviceOp::MakeCGridDescriptor_M_N(MRaw, NRaw, StrideC)},
-              c_grid_desc_mblock_mperblock_nblock_nperblock_{},
-              block_2_ctile_map_{GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_)},
-              a_element_op_{a_element_op},
-              b_element_op_{b_element_op},
-              c_element_op_{c_element_op}
+                 index_t M_,
+                 index_t N_,
+                 index_t K_,
+                 index_t StrideA_,
+                 index_t StrideB_,
+                 index_t StrideC_)
+            : Problem{M_, N_, K_, StrideA_, StrideB_, StrideC_},
+              p_a_grid_real{p_a_grid_real_},
+              p_a_grid_imag{p_a_grid_imag_},
+              p_b_grid_real{p_b_grid_real_},
+              p_b_grid_imag{p_b_grid_imag_},
+              p_c_grid_real{p_c_grid_real_},
+              p_c_grid_imag{p_c_grid_imag_},
+              p_aux_grid{p_workspace}
         {
-            if(GridwiseGemm::CheckValidity(a_grid_desc_ak0_m_ak1_,
-                                           b_grid_desc_bk0_n_bk1_,
-                                           c_grid_desc_m_n_,
-                                           block_2_ctile_map_))
-            {
-                c_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                    GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                        c_grid_desc_m_n_);
-            }
-
-            const index_t grid_size = block_2_ctile_map_.CalculateGridSize(c_grid_desc_m_n_);
+            const index_t grid_size = std::get<1>(GridwiseGemm::CalculateGridSize(M_, N_));
 
             if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
             {
-                c_grid_desc_m_ =
-                    DeviceOp::MakeDescriptor_M({MRaw, NRaw}, {StrideC, I1}, grid_size, BlockSize);
+                c_grid_desc_m =
+                    DeviceOp::MakeDescriptor_M({M_, N_}, {StrideC_, I1}, grid_size, BlockSize);
             }
             else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, CLayout>::value)
             {
-                c_grid_desc_m_ =
-                    DeviceOp::MakeDescriptor_M({MRaw, NRaw}, {I1, StrideC}, grid_size, BlockSize);
+                c_grid_desc_m =
+                    DeviceOp::MakeDescriptor_M({M_, N_}, {I1, StrideC_}, grid_size, BlockSize);
             }
 
-            p_aux_2_grid_ = p_workspace + c_grid_desc_m_n_.GetElementSpaceSize();
+            p_aux_2_grid = p_workspace + GetCElementSpaceSize(M_, N_, StrideC_);
         }
 
         //  private:
-        const ADataType* p_a_grid_real_;
-        const ADataType* p_a_grid_imag_;
-        const BDataType* p_b_grid_real_;
-        const BDataType* p_b_grid_imag_;
-        CDataType* p_c_grid_real_;
-        CDataType* p_c_grid_imag_;
-        CDataType* p_aux_grid_;
-        CDataType* p_aux_2_grid_;
-        AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_;
-        BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_;
-        CGridDesc_M_N c_grid_desc_m_n_;
-        CGridDesc_M c_grid_desc_m_;
-        typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-            c_grid_desc_mblock_mperblock_nblock_nperblock_;
-        typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_;
-        AElementwiseOperation a_element_op_;
-        BElementwiseOperation b_element_op_;
-        CElementwiseOperation c_element_op_;
+        const ADataType* p_a_grid_real;
+        const ADataType* p_a_grid_imag;
+        const BDataType* p_b_grid_real;
+        const BDataType* p_b_grid_imag;
+        CDataType* p_c_grid_real;
+        CDataType* p_c_grid_imag;
+        CDataType* p_aux_grid;
+        CDataType* p_aux_2_grid;
+        CGridDesc_M c_grid_desc_m;
     };
 
     // Invoker
     struct Invoker : public BaseInvoker
     {
-        using Argument = DeviceOp::Argument;
-
         float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
-            if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
-                                            arg.b_grid_desc_bk0_n_bk1_,
-                                            arg.c_grid_desc_m_n_,
-                                            arg.block_2_ctile_map_))
+            if(stream_config.log_level_ > 0)
+            {
+                arg.Print();
+            }
+
+            if(!GridwiseGemm::CheckValidity(arg))
             {
                 throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
             }
 
-            const index_t grid_size =
-                arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_);
+            index_t gdx, gdy, gdz;
+            std::tie(gdx, gdy, gdz) = GridwiseGemm::CalculateGridSize(arg.M, arg.N);
 
-            const auto K =
-                arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2);
+            const auto K = GridwiseGemm::CalculateAK0(arg.K) * AK1;
 
             float ave_time = 0;
 
@@ -578,224 +284,148 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
 
             if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
             {
-                const auto kernel = kernel_gemm_xdl_cshuffle_v1<
-                    GridwiseGemm,
-                    ADataType, // TODO: distiguish A/B datatype
-                    CDataType,
-                    AElementwiseOperation,
-                    BElementwiseOperation,
-                    CElementwiseOperation,
-                    DeviceOp::AGridDesc_AK0_M_AK1,
-                    DeviceOp::BGridDesc_BK0_N_BK1,
-                    typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
-                    typename GridwiseGemm::DefaultBlock2CTileMap,
-                    true>;
-
-                ave_time +=
-                    launch_and_time_kernel(stream_config,
-                                           kernel,
-                                           dim3(grid_size),
-                                           dim3(BlockSize),
-                                           0,
-                                           arg.p_a_grid_real_,
-                                           arg.p_b_grid_real_,
-                                           arg.p_aux_grid_,
-                                           arg.a_element_op_,
-                                           arg.b_element_op_,
-                                           arg.c_element_op_,
-                                           arg.a_grid_desc_ak0_m_ak1_,
-                                           arg.b_grid_desc_bk0_n_bk1_,
-                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                           arg.block_2_ctile_map_);
-
-                ave_time +=
-                    launch_and_time_kernel(stream_config,
-                                           kernel,
-                                           dim3(grid_size),
-                                           dim3(BlockSize),
-                                           0,
-                                           arg.p_a_grid_imag_,
-                                           arg.p_b_grid_imag_,
-                                           arg.p_aux_2_grid_,
-                                           arg.a_element_op_,
-                                           arg.b_element_op_,
-                                           arg.c_element_op_,
-                                           arg.a_grid_desc_ak0_m_ak1_,
-                                           arg.b_grid_desc_bk0_n_bk1_,
-                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                           arg.block_2_ctile_map_);
+                const auto kernel =
+                    kernel_gemm_xdl_cshuffle_v1<GridwiseGemm, ADataType, CDataType, true>;
+
+                ave_time += launch_and_time_kernel(stream_config,
+                                                   kernel,
+                                                   dim3(gdx, gdy, gdz),
+                                                   dim3(BlockSize),
+                                                   0,
+                                                   arg.p_a_grid_real,
+                                                   arg.p_b_grid_real,
+                                                   arg.p_aux_grid,
+                                                   arg);
+
+                ave_time += launch_and_time_kernel(stream_config,
+                                                   kernel,
+                                                   dim3(gdx, gdy, gdz),
+                                                   dim3(BlockSize),
+                                                   0,
+                                                   arg.p_a_grid_imag,
+                                                   arg.p_b_grid_imag,
+                                                   arg.p_aux_2_grid,
+                                                   arg);
 
                 // c_real = aux - aux_2
                 ave_time += launch_and_time_kernel(
                     stream_config,
                     subtract_kernel,
-                    dim3(grid_size),
+                    dim3(gdx, gdy, gdz),
                     dim3(BlockSize),
                     0,
-                    make_tuple(arg.c_grid_desc_m_, arg.c_grid_desc_m_),
-                    make_tuple(arg.c_grid_desc_m_),
-                    make_tuple(const_cast<const CDataType*>(arg.p_aux_grid_),
-                               const_cast<const CDataType*>(arg.p_aux_2_grid_)),
-                    make_tuple(arg.p_c_grid_real_),
+                    make_tuple(arg.c_grid_desc_m, arg.c_grid_desc_m),
+                    make_tuple(arg.c_grid_desc_m),
+                    make_tuple(const_cast<const CDataType*>(arg.p_aux_grid),
+                               const_cast<const CDataType*>(arg.p_aux_2_grid)),
+                    make_tuple(arg.p_c_grid_real),
                     Subtract{});
 
-                ave_time +=
-                    launch_and_time_kernel(stream_config,
-                                           kernel,
-                                           dim3(grid_size),
-                                           dim3(BlockSize),
-                                           0,
-                                           arg.p_a_grid_real_,
-                                           arg.p_b_grid_imag_,
-                                           arg.p_aux_grid_,
-                                           arg.a_element_op_,
-                                           arg.b_element_op_,
-                                           arg.c_element_op_,
-                                           arg.a_grid_desc_ak0_m_ak1_,
-                                           arg.b_grid_desc_bk0_n_bk1_,
-                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                           arg.block_2_ctile_map_);
-
-                ave_time +=
-                    launch_and_time_kernel(stream_config,
-                                           kernel,
-                                           dim3(grid_size),
-                                           dim3(BlockSize),
-                                           0,
-                                           arg.p_a_grid_imag_,
-                                           arg.p_b_grid_real_,
-                                           arg.p_aux_2_grid_,
-                                           arg.a_element_op_,
-                                           arg.b_element_op_,
-                                           arg.c_element_op_,
-                                           arg.a_grid_desc_ak0_m_ak1_,
-                                           arg.b_grid_desc_bk0_n_bk1_,
-                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                           arg.block_2_ctile_map_);
+                ave_time += launch_and_time_kernel(stream_config,
+                                                   kernel,
+                                                   dim3(gdx, gdy, gdz),
+                                                   dim3(BlockSize),
+                                                   0,
+                                                   arg.p_a_grid_real,
+                                                   arg.p_b_grid_imag,
+                                                   arg.p_aux_grid,
+                                                   arg);
+
+                ave_time += launch_and_time_kernel(stream_config,
+                                                   kernel,
+                                                   dim3(gdx, gdy, gdz),
+                                                   dim3(BlockSize),
+                                                   0,
+                                                   arg.p_a_grid_imag,
+                                                   arg.p_b_grid_real,
+                                                   arg.p_aux_2_grid,
+                                                   arg);
 
                 // c_imag = aux + aux_2
                 ave_time += launch_and_time_kernel(
                     stream_config,
                     add_kernel,
-                    dim3(grid_size),
+                    dim3(gdx, gdy, gdz),
                     dim3(BlockSize),
                     0,
-                    make_tuple(arg.c_grid_desc_m_, arg.c_grid_desc_m_),
-                    make_tuple(arg.c_grid_desc_m_),
-                    make_tuple(const_cast<const CDataType*>(arg.p_aux_grid_),
-                               const_cast<const CDataType*>(arg.p_aux_2_grid_)),
-                    make_tuple(arg.p_c_grid_imag_),
+                    make_tuple(arg.c_grid_desc_m, arg.c_grid_desc_m),
+                    make_tuple(arg.c_grid_desc_m),
+                    make_tuple(const_cast<const CDataType*>(arg.p_aux_grid),
+                               const_cast<const CDataType*>(arg.p_aux_2_grid)),
+                    make_tuple(arg.p_c_grid_imag),
                     Add{});
             }
             else
             {
-                const auto kernel = kernel_gemm_xdl_cshuffle_v1<
-                    GridwiseGemm,
-                    ADataType, // TODO: distiguish A/B datatype
-                    CDataType,
-                    AElementwiseOperation,
-                    BElementwiseOperation,
-                    CElementwiseOperation,
-                    DeviceOp::AGridDesc_AK0_M_AK1,
-                    DeviceOp::BGridDesc_BK0_N_BK1,
-                    typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
-                    typename GridwiseGemm::DefaultBlock2CTileMap,
-                    false>;
-
-                ave_time +=
-                    launch_and_time_kernel(stream_config,
-                                           kernel,
-                                           dim3(grid_size),
-                                           dim3(BlockSize),
-                                           0,
-                                           arg.p_a_grid_real_,
-                                           arg.p_b_grid_real_,
-                                           arg.p_aux_grid_,
-                                           arg.a_element_op_,
-                                           arg.b_element_op_,
-                                           arg.c_element_op_,
-                                           arg.a_grid_desc_ak0_m_ak1_,
-                                           arg.b_grid_desc_bk0_n_bk1_,
-                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                           arg.block_2_ctile_map_);
-
-                ave_time +=
-                    launch_and_time_kernel(stream_config,
-                                           kernel,
-                                           dim3(grid_size),
-                                           dim3(BlockSize),
-                                           0,
-                                           arg.p_a_grid_imag_,
-                                           arg.p_b_grid_imag_,
-                                           arg.p_aux_2_grid_,
-                                           arg.a_element_op_,
-                                           arg.b_element_op_,
-                                           arg.c_element_op_,
-                                           arg.a_grid_desc_ak0_m_ak1_,
-                                           arg.b_grid_desc_bk0_n_bk1_,
-                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                           arg.block_2_ctile_map_);
+                const auto kernel =
+                    kernel_gemm_xdl_cshuffle_v1<GridwiseGemm, ADataType, CDataType, false>;
+
+                ave_time += launch_and_time_kernel(stream_config,
+                                                   kernel,
+                                                   dim3(gdx, gdy, gdz),
+                                                   dim3(BlockSize),
+                                                   0,
+                                                   arg.p_a_grid_real,
+                                                   arg.p_b_grid_real,
+                                                   arg.p_aux_grid,
+                                                   arg);
+
+                ave_time += launch_and_time_kernel(stream_config,
+                                                   kernel,
+                                                   dim3(gdx, gdy, gdz),
+                                                   dim3(BlockSize),
+                                                   0,
+                                                   arg.p_a_grid_imag,
+                                                   arg.p_b_grid_imag,
+                                                   arg.p_aux_2_grid,
+                                                   arg);
 
                 // c_real = aux - aux_2
                 ave_time += launch_and_time_kernel(
                     stream_config,
                     subtract_kernel,
-                    dim3(grid_size),
+                    dim3(gdx, gdy, gdz),
                     dim3(BlockSize),
                     0,
-                    make_tuple(arg.c_grid_desc_m_, arg.c_grid_desc_m_),
-                    make_tuple(arg.c_grid_desc_m_),
-                    make_tuple(const_cast<const CDataType*>(arg.p_aux_grid_),
-                               const_cast<const CDataType*>(arg.p_aux_2_grid_)),
-                    make_tuple(arg.p_c_grid_real_),
+                    make_tuple(arg.c_grid_desc_m, arg.c_grid_desc_m),
+                    make_tuple(arg.c_grid_desc_m),
+                    make_tuple(const_cast<const CDataType*>(arg.p_aux_grid),
+                               const_cast<const CDataType*>(arg.p_aux_2_grid)),
+                    make_tuple(arg.p_c_grid_real),
                     Subtract{});
 
-                ave_time +=
-                    launch_and_time_kernel(stream_config,
-                                           kernel,
-                                           dim3(grid_size),
-                                           dim3(BlockSize),
-                                           0,
-                                           arg.p_a_grid_real_,
-                                           arg.p_b_grid_imag_,
-                                           arg.p_aux_grid_,
-                                           arg.a_element_op_,
-                                           arg.b_element_op_,
-                                           arg.c_element_op_,
-                                           arg.a_grid_desc_ak0_m_ak1_,
-                                           arg.b_grid_desc_bk0_n_bk1_,
-                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                           arg.block_2_ctile_map_);
-
-                ave_time +=
-                    launch_and_time_kernel(stream_config,
-                                           kernel,
-                                           dim3(grid_size),
-                                           dim3(BlockSize),
-                                           0,
-                                           arg.p_a_grid_imag_,
-                                           arg.p_b_grid_real_,
-                                           arg.p_aux_2_grid_,
-                                           arg.a_element_op_,
-                                           arg.b_element_op_,
-                                           arg.c_element_op_,
-                                           arg.a_grid_desc_ak0_m_ak1_,
-                                           arg.b_grid_desc_bk0_n_bk1_,
-                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                           arg.block_2_ctile_map_);
+                ave_time += launch_and_time_kernel(stream_config,
+                                                   kernel,
+                                                   dim3(gdx, gdy, gdz),
+                                                   dim3(BlockSize),
+                                                   0,
+                                                   arg.p_a_grid_real,
+                                                   arg.p_b_grid_imag,
+                                                   arg.p_aux_grid,
+                                                   arg);
+
+                ave_time += launch_and_time_kernel(stream_config,
+                                                   kernel,
+                                                   dim3(gdx, gdy, gdz),
+                                                   dim3(BlockSize),
+                                                   0,
+                                                   arg.p_a_grid_imag,
+                                                   arg.p_b_grid_real,
+                                                   arg.p_aux_2_grid,
+                                                   arg);
 
                 // c_imag = aux + aux_2
                 ave_time += launch_and_time_kernel(
                     stream_config,
                     add_kernel,
-                    dim3(grid_size),
+                    dim3(gdx, gdy, gdz),
                     dim3(BlockSize),
                     0,
-                    make_tuple(arg.c_grid_desc_m_, arg.c_grid_desc_m_),
-                    make_tuple(arg.c_grid_desc_m_),
-                    make_tuple(const_cast<const CDataType*>(arg.p_aux_grid_),
-                               const_cast<const CDataType*>(arg.p_aux_2_grid_)),
-                    make_tuple(arg.p_c_grid_imag_),
+                    make_tuple(arg.c_grid_desc_m, arg.c_grid_desc_m),
+                    make_tuple(arg.c_grid_desc_m),
+                    make_tuple(const_cast<const CDataType*>(arg.p_aux_grid),
+                               const_cast<const CDataType*>(arg.p_aux_2_grid)),
+                    make_tuple(arg.p_c_grid_imag),
                     Add{});
             }
 
@@ -818,10 +448,7 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        return GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
-                                           arg.b_grid_desc_bk0_n_bk1_,
-                                           arg.c_grid_desc_m_n_,
-                                           arg.block_2_ctile_map_);
+        return GridwiseGemm::CheckValidity(arg);
     }
 
     // polymorphic
@@ -837,15 +464,15 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
                              CDataType* p_c_real,
                              CDataType* p_c_imag,
                              CDataType* p_workspace,
-                             index_t MRaw,
-                             index_t NRaw,
-                             index_t KRaw,
+                             index_t M,
+                             index_t N,
+                             index_t K,
                              index_t StrideA,
                              index_t StrideB,
                              index_t StrideC,
-                             AElementwiseOperation a_element_op,
-                             BElementwiseOperation b_element_op,
-                             CElementwiseOperation c_element_op)
+                             AElementwiseOperation,
+                             BElementwiseOperation,
+                             CElementwiseOperation)
     {
         return Argument{p_a_real,
                         p_a_imag,
@@ -854,15 +481,12 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
                         p_c_real,
                         p_c_imag,
                         p_workspace,
-                        MRaw,
-                        NRaw,
-                        KRaw,
+                        M,
+                        N,
+                        K,
                         StrideA,
                         StrideB,
-                        StrideC,
-                        a_element_op,
-                        b_element_op,
-                        c_element_op};
+                        StrideC};
     }
 
     static auto MakeInvoker() { return Invoker{}; }
@@ -875,15 +499,15 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
                                                       void* p_c_real,
                                                       void* p_c_imag,
                                                       void* p_workspace,
-                                                      index_t MRaw,
-                                                      index_t NRaw,
-                                                      index_t KRaw,
+                                                      index_t M,
+                                                      index_t N,
+                                                      index_t K,
                                                       index_t StrideA,
                                                       index_t StrideB,
                                                       index_t StrideC,
-                                                      AElementwiseOperation a_element_op,
-                                                      BElementwiseOperation b_element_op,
-                                                      CElementwiseOperation c_element_op,
+                                                      AElementwiseOperation,
+                                                      BElementwiseOperation,
+                                                      CElementwiseOperation,
                                                       index_t /* KBatch */ = 1) override
     {
         return std::make_unique<Argument>(static_cast<const ADataType*>(p_a_real),
@@ -893,15 +517,12 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
                                           static_cast<CDataType*>(p_c_real),
                                           static_cast<CDataType*>(p_c_imag),
                                           static_cast<CDataType*>(p_workspace),
-                                          MRaw,
-                                          NRaw,
-                                          KRaw,
+                                          M,
+                                          N,
+                                          K,
                                           StrideA,
                                           StrideB,
-                                          StrideC,
-                                          a_element_op,
-                                          b_element_op,
-                                          c_element_op);
+                                          StrideC);
     }
 
     // polymorphic
@@ -930,16 +551,22 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
         return str.str();
     }
 
-    std::size_t GetWorkspaceSize(index_t MRaw,
-                                 index_t NRaw,
-                                 [[maybe_unused]] index_t KRaw,
+    static std::size_t GetCElementSpaceSize(index_t M, index_t N, index_t StrideC)
+    {
+        const auto c_grid_desc_m_n = GridwiseGemm::MakeCGridDescriptor_M_N(
+            M, GridwiseGemm::CalculateMPadded(M), N, GridwiseGemm::CalculateNPadded(N), StrideC);
+
+        return c_grid_desc_m_n.GetElementSpaceSize();
+    }
+
+    std::size_t GetWorkspaceSize(index_t M,
+                                 index_t N,
+                                 [[maybe_unused]] index_t K,
                                  [[maybe_unused]] index_t StrideA,
                                  [[maybe_unused]] index_t StrideB,
                                  index_t StrideC) override
     {
-        const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N(MRaw, NRaw, StrideC);
-
-        return 2 * sizeof(CDataType) * c_grid_desc_m_n.GetElementSpaceSize();
+        return 2 * sizeof(CDataType) * GetCElementSpaceSize(M, N, StrideC);
     }
 };
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp
index 1eaffe705a74c9cdffc9a29f912f134c2bffbd97..b6f38698cd40ddd7a5ba8f5f285126a472de6a79 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
index b65afce8df5834d14f4ee9b3b19730e11db43bee..2ab09ba5c833a489ea22a3f3db06134037bdcedd 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp
index ea3020663a88c1ce565beab17313e72928f50461..eb4db6f8cb098915654ecf32e9ea3c8bc1939ef0 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -379,9 +379,6 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
         AccDataType,
         CDataType,
         InMemoryDataOperationEnum::Set,
-        AGridDesc_K0_M_K1,
-        BGridDesc_K0_N_K1,
-        CGridDesc_M_N,
         InElementwiseOperation,
         WeiElementwiseOperation,
         OutElementwiseOperation,
@@ -428,20 +425,10 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                  std::vector<ck::index_t> conv_filter_strides,
                  std::vector<ck::index_t> conv_filter_dilations,
                  std::vector<ck::index_t> input_left_pads,
-                 std::vector<ck::index_t> input_right_pads,
-                 ck::index_t M01,
-                 ck::index_t N01,
-                 InElementwiseOperation in_element_op,
-                 WeiElementwiseOperation wei_element_op,
-                 OutElementwiseOperation out_element_op)
+                 std::vector<ck::index_t> input_right_pads)
             : p_a_grid_{p_out_grid},
               p_b_grid_{p_wei_grid},
               p_c_grid_{p_in_grid},
-              M01_{M01},
-              N01_{N01},
-              a_element_op_{out_element_op},
-              b_element_op_{wei_element_op},
-              c_element_op_{in_element_op},
               Conv_N_{N},
               Conv_K_{K},
               Conv_C_{C},
@@ -495,18 +482,6 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                     a_grid_desc_k0_m_k1_container_.push_back(descs[I0]);
                     b_grid_desc_k0_n_k1_container_.push_back(descs[I1]);
                     c_grid_desc_m_n_container_.push_back(descs[I2]);
-
-                    auto block_2_ctile_map =
-                        GridwiseGemm::MakeDefaultBlock2CTileMap(descs[I2], M01, N01);
-
-                    if(GridwiseGemm::CheckValidity(
-                           descs[I0], descs[I1], descs[I2], block_2_ctile_map))
-                    {
-                        c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_.push_back(
-                            GridwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(descs[I2]));
-
-                        block_2_ctile_map_container_.push_back(block_2_ctile_map);
-                    }
                 }
             }
         }
@@ -517,14 +492,6 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
         std::vector<AGridDesc_K0_M_K1> a_grid_desc_k0_m_k1_container_;
         std::vector<BGridDesc_K0_N_K1> b_grid_desc_k0_n_k1_container_;
         std::vector<CGridDesc_M_N> c_grid_desc_m_n_container_;
-        std::vector<typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>
-            c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_;
-        std::vector<typename GridwiseGemm::DefaultBlock2CTileMap> block_2_ctile_map_container_;
-        index_t M01_;
-        index_t N01_;
-        OutElementwiseOperation a_element_op_;
-        WeiElementwiseOperation b_element_op_;
-        InElementwiseOperation c_element_op_;
         // for checking IsSupportedArgument()
         index_t Conv_N_;
         index_t Conv_K_;
@@ -567,103 +534,68 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                               << arg.c_grid_desc_m_n_container_[i].GetLength(I0) << ", "
                               << arg.c_grid_desc_m_n_container_[i].GetLength(I1) << "}"
                               << std::endl;
-
-                    std::cout << "arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_( "
-                              << arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i].GetLength(I0)
-                              << ", "
-                              << arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i].GetLength(I1)
-                              << ", "
-                              << arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i].GetLength(I2)
-                              << ", "
-                              << arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i].GetLength(I3)
-                              << ", "
-                              << arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i].GetLength(I4)
-                              << ", "
-                              << arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i].GetLength(I5)
-                              << " ) " << std::endl;
                 }
 #endif
 
                 if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_container_[i],
                                                 arg.b_grid_desc_k0_n_k1_container_[i],
-                                                arg.c_grid_desc_m_n_container_[i],
-                                                arg.block_2_ctile_map_container_[i]))
+                                                arg.c_grid_desc_m_n_container_[i]))
                 {
                     throw std::runtime_error(
                         "wrong! GridwiseGemm_km_kn_m0m1n0n1_xdlops_v3r1 has invalid setting");
                 }
 
-                const index_t grid_size = arg.block_2_ctile_map_container_[i].CalculateGridSize(
-                    arg.c_grid_desc_m_n_container_[i]);
+                const auto [gdx, gdy, gdz] =
+                    GridwiseGemm::CalculateGridSize(arg.c_grid_desc_m_n_container_[i]);
 
                 const auto K = arg.a_grid_desc_k0_m_k1_container_[i].GetLength(I0) *
                                arg.a_grid_desc_k0_m_k1_container_[i].GetLength(I2);
 
                 if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
                 {
-                    const auto kernel = kernel_gemm_xdlops_v2r3<
-                        GridwiseGemm,
-                        ADataType, // TODO: distiguish A/B datatype
-                        CDataType,
-                        remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
-                        remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
-                        remove_reference_t<
-                            typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
-                        OutElementwiseOperation,
-                        WeiElementwiseOperation,
-                        InElementwiseOperation,
-                        remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
-                        true>;
-
-                    ave_time += launch_and_time_kernel(
-                        stream_config,
-                        kernel,
-                        dim3(grid_size),
-                        dim3(BlockSize),
-                        0,
-                        arg.p_a_grid_,
-                        arg.p_b_grid_,
-                        arg.p_c_grid_,
-                        arg.a_grid_desc_k0_m_k1_container_[i],
-                        arg.b_grid_desc_k0_n_k1_container_[i],
-                        arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i],
-                        arg.a_element_op_,
-                        arg.b_element_op_,
-                        arg.c_element_op_,
-                        arg.block_2_ctile_map_container_[i]);
+                    const auto kernel =
+                        kernel_gemm_xdlops_v2r3<GridwiseGemm,
+                                                ADataType, // TODO: distiguish A/B datatype
+                                                CDataType,
+                                                DeviceOp::AGridDesc_K0_M_K1,
+                                                DeviceOp::BGridDesc_K0_N_K1,
+                                                DeviceOp::CGridDesc_M_N,
+                                                true>;
+
+                    ave_time += launch_and_time_kernel(stream_config,
+                                                       kernel,
+                                                       dim3(gdx, gdy, gdz),
+                                                       dim3(BlockSize),
+                                                       0,
+                                                       arg.p_a_grid_,
+                                                       arg.p_b_grid_,
+                                                       arg.p_c_grid_,
+                                                       arg.a_grid_desc_k0_m_k1_container_[i],
+                                                       arg.b_grid_desc_k0_n_k1_container_[i],
+                                                       arg.c_grid_desc_m_n_container_[i]);
                 }
                 else
                 {
-                    const auto kernel = kernel_gemm_xdlops_v2r3<
-                        GridwiseGemm,
-                        ADataType, // TODO: distiguish A/B datatype
-                        CDataType,
-                        remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
-                        remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
-                        remove_reference_t<
-                            typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
-                        OutElementwiseOperation,
-                        WeiElementwiseOperation,
-                        InElementwiseOperation,
-                        remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
-                        false>;
-
-                    ave_time += launch_and_time_kernel(
-                        stream_config,
-                        kernel,
-                        dim3(grid_size),
-                        dim3(BlockSize),
-                        0,
-                        arg.p_a_grid_,
-                        arg.p_b_grid_,
-                        arg.p_c_grid_,
-                        arg.a_grid_desc_k0_m_k1_container_[i],
-                        arg.b_grid_desc_k0_n_k1_container_[i],
-                        arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i],
-                        arg.a_element_op_,
-                        arg.b_element_op_,
-                        arg.c_element_op_,
-                        arg.block_2_ctile_map_container_[i]);
+                    const auto kernel =
+                        kernel_gemm_xdlops_v2r3<GridwiseGemm,
+                                                ADataType, // TODO: distiguish A/B datatype
+                                                CDataType,
+                                                DeviceOp::AGridDesc_K0_M_K1,
+                                                DeviceOp::BGridDesc_K0_N_K1,
+                                                DeviceOp::CGridDesc_M_N,
+                                                false>;
+
+                    ave_time += launch_and_time_kernel(stream_config,
+                                                       kernel,
+                                                       dim3(gdx, gdy, gdz),
+                                                       dim3(BlockSize),
+                                                       0,
+                                                       arg.p_a_grid_,
+                                                       arg.p_b_grid_,
+                                                       arg.p_c_grid_,
+                                                       arg.a_grid_desc_k0_m_k1_container_[i],
+                                                       arg.b_grid_desc_k0_n_k1_container_[i],
+                                                       arg.c_grid_desc_m_n_container_[i]);
                 }
             }
             return ave_time;
@@ -716,8 +648,7 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
         {
             if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_container_[i],
                                             arg.b_grid_desc_k0_n_k1_container_[i],
-                                            arg.c_grid_desc_m_n_container_[i],
-                                            arg.block_2_ctile_map_container_[i]))
+                                            arg.c_grid_desc_m_n_container_[i]))
             {
                 return false;
             }
@@ -742,10 +673,7 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                              std::vector<ck::index_t> conv_filter_strides,
                              std::vector<ck::index_t> conv_filter_dilations,
                              std::vector<ck::index_t> input_left_pads,
-                             std::vector<ck::index_t> input_right_pads,
-                             InElementwiseOperation in_element_op,
-                             WeiElementwiseOperation wei_element_op,
-                             OutElementwiseOperation out_element_op)
+                             std::vector<ck::index_t> input_right_pads)
     {
         return Argument{p_in_grid,
                         p_wei_grid,
@@ -759,12 +687,7 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                         conv_filter_strides,
                         conv_filter_dilations,
                         input_left_pads,
-                        input_right_pads,
-                        1,
-                        1,
-                        in_element_op,
-                        wei_element_op,
-                        out_element_op};
+                        input_right_pads};
     }
 
     static auto MakeInvoker() { return Invoker{}; }
@@ -783,9 +706,9 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                         std::vector<ck::index_t> conv_filter_dilations,
                         std::vector<ck::index_t> input_left_pads,
                         std::vector<ck::index_t> input_right_pads,
-                        InElementwiseOperation in_element_op,
-                        WeiElementwiseOperation wei_element_op,
-                        OutElementwiseOperation out_element_op) override
+                        InElementwiseOperation,
+                        WeiElementwiseOperation,
+                        OutElementwiseOperation) override
     {
         return std::make_unique<Argument>(static_cast<InDataType*>(p_in_grid),
                                           static_cast<const WeiDataType*>(p_wei_grid),
@@ -799,12 +722,7 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                                           conv_filter_strides,
                                           conv_filter_dilations,
                                           input_left_pads,
-                                          input_right_pads,
-                                          1,
-                                          1,
-                                          in_element_op,
-                                          wei_element_op,
-                                          out_element_op);
+                                          input_right_pads);
     }
 
     std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp
index e7e2bf33540fd8dd05ba1062d32241d39ff371cd..bb8f5316134d8941fc24d17ed9a65d280353b9f8 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp
index 6c4957b9b17f68c3a5d33470582d1d67907a8e85..1bd1e553cd36022ff86ba37aee57157e0feca1e1 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
index 027f1a1954da565721a1bf072f58e8feddcf29ca..de6bf27fec37511ae40d98e6dda7b4352e1fa139 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp
index 6278220c2f9dee047f632acef0e0d2f911156fa6..88615bba3134311b4120f4aa4483050df3e084a8 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -329,9 +329,6 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
         AccDataType,
         CDataType,
         InMemoryDataOperationEnum::Set,
-        AGridDesc_K0_M_K1,
-        BGridDesc_K0_N_K1,
-        CGridDesc_M_N,
         InElementwiseOperation,
         WeiElementwiseOperation,
         OutElementwiseOperation,
@@ -378,25 +375,13 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                  std::vector<ck::index_t> conv_filter_strides,
                  std::vector<ck::index_t> conv_filter_dilations,
                  std::vector<ck::index_t> input_left_pads,
-                 std::vector<ck::index_t> input_right_pads,
-                 ck::index_t M01,
-                 ck::index_t N01,
-                 InElementwiseOperation in_element_op,
-                 WeiElementwiseOperation wei_element_op,
-                 OutElementwiseOperation out_element_op)
+                 std::vector<ck::index_t> input_right_pads)
             : p_a_grid_{p_in_grid},
               p_b_grid_{p_wei_grid},
               p_c_grid_{p_out_grid},
               a_grid_desc_k0_m_k1_{},
               b_grid_desc_k0_n_k1_{},
               c_grid_desc_m_n_{},
-              c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_{},
-              block_2_ctile_map_{},
-              M01_{M01},
-              N01_{N01},
-              in_element_op_{in_element_op},
-              wei_element_op_{wei_element_op},
-              out_element_op_{out_element_op},
               Conv_N_{N},
               Conv_K_{K},
               Conv_C_{C},
@@ -420,17 +405,6 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
             a_grid_desc_k0_m_k1_ = descs[I0];
             b_grid_desc_k0_n_k1_ = descs[I1];
             c_grid_desc_m_n_     = descs[I2];
-            block_2_ctile_map_ =
-                GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_, M01, N01);
-
-            if(GridwiseGemm::CheckValidity(a_grid_desc_k0_m_k1_,
-                                           b_grid_desc_k0_n_k1_,
-                                           c_grid_desc_m_n_,
-                                           block_2_ctile_map_))
-            {
-                c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_ =
-                    GridwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_grid_desc_m_n_);
-            }
         }
 
         //  private:
@@ -440,14 +414,6 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
         AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1_;
         BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1_;
         CGridDesc_M_N c_grid_desc_m_n_;
-        typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2
-            c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_;
-        typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_;
-        index_t M01_;
-        index_t N01_;
-        InElementwiseOperation in_element_op_;
-        WeiElementwiseOperation wei_element_op_;
-        OutElementwiseOperation out_element_op_;
         // for checking IsSupportedArgument()
         index_t Conv_N_;
         index_t Conv_K_;
@@ -479,17 +445,14 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                           << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
             }
 #endif
-            if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_,
-                                            arg.b_grid_desc_k0_n_k1_,
-                                            arg.c_grid_desc_m_n_,
-                                            arg.block_2_ctile_map_))
+            if(!GridwiseGemm::CheckValidity(
+                   arg.a_grid_desc_k0_m_k1_, arg.b_grid_desc_k0_n_k1_, arg.c_grid_desc_m_n_))
             {
                 throw std::runtime_error(
                     "wrong! GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3 has invalid setting");
             }
 
-            const index_t grid_size =
-                arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_);
+            const auto [gdx, gdy, gdz] = GridwiseGemm::CalculateGridSize(arg.c_grid_desc_m_n_);
 
             const auto K =
                 arg.a_grid_desc_k0_m_k1_.GetLength(I0) * arg.a_grid_desc_k0_m_k1_.GetLength(I2);
@@ -498,22 +461,18 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
 
             if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
             {
-                const auto kernel = kernel_gemm_xdlops_v2r3<
-                    GridwiseGemm,
-                    ADataType, // TODO: distiguish A/B datatype
-                    CDataType,
-                    remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
-                    remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
-                    remove_reference_t<typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
-                    InElementwiseOperation,
-                    WeiElementwiseOperation,
-                    OutElementwiseOperation,
-                    remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
-                    true>;
+                const auto kernel =
+                    kernel_gemm_xdlops_v2r3<GridwiseGemm,
+                                            ADataType, // TODO: distiguish A/B datatype
+                                            CDataType,
+                                            DeviceOp::AGridDesc_K0_M_K1,
+                                            DeviceOp::BGridDesc_K0_N_K1,
+                                            DeviceOp::CGridDesc_M_N,
+                                            true>;
 
                 ave_time = launch_and_time_kernel(stream_config,
                                                   kernel,
-                                                  dim3(grid_size),
+                                                  dim3(gdx, gdy, gdz),
                                                   dim3(BlockSize),
                                                   0,
                                                   arg.p_a_grid_,
@@ -521,30 +480,22 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                                                   arg.p_c_grid_,
                                                   arg.a_grid_desc_k0_m_k1_,
                                                   arg.b_grid_desc_k0_n_k1_,
-                                                  arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_,
-                                                  arg.in_element_op_,
-                                                  arg.wei_element_op_,
-                                                  arg.out_element_op_,
-                                                  arg.block_2_ctile_map_);
+                                                  arg.c_grid_desc_m_n_);
             }
             else
             {
-                const auto kernel = kernel_gemm_xdlops_v2r3<
-                    GridwiseGemm,
-                    ADataType, // TODO: distiguish A/B datatype
-                    CDataType,
-                    remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
-                    remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
-                    remove_reference_t<typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
-                    InElementwiseOperation,
-                    WeiElementwiseOperation,
-                    OutElementwiseOperation,
-                    remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
-                    false>;
+                const auto kernel =
+                    kernel_gemm_xdlops_v2r3<GridwiseGemm,
+                                            ADataType, // TODO: distiguish A/B datatype
+                                            CDataType,
+                                            DeviceOp::AGridDesc_K0_M_K1,
+                                            DeviceOp::BGridDesc_K0_N_K1,
+                                            DeviceOp::CGridDesc_M_N,
+                                            false>;
 
                 ave_time = launch_and_time_kernel(stream_config,
                                                   kernel,
-                                                  dim3(grid_size),
+                                                  dim3(gdx, gdy, gdz),
                                                   dim3(BlockSize),
                                                   0,
                                                   arg.p_a_grid_,
@@ -552,11 +503,7 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                                                   arg.p_c_grid_,
                                                   arg.a_grid_desc_k0_m_k1_,
                                                   arg.b_grid_desc_k0_n_k1_,
-                                                  arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_,
-                                                  arg.in_element_op_,
-                                                  arg.wei_element_op_,
-                                                  arg.out_element_op_,
-                                                  arg.block_2_ctile_map_);
+                                                  arg.c_grid_desc_m_n_);
             }
 
             return ave_time;
@@ -616,10 +563,8 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
         }
 
         // Gridwise GEMM size
-        return GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_,
-                                           arg.b_grid_desc_k0_n_k1_,
-                                           arg.c_grid_desc_m_n_,
-                                           arg.block_2_ctile_map_);
+        return GridwiseGemm::CheckValidity(
+            arg.a_grid_desc_k0_m_k1_, arg.b_grid_desc_k0_n_k1_, arg.c_grid_desc_m_n_);
     }
 
     bool IsSupportedArgument(const BaseArgument* p_arg) override
@@ -639,10 +584,7 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                              std::vector<ck::index_t> conv_filter_strides,
                              std::vector<ck::index_t> conv_filter_dilations,
                              std::vector<ck::index_t> input_left_pads,
-                             std::vector<ck::index_t> input_right_pads,
-                             InElementwiseOperation in_element_op,
-                             WeiElementwiseOperation wei_element_op,
-                             OutElementwiseOperation out_element_op)
+                             std::vector<ck::index_t> input_right_pads)
     {
         return Argument{p_in_grid,
                         p_wei_grid,
@@ -656,12 +598,7 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                         conv_filter_strides,
                         conv_filter_dilations,
                         input_left_pads,
-                        input_right_pads,
-                        1,
-                        1,
-                        in_element_op,
-                        wei_element_op,
-                        out_element_op};
+                        input_right_pads};
     }
 
     static auto MakeInvoker() { return Invoker{}; }
@@ -680,9 +617,9 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                         std::vector<ck::index_t> conv_filter_dilations,
                         std::vector<ck::index_t> input_left_pads,
                         std::vector<ck::index_t> input_right_pads,
-                        InElementwiseOperation in_element_op,
-                        WeiElementwiseOperation wei_element_op,
-                        OutElementwiseOperation out_element_op) override
+                        InElementwiseOperation,
+                        WeiElementwiseOperation,
+                        OutElementwiseOperation) override
     {
         return std::make_unique<Argument>(static_cast<const InDataType*>(p_in_grid),
                                           static_cast<const WeiDataType*>(p_wei_grid),
@@ -696,12 +633,7 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                                           conv_filter_strides,
                                           conv_filter_dilations,
                                           input_left_pads,
-                                          input_right_pads,
-                                          1,
-                                          1,
-                                          in_element_op,
-                                          wei_element_op,
-                                          out_element_op);
+                                          input_right_pads);
     }
 
     std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp
index f69d8f18ae036b8e7cd060a80a0cca12bfc2a050..cd89f3232cb94e9ce9a68c06587758ac9574816f 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef DEVICE_CONV3D_FWD_NAIVE_HPP
 #define DEVICE_CONV3D_FWD_NAIVE_HPP
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp
index d52879cd904f05d165557c08ae6d123fbfa18434..fd8c88da9a62bcca098ff84ea56a817d0dc55059 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef DEVICE_CONV3D_FWD_XDL_HPP
 #define DEVICE_CONV3D_FWD_XDL_HPP
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp
index aff25aa7c66758a0636227e0494cf0f54359dc44..3178f73f4b530e67b399cd0ae71064a4b66fa837 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp
index 1fd4b76cec4cad5fd23b6a64166d210abdcb902a..77ad61d7ec5c446eed5c2171e36f2af2490d51a2 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -980,9 +980,6 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Xdl
         AccDataType,
         CDataType,
         InMemoryDataOperationEnum::Set,
-        AGridDesc_K0_M_K1,
-        BGridDesc_K0_N_K1,
-        CGridDesc_M_N,
         InElementwiseOperation,
         WeiElementwiseOperation,
         OutElementwiseOperation,
@@ -1029,20 +1026,10 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Xdl
                  std::vector<ck::index_t> conv_filter_strides,
                  std::vector<ck::index_t> conv_filter_dilations,
                  std::vector<ck::index_t> input_left_pads,
-                 std::vector<ck::index_t> input_right_pads,
-                 ck::index_t M01,
-                 ck::index_t N01,
-                 InElementwiseOperation in_element_op,
-                 WeiElementwiseOperation wei_element_op,
-                 OutElementwiseOperation out_element_op)
+                 std::vector<ck::index_t> input_right_pads)
             : p_a_grid_{p_out_grid},
               p_b_grid_{p_wei_grid},
               p_c_grid_{p_in_grid},
-              M01_{M01},
-              N01_{N01},
-              a_element_op_{out_element_op},
-              b_element_op_{wei_element_op},
-              c_element_op_{in_element_op},
               Conv_N_{N},
               Conv_K_{K},
               Conv_C_{C},
@@ -1092,17 +1079,6 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Xdl
                 a_grid_desc_k0_m_k1_container_.push_back(descs[I0]);
                 b_grid_desc_k0_n_k1_container_.push_back(descs[I1]);
                 c_grid_desc_m_n_container_.push_back(descs[I2]);
-
-                auto block_2_ctile_map =
-                    GridwiseGemm::MakeDefaultBlock2CTileMap(descs[I2], M01_, N01_);
-
-                if(GridwiseGemm::CheckValidity(descs[I0], descs[I1], descs[I2], block_2_ctile_map))
-                {
-                    c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_.push_back(
-                        GridwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(descs[I2]));
-
-                    block_2_ctile_map_container_.push_back(block_2_ctile_map);
-                }
             }
         }
         template <ck::index_t NDim, typename ck::enable_if<NDim == 2, bool>::type = false>
@@ -1150,18 +1126,6 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Xdl
                     a_grid_desc_k0_m_k1_container_.push_back(descs[I0]);
                     b_grid_desc_k0_n_k1_container_.push_back(descs[I1]);
                     c_grid_desc_m_n_container_.push_back(descs[I2]);
-
-                    auto block_2_ctile_map =
-                        GridwiseGemm::MakeDefaultBlock2CTileMap(descs[I2], M01_, N01_);
-
-                    if(GridwiseGemm::CheckValidity(
-                           descs[I0], descs[I1], descs[I2], block_2_ctile_map))
-                    {
-                        c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_.push_back(
-                            GridwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(descs[I2]));
-
-                        block_2_ctile_map_container_.push_back(block_2_ctile_map);
-                    }
                 }
             }
         }
@@ -1218,19 +1182,6 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Xdl
                         a_grid_desc_k0_m_k1_container_.push_back(descs[I0]);
                         b_grid_desc_k0_n_k1_container_.push_back(descs[I1]);
                         c_grid_desc_m_n_container_.push_back(descs[I2]);
-
-                        auto block_2_ctile_map =
-                            GridwiseGemm::MakeDefaultBlock2CTileMap(descs[I2], M01_, N01_);
-
-                        if(GridwiseGemm::CheckValidity(
-                               descs[I0], descs[I1], descs[I2], block_2_ctile_map))
-                        {
-                            c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_.push_back(
-                                GridwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(
-                                    descs[I2]));
-
-                            block_2_ctile_map_container_.push_back(block_2_ctile_map);
-                        }
                     }
                 }
             }
@@ -1242,11 +1193,6 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Xdl
         std::vector<AGridDesc_K0_M_K1> a_grid_desc_k0_m_k1_container_;
         std::vector<BGridDesc_K0_N_K1> b_grid_desc_k0_n_k1_container_;
         std::vector<CGridDesc_M_N> c_grid_desc_m_n_container_;
-        std::vector<typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>
-            c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_;
-        std::vector<typename GridwiseGemm::DefaultBlock2CTileMap> block_2_ctile_map_container_;
-        index_t M01_;
-        index_t N01_;
         OutElementwiseOperation a_element_op_;
         WeiElementwiseOperation b_element_op_;
         InElementwiseOperation c_element_op_;
@@ -1276,123 +1222,84 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Xdl
             {
 #if DEBUG_LOG
                 {
-                    std::cout << "arg.a_grid_desc_k0_m_k1_container_{"
+                    std::cout << "arg.a_grid_desc_k0_m_k1{"
                               << arg.a_grid_desc_k0_m_k1_container_[i].GetLength(I0) << ", "
                               << arg.a_grid_desc_k0_m_k1_container_[i].GetLength(I1) << ", "
                               << arg.a_grid_desc_k0_m_k1_container_[i].GetLength(I2) << "}"
                               << std::endl;
 
-                    std::cout << "arg.b_grid_desc_k0_n_k1_container_{"
+                    std::cout << "arg.b_grid_desc_k0_n_k1{"
                               << arg.b_grid_desc_k0_n_k1_container_[i].GetLength(I0) << ", "
                               << arg.b_grid_desc_k0_n_k1_container_[i].GetLength(I1) << ", "
                               << arg.b_grid_desc_k0_n_k1_container_[i].GetLength(I2) << "}"
                               << std::endl;
 
-                    std::cout << "arg.c_grid_desc_m_n_container_{ "
+                    std::cout << "arg.c_grid_desc_m_n{"
                               << arg.c_grid_desc_m_n_container_[i].GetLength(I0) << ", "
                               << arg.c_grid_desc_m_n_container_[i].GetLength(I1) << "}"
                               << std::endl;
-
-                    std::cout << "arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_( "
-                              << arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i].GetLength(I0)
-                              << ", "
-                              << arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i].GetLength(I1)
-                              << ", "
-                              << arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i].GetLength(I2)
-                              << ", "
-                              << arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i].GetLength(I3)
-                              << ", "
-                              << arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i].GetLength(I4)
-                              << ", "
-                              << arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i].GetLength(I5)
-                              << ", "
-                              << arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i].GetLength(I6)
-                              << ", "
-                              << arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i].GetLength(I7)
-                              << " ) " << std::endl;
                 }
 #endif
 
                 if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_container_[i],
                                                 arg.b_grid_desc_k0_n_k1_container_[i],
-                                                arg.c_grid_desc_m_n_container_[i],
-                                                arg.block_2_ctile_map_container_[i]))
+                                                arg.c_grid_desc_m_n_container_[i]))
                 {
                     throw std::runtime_error(
-                        "wrong! GridwiseGemm_km_kn_m0m1n0n1_xdlops_v3r1 has invalid setting");
+                        "wrong! GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3 has invalid setting");
                 }
 
-                const index_t grid_size = arg.block_2_ctile_map_container_[i].CalculateGridSize(
-                    arg.c_grid_desc_m_n_container_[i]);
+                const auto [gdx, gdy, gdz] =
+                    GridwiseGemm::CalculateGridSize(arg.c_grid_desc_m_n_container_[i]);
 
                 const auto K = arg.a_grid_desc_k0_m_k1_container_[i].GetLength(I0) *
                                arg.a_grid_desc_k0_m_k1_container_[i].GetLength(I2);
 
                 if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
                 {
-                    const auto kernel = kernel_gemm_xdlops_v2r3<
-                        GridwiseGemm,
-                        ADataType, // TODO: distiguish A/B datatype
-                        CDataType,
-                        remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
-                        remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
-                        remove_reference_t<
-                            typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
-                        OutElementwiseOperation,
-                        WeiElementwiseOperation,
-                        InElementwiseOperation,
-                        remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
-                        true>;
-
-                    ave_time += launch_and_time_kernel(
-                        stream_config,
-                        kernel,
-                        dim3(grid_size),
-                        dim3(BlockSize),
-                        0,
-                        arg.p_a_grid_,
-                        arg.p_b_grid_,
-                        arg.p_c_grid_,
-                        arg.a_grid_desc_k0_m_k1_container_[i],
-                        arg.b_grid_desc_k0_n_k1_container_[i],
-                        arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i],
-                        arg.a_element_op_,
-                        arg.b_element_op_,
-                        arg.c_element_op_,
-                        arg.block_2_ctile_map_container_[i]);
+                    const auto kernel =
+                        kernel_gemm_xdlops_v2r3<GridwiseGemm,
+                                                ADataType, // TODO: distiguish A/B datatype
+                                                CDataType,
+                                                DeviceOp::AGridDesc_K0_M_K1,
+                                                DeviceOp::BGridDesc_K0_N_K1,
+                                                DeviceOp::CGridDesc_M_N,
+                                                true>;
+
+                    ave_time += launch_and_time_kernel(stream_config,
+                                                       kernel,
+                                                       dim3(gdx, gdy, gdz),
+                                                       dim3(BlockSize),
+                                                       0,
+                                                       arg.p_a_grid_,
+                                                       arg.p_b_grid_,
+                                                       arg.p_c_grid_,
+                                                       arg.a_grid_desc_k0_m_k1_container_[i],
+                                                       arg.b_grid_desc_k0_n_k1_container_[i],
+                                                       arg.c_grid_desc_m_n_container_[i]);
                 }
                 else
                 {
-                    const auto kernel = kernel_gemm_xdlops_v2r3<
-                        GridwiseGemm,
-                        ADataType, // TODO: distiguish A/B datatype
-                        CDataType,
-                        remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
-                        remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
-                        remove_reference_t<
-                            typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
-                        OutElementwiseOperation,
-                        WeiElementwiseOperation,
-                        InElementwiseOperation,
-                        remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
-                        false>;
-
-                    ave_time += launch_and_time_kernel(
-                        stream_config,
-                        kernel,
-                        dim3(grid_size),
-                        dim3(BlockSize),
-                        0,
-                        arg.p_a_grid_,
-                        arg.p_b_grid_,
-                        arg.p_c_grid_,
-                        arg.a_grid_desc_k0_m_k1_container_[i],
-                        arg.b_grid_desc_k0_n_k1_container_[i],
-                        arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i],
-                        arg.a_element_op_,
-                        arg.b_element_op_,
-                        arg.c_element_op_,
-                        arg.block_2_ctile_map_container_[i]);
+                    const auto kernel =
+                        kernel_gemm_xdlops_v2r3<GridwiseGemm,
+                                                ADataType, // TODO: distiguish A/B datatype
+                                                CDataType,
+                                                DeviceOp::AGridDesc_K0_M_K1,
+                                                DeviceOp::BGridDesc_K0_N_K1,
+                                                DeviceOp::CGridDesc_M_N,
+                                                false>;
+
+                    ave_time += launch_and_time_kernel(stream_config,
+                                                       kernel,
+                                                       dim3(gdx, gdy, gdz),
+                                                       dim3(BlockSize),
+                                                       0,
+                                                       arg.p_a_grid_,
+                                                       arg.p_b_grid_,
+                                                       arg.p_c_grid_,
+                                                       arg.a_grid_desc_k0_m_k1_container_[i],
+                                                       arg.b_grid_desc_k0_n_k1_container_[i],
+                                                       arg.c_grid_desc_m_n_container_[i]);
                 }
             }
             return ave_time;
@@ -1446,8 +1353,7 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Xdl
         {
             if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_container_[i],
                                             arg.b_grid_desc_k0_n_k1_container_[i],
-                                            arg.c_grid_desc_m_n_container_[i],
-                                            arg.block_2_ctile_map_container_[i]))
+                                            arg.c_grid_desc_m_n_container_[i]))
             {
                 return false;
             }
@@ -1472,10 +1378,7 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Xdl
                              std::vector<ck::index_t> conv_filter_strides,
                              std::vector<ck::index_t> conv_filter_dilations,
                              std::vector<ck::index_t> input_left_pads,
-                             std::vector<ck::index_t> input_right_pads,
-                             InElementwiseOperation in_element_op,
-                             WeiElementwiseOperation wei_element_op,
-                             OutElementwiseOperation out_element_op)
+                             std::vector<ck::index_t> input_right_pads)
     {
         return Argument{p_in_grid,
                         p_wei_grid,
@@ -1489,12 +1392,7 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Xdl
                         conv_filter_strides,
                         conv_filter_dilations,
                         input_left_pads,
-                        input_right_pads,
-                        1,
-                        1,
-                        in_element_op,
-                        wei_element_op,
-                        out_element_op};
+                        input_right_pads};
     }
 
     static auto MakeInvoker() { return Invoker{}; }
@@ -1513,9 +1411,9 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Xdl
                         std::vector<ck::index_t> conv_filter_dilations,
                         std::vector<ck::index_t> input_left_pads,
                         std::vector<ck::index_t> input_right_pads,
-                        InElementwiseOperation in_element_op,
-                        WeiElementwiseOperation wei_element_op,
-                        OutElementwiseOperation out_element_op) override
+                        InElementwiseOperation,
+                        WeiElementwiseOperation,
+                        OutElementwiseOperation) override
     {
         return std::make_unique<Argument>(static_cast<InDataType*>(p_in_grid),
                                           static_cast<const WeiDataType*>(p_wei_grid),
@@ -1529,12 +1427,7 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Xdl
                                           conv_filter_strides,
                                           conv_filter_dilations,
                                           input_left_pads,
-                                          input_right_pads,
-                                          1,
-                                          1,
-                                          in_element_op,
-                                          wei_element_op,
-                                          out_element_op);
+                                          input_right_pads);
     }
 
     std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_2d_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_elementwise_2d_impl.hpp
index 83ed6198bd3c0921988d0d867a87a617bc2c76a1..c5f90e40fe8e54f3fa04c71354002a54fecb5ab8 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_2d_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_elementwise_2d_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp
index a11b5d0398774f29464e4e429597d46965aba882..5618fba5171e8bfe9f270e9fc9cf1f5b23aadbf6 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp
index 1fa69288a4d725304181698af5b13f848b6fa5c7..c3416758d1277d6cc7d69e4aa4115b57af4c9cd1 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_bias_add_reduce_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_bias_add_reduce_xdl_cshuffle.hpp
index a9a58c8ac425876e8b5be1bbe3a9f1ff0c08dcd0..63f7fa706870ee65cdff0e7db2c70b4447e209bf 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_bias_add_reduce_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_bias_add_reduce_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp
index 36366a76315902a28a56311e5ff98d713bbda5d8..13e9f96910fadeb0fdda4ef723df3f260bc7ee7b 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp
index c91328ff713f7eb7a1b54ef3bf923c28911751ee..22f66931f02881a89f0e3b99c5785c363eca0203 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_layernorm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_layernorm_xdl_cshuffle.hpp
index 580087e00c6416b6b068740ced2c4ab9b2a4998a..1ab836247270d1fe93bca27273a0e6208bd03eb0 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_layernorm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_layernorm_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp
index 4c1c3ab7bf1d10d5f8f75f88c41064db1fb51481..be174e599e61a4e012d10338b597867e3e892f1b 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp
index 750df31a3f6395ccf52fc27ac7705311b24e4376..44b3518e2c950e15ec3d5cfd240c443029987622 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp
index 2488101484a6d3674be42afe79b836d61058a58a..bd5be99f8b11e89f8713ee79bb1bf2b813a4667b 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_reduce_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_reduce_xdl_cshuffle.hpp
index f358bd176fe16f25a55b02bd19591f9e1983d708..e60fae370f7b4932b7952dbbe3dced5caca4a824 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_reduce_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_reduce_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp
index 03ffcf8e59c210726dc883b85cf021ac4b3ccc6b..f64450b75f272dbb0f9ef4066e055a2f59c3879e 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp
index a5051455bf18502afd366e04c6ca9cf4b6103826..ebbb1d38c000ba44ce53f9a8110114c41a299b14 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -75,132 +75,20 @@ struct DeviceGemmXdl : public DeviceGemm<ALayout,
 
     static constexpr auto K1Number = Number<K1>{};
 
-    static auto MakeAGridDescriptor_K0_M_K1(index_t M, index_t K, index_t StrideA)
-    {
-        const index_t K0 = K / K1;
-
-        const auto a_grid_desc_m_k = [&]() {
-            if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1));
-            }
-            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, ALayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA));
-            }
-        }();
-
-        if constexpr(GemmSpec == GemmSpecialization::MNPadding)
-        {
-            const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock;
-
-            return transform_tensor_descriptor(
-                a_grid_desc_m_k,
-                make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
-                           make_right_pad_transform(M, PadM)),
-                make_tuple(Sequence<1>{}, Sequence<0>{}),
-                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-        }
-        else
-        {
-            return transform_tensor_descriptor(
-                a_grid_desc_m_k,
-                make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
-                           make_pass_through_transform(M)),
-                make_tuple(Sequence<1>{}, Sequence<0>{}),
-                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-        }
-    }
-
-    static auto MakeBGridDescriptor_K0_N_K1(index_t K, index_t N, index_t StrideB)
-    {
-        const index_t K0 = K / K1;
-
-        const auto b_grid_desc_k_n = [&]() {
-            if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(StrideB, I1));
-            }
-            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(I1, StrideB));
-            }
-        }();
-
-        if constexpr(GemmSpec == GemmSpecialization::MNPadding)
-        {
-            const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock;
-
-            return transform_tensor_descriptor(
-                b_grid_desc_k_n,
-                make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
-                           make_right_pad_transform(N, PadN)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-        }
-        else
-        {
-            return transform_tensor_descriptor(
-                b_grid_desc_k_n,
-                make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
-                           make_pass_through_transform(N)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-        }
-    }
-
-    static auto MakeCGridDescriptor_M_N(index_t M, index_t N, index_t StrideC)
-    {
-        const auto c_grid_desc_m_n = [&]() {
-            if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1));
-            }
-            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, CLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideC));
-            }
-        }();
-
-        if constexpr(GemmSpec == GemmSpecialization::MNPadding)
-        {
-            const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock;
-            const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock;
-
-            return transform_tensor_descriptor(
-                c_grid_desc_m_n,
-                make_tuple(make_right_pad_transform(M, PadM), make_right_pad_transform(N, PadN)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else
-        {
-
-            return transform_tensor_descriptor(
-                c_grid_desc_m_n,
-                make_tuple(make_pass_through_transform(M), make_pass_through_transform(N)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-    }
-
-    using AGridDesc_K0_M_K1 = decltype(MakeAGridDescriptor_K0_M_K1(1, 1, 1));
-    using BGridDesc_K0_N_K1 = decltype(MakeBGridDescriptor_K0_N_K1(1, 1, 1));
-    using CGridDesc_M_N     = decltype(MakeCGridDescriptor_M_N(1, 1, 1));
-
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<
+    using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3_ext<
         BlockSize,
         ADataType, // TODO: distinguish A/B datatype
         AccDataType,
         CDataType,
         InMemoryDataOperationEnum::Set,
-        AGridDesc_K0_M_K1,
-        BGridDesc_K0_N_K1,
-        CGridDesc_M_N,
+        ALayout,
+        BLayout,
+        CLayout,
         AElementwiseOperation,
         BElementwiseOperation,
         CElementwiseOperation,
+        GemmSpec,
         MPerBlock,
         NPerBlock,
         K0PerBlock,
@@ -232,173 +120,41 @@ struct DeviceGemmXdl : public DeviceGemm<ALayout,
         LoopSched,
         PipelineVer>;
 
-    // Argument
-    struct Argument : public BaseArgument
-    {
-        Argument(const ADataType* p_a_grid,
-                 const BDataType* p_b_grid,
-                 CDataType* p_c_grid,
-                 index_t M,
-                 index_t N,
-                 index_t K,
-                 index_t StrideA,
-                 index_t StrideB,
-                 index_t StrideC,
-                 index_t M01,
-                 index_t N01,
-                 AElementwiseOperation a_element_op,
-                 BElementwiseOperation b_element_op,
-                 CElementwiseOperation c_element_op)
-            : p_a_grid_{p_a_grid},
-              p_b_grid_{p_b_grid},
-              p_c_grid_{p_c_grid},
-              a_grid_desc_k0_m_k1_{},
-              b_grid_desc_k0_n_k1_{},
-              c_grid_desc_m_n_{},
-              c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_{},
-              block_2_ctile_map_{},
-              M01_{M01},
-              N01_{N01},
-              a_element_op_{a_element_op},
-              b_element_op_{b_element_op},
-              c_element_op_{c_element_op},
-              kraw_{K}
-        {
-            a_grid_desc_k0_m_k1_ = DeviceGemmXdl::MakeAGridDescriptor_K0_M_K1(M, K, StrideA);
-            b_grid_desc_k0_n_k1_ = DeviceGemmXdl::MakeBGridDescriptor_K0_N_K1(K, N, StrideB);
-            c_grid_desc_m_n_     = DeviceGemmXdl::MakeCGridDescriptor_M_N(M, N, StrideC);
-
-            block_2_ctile_map_ =
-                GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_, M01, N01);
-
-            if(GridwiseGemm::CheckValidity(a_grid_desc_k0_m_k1_,
-                                           b_grid_desc_k0_n_k1_,
-                                           c_grid_desc_m_n_,
-                                           block_2_ctile_map_))
-            {
-                c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_ =
-                    GridwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_grid_desc_m_n_);
-            }
-        }
-
-        //  private:
-        const ADataType* p_a_grid_;
-        const BDataType* p_b_grid_;
-        CDataType* p_c_grid_;
-        AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1_;
-        BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1_;
-        CGridDesc_M_N c_grid_desc_m_n_;
-        typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2
-            c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_;
-        typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_;
-        index_t M01_;
-        index_t N01_;
-        AElementwiseOperation a_element_op_;
-        BElementwiseOperation b_element_op_;
-        CElementwiseOperation c_element_op_;
-        index_t kraw_;
-    };
+    using Argument = typename GridwiseGemm::Argument;
 
     // Invoker
     struct Invoker : public BaseInvoker
     {
-        using Argument = DeviceGemmXdl::Argument;
-
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        float Run(const Argument& karg, const StreamConfig& stream_config = StreamConfig{})
         {
-#if DEBUG_LOG
+            if(stream_config.log_level_ > 0)
             {
-                std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0)
-                          << ", " << arg.a_grid_desc_k0_m_k1_.GetLength(I1) << ", "
-                          << arg.a_grid_desc_k0_m_k1_.GetLength(I2) << "}" << std::endl;
-
-                std::cout << "arg.b_grid_desc_k0_n_k1_{" << arg.b_grid_desc_k0_n_k1_.GetLength(I0)
-                          << ", " << arg.b_grid_desc_k0_n_k1_.GetLength(I1) << ", "
-                          << arg.b_grid_desc_k0_n_k1_.GetLength(I2) << "}" << std::endl;
-
-                std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", "
-                          << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
+                karg.Print();
             }
-#endif
 
-            if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_,
-                                            arg.b_grid_desc_k0_n_k1_,
-                                            arg.c_grid_desc_m_n_,
-                                            arg.block_2_ctile_map_))
+            if(!GridwiseGemm::CheckValidity(karg))
             {
                 throw std::runtime_error(
-                    "wrong! GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3 has invalid setting");
+                    "wrong! GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3_ext has invalid setting");
             }
 
-            const index_t grid_size =
-                arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_);
-
-            const auto K =
-                arg.a_grid_desc_k0_m_k1_.GetLength(I0) * arg.a_grid_desc_k0_m_k1_.GetLength(I2);
+            const auto [gdx, gdy, gdz] = GridwiseGemm::CalculateGridSize(karg.M, karg.N);
 
             float ave_time = 0;
 
-            if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
+            if(GridwiseGemm::CalculateHasMainKBlockLoop(karg.K))
             {
-                const auto kernel = kernel_gemm_xdlops_v2r3<
-                    GridwiseGemm,
-                    ADataType, // TODO: distiguish A/B datatype
-                    CDataType,
-                    remove_reference_t<DeviceGemmXdl::AGridDesc_K0_M_K1>,
-                    remove_reference_t<DeviceGemmXdl::BGridDesc_K0_N_K1>,
-                    remove_reference_t<typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
-                    AElementwiseOperation,
-                    BElementwiseOperation,
-                    CElementwiseOperation,
-                    remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
-                    true>;
-
-                ave_time = launch_and_time_kernel(stream_config,
-                                                  kernel,
-                                                  dim3(grid_size),
-                                                  dim3(BlockSize),
-                                                  0,
-                                                  arg.p_a_grid_,
-                                                  arg.p_b_grid_,
-                                                  arg.p_c_grid_,
-                                                  arg.a_grid_desc_k0_m_k1_,
-                                                  arg.b_grid_desc_k0_n_k1_,
-                                                  arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_,
-                                                  arg.a_element_op_,
-                                                  arg.b_element_op_,
-                                                  arg.c_element_op_,
-                                                  arg.block_2_ctile_map_);
+                const auto kernel = kernel_gemm_xdlops_v2r3<GridwiseGemm, true>;
+
+                ave_time = launch_and_time_kernel(
+                    stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, karg);
             }
             else
             {
-                const auto kernel = kernel_gemm_xdlops_v2r3<
-                    GridwiseGemm,
-                    ADataType, // TODO: distiguish A/B datatype
-                    CDataType,
-                    remove_reference_t<DeviceGemmXdl::AGridDesc_K0_M_K1>,
-                    remove_reference_t<DeviceGemmXdl::BGridDesc_K0_N_K1>,
-                    remove_reference_t<typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
-                    AElementwiseOperation,
-                    BElementwiseOperation,
-                    CElementwiseOperation,
-                    remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
-                    false>;
-
-                ave_time = launch_and_time_kernel(stream_config,
-                                                  kernel,
-                                                  dim3(grid_size),
-                                                  dim3(BlockSize),
-                                                  0,
-                                                  arg.p_a_grid_,
-                                                  arg.p_b_grid_,
-                                                  arg.p_c_grid_,
-                                                  arg.a_grid_desc_k0_m_k1_,
-                                                  arg.b_grid_desc_k0_n_k1_,
-                                                  arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_,
-                                                  arg.a_element_op_,
-                                                  arg.b_element_op_,
-                                                  arg.c_element_op_,
-                                                  arg.block_2_ctile_map_);
+                const auto kernel = kernel_gemm_xdlops_v2r3<GridwiseGemm, false>;
+
+                ave_time = launch_and_time_kernel(
+                    stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, karg);
             }
 
             return ave_time;
@@ -418,7 +174,7 @@ struct DeviceGemmXdl : public DeviceGemm<ALayout,
         return true;
     }
 
-    static bool IsSupportedArgument(const Argument& arg)
+    static bool IsSupportedArgument(const Argument& karg)
     {
         if(ck::get_device_name() == "gfx908")
         {
@@ -441,15 +197,12 @@ struct DeviceGemmXdl : public DeviceGemm<ALayout,
             return false;
         }
 
-        if(arg.kraw_ % K1 != 0)
+        if(karg.K % K1 != 0)
         {
             return false;
         }
 
-        return GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_,
-                                           arg.b_grid_desc_k0_n_k1_,
-                                           arg.c_grid_desc_m_n_,
-                                           arg.block_2_ctile_map_);
+        return GridwiseGemm::CheckValidity(karg);
     }
 
     // polymorphic
@@ -467,24 +220,11 @@ struct DeviceGemmXdl : public DeviceGemm<ALayout,
                              index_t StrideA,
                              index_t StrideB,
                              index_t StrideC,
-                             AElementwiseOperation a_element_op,
-                             BElementwiseOperation b_element_op,
-                             CElementwiseOperation c_element_op)
+                             AElementwiseOperation,
+                             BElementwiseOperation,
+                             CElementwiseOperation)
     {
-        return Argument{p_a,
-                        p_b,
-                        p_c,
-                        M,
-                        N,
-                        K,
-                        StrideA,
-                        StrideB,
-                        StrideC,
-                        1,
-                        1,
-                        a_element_op,
-                        b_element_op,
-                        c_element_op};
+        return Argument{p_a, p_b, p_c, M, N, K, StrideA, StrideB, StrideC};
     }
 
     static auto MakeInvoker() { return Invoker{}; }
@@ -499,9 +239,9 @@ struct DeviceGemmXdl : public DeviceGemm<ALayout,
                                                       index_t StrideA,
                                                       index_t StrideB,
                                                       index_t StrideC,
-                                                      AElementwiseOperation a_element_op,
-                                                      BElementwiseOperation b_element_op,
-                                                      CElementwiseOperation c_element_op) override
+                                                      AElementwiseOperation,
+                                                      BElementwiseOperation,
+                                                      CElementwiseOperation) override
     {
         return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
                                           static_cast<const BDataType*>(p_b),
@@ -511,12 +251,7 @@ struct DeviceGemmXdl : public DeviceGemm<ALayout,
                                           K,
                                           StrideA,
                                           StrideB,
-                                          StrideC,
-                                          1,
-                                          1,
-                                          a_element_op,
-                                          b_element_op,
-                                          c_element_op);
+                                          StrideC);
     }
 
     // polymorphic
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp
index 7cd0ff72e8a8a005921abf32f0443ee3e581e729..a3dda82aa6c8e801063661cfc3a76348bc053e40 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -82,276 +82,11 @@ struct DeviceGemm_Xdl_CShuffle : public DeviceGemm<ALayout,
     static constexpr auto I1 = Number<1>{};
     static constexpr auto I2 = Number<2>{};
 
-    static auto MakeAGridDescriptor_AK0_M_AK1(index_t MRaw, index_t KRaw, index_t StrideA)
-    {
-        const auto a_grid_desc_mraw_kraw = [&]() {
-            if constexpr(is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
-            {
-                return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
-                                                    make_tuple(StrideA, I1));
-            }
-            else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
-            {
-                return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
-                                                    make_tuple(I1, StrideA));
-            }
-        }();
-
-        const auto M = math::integer_divide_ceil(MRaw, MPerBlock) * MPerBlock;
-        const auto K = math::integer_divide_ceil(KRaw, KPerBlock) * KPerBlock;
-
-        const auto MPad = M - MRaw;
-        const auto KPad = K - KRaw;
-
-        if constexpr(GemmSpec == GemmSpecialization::MKPadding ||
-                     GemmSpec == GemmSpecialization::MNKPadding)
-        {
-            // pad both M and K
-            assert(K % AK1 == 0);
-
-            const auto AK0 = K / AK1;
-
-            const auto a_grid_desc_m_k =
-                transform_tensor_descriptor(a_grid_desc_mraw_kraw,
-                                            make_tuple(make_right_pad_transform(MRaw, MPad),
-                                                       make_right_pad_transform(KRaw, KPad)),
-                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-            const auto a_grid_desc_ak0_m_ak1 =
-                transform_tensor_descriptor(a_grid_desc_m_k,
-                                            make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
-                                                       make_pass_through_transform(M)),
-                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
-                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-            return a_grid_desc_ak0_m_ak1;
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::MPadding ||
-                          GemmSpec == GemmSpecialization::MNPadding)
-        {
-            // pad M, but not K
-            assert(KRaw % AK1 == 0);
-
-            const auto AK0 = KRaw / AK1;
-
-            const auto a_grid_desc_ak0_m_ak1 =
-                transform_tensor_descriptor(a_grid_desc_mraw_kraw,
-                                            make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
-                                                       make_right_pad_transform(MRaw, MPad)),
-                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
-                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-            return a_grid_desc_ak0_m_ak1;
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::KPadding ||
-                          GemmSpec == GemmSpecialization::NKPadding)
-        {
-            // pad K, but not M
-            assert(K % AK1 == 0);
-
-            const auto AK0 = K / AK1;
-
-            const auto a_grid_desc_m_k = transform_tensor_descriptor(
-                a_grid_desc_mraw_kraw,
-                make_tuple(make_pass_through_transform(MRaw), make_right_pad_transform(KRaw, KPad)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-            const auto a_grid_desc_ak0_m_ak1 =
-                transform_tensor_descriptor(a_grid_desc_m_k,
-                                            make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
-                                                       make_pass_through_transform(MRaw)),
-                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
-                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-            return a_grid_desc_ak0_m_ak1;
-        }
-        else
-        {
-            // not pad M or K
-            assert(KRaw % AK1 == 0);
-
-            const auto AK0 = KRaw / AK1;
-
-            const auto a_grid_desc_ak0_m_ak1 =
-                transform_tensor_descriptor(a_grid_desc_mraw_kraw,
-                                            make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
-                                                       make_pass_through_transform(MRaw)),
-                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
-                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-            return a_grid_desc_ak0_m_ak1;
-        }
-    }
-
-    static auto MakeBGridDescriptor_BK0_N_BK1(index_t KRaw, index_t NRaw, index_t StrideB)
-    {
-        const auto b_grid_desc_nraw_kraw = [&]() {
-            if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
-                                                    make_tuple(I1, StrideB));
-            }
-            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
-                                                    make_tuple(StrideB, I1));
-            }
-        }();
-
-        const auto N = math::integer_divide_ceil(NRaw, NPerBlock) * NPerBlock;
-        const auto K = math::integer_divide_ceil(KRaw, KPerBlock) * KPerBlock;
-
-        const auto NPad = N - NRaw;
-        const auto KPad = K - KRaw;
-
-        if constexpr(GemmSpec == GemmSpecialization::NKPadding ||
-                     GemmSpec == GemmSpecialization::MNKPadding)
-        {
-            // pad both N and K
-            assert(K % BK1 == 0);
-
-            const auto BK0 = K / BK1;
-
-            const auto b_grid_desc_n_k =
-                transform_tensor_descriptor(b_grid_desc_nraw_kraw,
-                                            make_tuple(make_right_pad_transform(NRaw, NPad),
-                                                       make_right_pad_transform(KRaw, KPad)),
-                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-            const auto b_grid_desc_bk0_n_bk1 =
-                transform_tensor_descriptor(b_grid_desc_n_k,
-                                            make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
-                                                       make_pass_through_transform(N)),
-                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
-                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-            return b_grid_desc_bk0_n_bk1;
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::NPadding ||
-                          GemmSpec == GemmSpecialization::MNPadding)
-        {
-            // pad N, but not K
-            assert(KRaw % BK1 == 0);
-
-            const auto BK0 = KRaw / BK1;
-
-            const auto b_grid_desc_bk0_n_bk1 =
-                transform_tensor_descriptor(b_grid_desc_nraw_kraw,
-                                            make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
-                                                       make_right_pad_transform(NRaw, NPad)),
-                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
-                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-            return b_grid_desc_bk0_n_bk1;
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::KPadding ||
-                          GemmSpec == GemmSpecialization::MKPadding)
-        {
-            // pad K, but not N
-            assert(K % BK1 == 0);
-
-            const auto BK0 = K / BK1;
-
-            const auto b_grid_desc_n_k = transform_tensor_descriptor(
-                b_grid_desc_nraw_kraw,
-                make_tuple(make_pass_through_transform(NRaw), make_right_pad_transform(KRaw, KPad)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-            const auto b_grid_desc_bk0_n_bk1 =
-                transform_tensor_descriptor(b_grid_desc_n_k,
-                                            make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
-                                                       make_pass_through_transform(NRaw)),
-                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
-                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-            return b_grid_desc_bk0_n_bk1;
-        }
-        else
-        {
-            // not pad N or K
-            assert(KRaw % BK1 == 0);
-
-            const auto BK0 = KRaw / BK1;
-
-            const auto b_grid_desc_bk0_n_bk1 =
-                transform_tensor_descriptor(b_grid_desc_nraw_kraw,
-                                            make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
-                                                       make_pass_through_transform(NRaw)),
-                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
-                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-            return b_grid_desc_bk0_n_bk1;
-        }
-    }
-
-    static auto MakeCGridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t StrideC)
-    {
-        const auto c_grid_desc_mraw_nraw = [&]() {
-            if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw),
-                                                    make_tuple(StrideC, I1));
-            }
-            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, CLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw),
-                                                    make_tuple(I1, StrideC));
-            }
-        }();
-
-        const auto M = math::integer_divide_ceil(MRaw, MPerBlock) * MPerBlock;
-        const auto N = math::integer_divide_ceil(NRaw, NPerBlock) * NPerBlock;
-
-        const auto MPad = M - MRaw;
-        const auto NPad = N - NRaw;
-
-        if constexpr(GemmSpec == GemmSpecialization::MNPadding ||
-                     GemmSpec == GemmSpecialization::MNKPadding)
-        {
-            // pad M and N
-            return transform_tensor_descriptor(c_grid_desc_mraw_nraw,
-                                               make_tuple(make_right_pad_transform(MRaw, MPad),
-                                                          make_right_pad_transform(NRaw, NPad)),
-                                               make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                               make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::MPadding ||
-                          GemmSpec == GemmSpecialization::MKPadding)
-        {
-            // pad M, but not N
-            return transform_tensor_descriptor(
-                c_grid_desc_mraw_nraw,
-                make_tuple(make_right_pad_transform(MRaw, MPad), make_pass_through_transform(NRaw)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::NPadding ||
-                          GemmSpec == GemmSpecialization::NKPadding)
-        {
-            // pad N, but not M
-            return transform_tensor_descriptor(
-                c_grid_desc_mraw_nraw,
-                make_tuple(make_pass_through_transform(MRaw), make_right_pad_transform(NRaw, NPad)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else
-        {
-            // not pad M or N
-            return c_grid_desc_mraw_nraw;
-        }
-    }
-
-    using AGridDesc_AK0_M_AK1 = decltype(MakeAGridDescriptor_AK0_M_AK1(1, 1, 1));
-    using BGridDesc_BK0_N_BK1 = decltype(MakeBGridDescriptor_BK0_N_BK1(1, 1, 1));
-    using CGridDesc_M_N       = decltype(MakeCGridDescriptor_M_N(1, 1, 1));
-
     // GridwiseGemm
     using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1<
+        ALayout,
+        BLayout,
+        CLayout,
         ADataType, // TODO: distinguish A/B datatype
         GemmAccDataType,
         CShuffleDataType,
@@ -359,10 +94,8 @@ struct DeviceGemm_Xdl_CShuffle : public DeviceGemm<ALayout,
         AElementwiseOperation,
         BElementwiseOperation,
         CElementwiseOperation,
+        GemmSpec,
         InMemoryDataOperationEnum::Set,
-        AGridDesc_AK0_M_AK1,
-        BGridDesc_BK0_N_BK1,
-        CGridDesc_M_N,
         NumGemmKPrefetchStage,
         BlockSize,
         MPerBlock,
@@ -397,162 +130,43 @@ struct DeviceGemm_Xdl_CShuffle : public DeviceGemm<ALayout,
         LoopSched,
         PipelineVer>;
 
-    // Argument
-    struct Argument : public BaseArgument
-    {
-        Argument(const ADataType* p_a_grid,
-                 const BDataType* p_b_grid,
-                 CDataType* p_c_grid,
-                 index_t MRaw,
-                 index_t NRaw,
-                 index_t KRaw,
-                 index_t StrideA,
-                 index_t StrideB,
-                 index_t StrideC,
-                 AElementwiseOperation a_element_op,
-                 BElementwiseOperation b_element_op,
-                 CElementwiseOperation c_element_op)
-            : p_a_grid_{p_a_grid},
-              p_b_grid_{p_b_grid},
-              p_c_grid_{p_c_grid},
-              a_grid_desc_ak0_m_ak1_{DeviceOp::MakeAGridDescriptor_AK0_M_AK1(MRaw, KRaw, StrideA)},
-              b_grid_desc_bk0_n_bk1_{DeviceOp::MakeBGridDescriptor_BK0_N_BK1(KRaw, NRaw, StrideB)},
-              c_grid_desc_m_n_{DeviceOp::MakeCGridDescriptor_M_N(MRaw, NRaw, StrideC)},
-              c_grid_desc_mblock_mperblock_nblock_nperblock_{},
-              block_2_ctile_map_{GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_)},
-              a_element_op_{a_element_op},
-              b_element_op_{b_element_op},
-              c_element_op_{c_element_op},
-              kraw_{KRaw}
-        {
-            if(GridwiseGemm::CheckValidity(a_grid_desc_ak0_m_ak1_,
-                                           b_grid_desc_bk0_n_bk1_,
-                                           c_grid_desc_m_n_,
-                                           block_2_ctile_map_))
-            {
-                c_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                    GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                        c_grid_desc_m_n_);
-            }
-        }
-
-        //  private:
-        const ADataType* p_a_grid_;
-        const BDataType* p_b_grid_;
-        CDataType* p_c_grid_;
-        AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_;
-        BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_;
-        CGridDesc_M_N c_grid_desc_m_n_;
-        typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-            c_grid_desc_mblock_mperblock_nblock_nperblock_;
-        typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_;
-        AElementwiseOperation a_element_op_;
-        BElementwiseOperation b_element_op_;
-        CElementwiseOperation c_element_op_;
-        index_t kraw_;
-    };
+    using Argument = typename GridwiseGemm::Argument;
 
     // Invoker
     struct Invoker : public BaseInvoker
     {
-        using Argument = DeviceOp::Argument;
-
         float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
-#if DEBUG_LOG
+            if(stream_config.log_level_ > 0)
             {
-                std::cout << "arg.a_grid_desc_ak0_m_ak1_{"
-                          << arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) << ", "
-                          << arg.a_grid_desc_ak0_m_ak1_.GetLength(I1) << ", "
-                          << arg.a_grid_desc_ak0_m_ak1_.GetLength(I2) << "}" << std::endl;
-
-                std::cout << "arg.b_grid_desc_bk0_n_bk1_{"
-                          << arg.b_grid_desc_bk0_n_bk1_.GetLength(I0) << ", "
-                          << arg.b_grid_desc_bk0_n_bk1_.GetLength(I1) << ", "
-                          << arg.b_grid_desc_bk0_n_bk1_.GetLength(I2) << "}" << std::endl;
-
-                std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", "
-                          << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
+                arg.Print();
             }
-#endif
 
-            if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
-                                            arg.b_grid_desc_bk0_n_bk1_,
-                                            arg.c_grid_desc_m_n_,
-                                            arg.block_2_ctile_map_))
+            if(!GridwiseGemm::CheckValidity(arg))
             {
                 throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
             }
 
-            const index_t grid_size =
-                arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_);
-            const auto K =
-                arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2);
+            index_t gdx, gdy, gdz;
+            std::tie(gdx, gdy, gdz) = GridwiseGemm::CalculateGridSize(arg.M, arg.N);
+
+            const auto K = GridwiseGemm::CalculateAK0(arg.K) * AK1;
 
             float ave_time = 0;
 
             if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
             {
-                const auto kernel = kernel_gemm_xdl_cshuffle_v1<
-                    GridwiseGemm,
-                    ADataType, // TODO: distiguish A/B datatype
-                    CDataType,
-                    AElementwiseOperation,
-                    BElementwiseOperation,
-                    CElementwiseOperation,
-                    DeviceOp::AGridDesc_AK0_M_AK1,
-                    DeviceOp::BGridDesc_BK0_N_BK1,
-                    typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
-                    typename GridwiseGemm::DefaultBlock2CTileMap,
-                    true>;
-
-                ave_time =
-                    launch_and_time_kernel(stream_config,
-                                           kernel,
-                                           dim3(grid_size),
-                                           dim3(BlockSize),
-                                           0,
-                                           arg.p_a_grid_,
-                                           arg.p_b_grid_,
-                                           arg.p_c_grid_,
-                                           arg.a_element_op_,
-                                           arg.b_element_op_,
-                                           arg.c_element_op_,
-                                           arg.a_grid_desc_ak0_m_ak1_,
-                                           arg.b_grid_desc_bk0_n_bk1_,
-                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                           arg.block_2_ctile_map_);
+                const auto kernel = kernel_gemm_xdl_cshuffle_v1<GridwiseGemm, true>;
+
+                ave_time = launch_and_time_kernel(
+                    stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, arg);
             }
             else
             {
-                const auto kernel = kernel_gemm_xdl_cshuffle_v1<
-                    GridwiseGemm,
-                    ADataType, // TODO: distiguish A/B datatype
-                    CDataType,
-                    AElementwiseOperation,
-                    BElementwiseOperation,
-                    CElementwiseOperation,
-                    DeviceOp::AGridDesc_AK0_M_AK1,
-                    DeviceOp::BGridDesc_BK0_N_BK1,
-                    typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
-                    typename GridwiseGemm::DefaultBlock2CTileMap,
-                    false>;
-                ave_time =
-                    launch_and_time_kernel(stream_config,
-                                           kernel,
-                                           dim3(grid_size),
-                                           dim3(BlockSize),
-                                           0,
-                                           arg.p_a_grid_,
-                                           arg.p_b_grid_,
-                                           arg.p_c_grid_,
-                                           arg.a_element_op_,
-                                           arg.b_element_op_,
-                                           arg.c_element_op_,
-                                           arg.a_grid_desc_ak0_m_ak1_,
-                                           arg.b_grid_desc_bk0_n_bk1_,
-                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                           arg.block_2_ctile_map_);
+                const auto kernel = kernel_gemm_xdl_cshuffle_v1<GridwiseGemm, false>;
+
+                ave_time = launch_and_time_kernel(
+                    stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, arg);
             }
 
             return ave_time;
@@ -580,19 +194,15 @@ struct DeviceGemm_Xdl_CShuffle : public DeviceGemm<ALayout,
             return false;
         }
 
-        if((arg.kraw_ % AK1 != 0 || arg.kraw_ % BK1 != 0) &&
-           !(GemmSpec == GemmSpecialization::MKPadding ||
-             GemmSpec == GemmSpecialization::NKPadding ||
-             GemmSpec == GemmSpecialization::MNKPadding ||
-             GemmSpec == GemmSpecialization::KPadding))
+        if((arg.K % AK1 != 0 || arg.K % BK1 != 0) && !(GemmSpec == GemmSpecialization::MKPadding ||
+                                                       GemmSpec == GemmSpecialization::NKPadding ||
+                                                       GemmSpec == GemmSpecialization::MNKPadding ||
+                                                       GemmSpec == GemmSpecialization::KPadding))
         {
             return false;
         }
 
-        return GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
-                                           arg.b_grid_desc_bk0_n_bk1_,
-                                           arg.c_grid_desc_m_n_,
-                                           arg.block_2_ctile_map_);
+        return GridwiseGemm::CheckValidity(arg);
     }
 
     // polymorphic
@@ -604,28 +214,17 @@ struct DeviceGemm_Xdl_CShuffle : public DeviceGemm<ALayout,
     static auto MakeArgument(const ADataType* p_a,
                              const BDataType* p_b,
                              CDataType* p_c,
-                             index_t MRaw,
-                             index_t NRaw,
-                             index_t KRaw,
+                             index_t M,
+                             index_t N,
+                             index_t K,
                              index_t StrideA,
                              index_t StrideB,
                              index_t StrideC,
-                             AElementwiseOperation a_element_op,
-                             BElementwiseOperation b_element_op,
-                             CElementwiseOperation c_element_op)
+                             AElementwiseOperation,
+                             BElementwiseOperation,
+                             CElementwiseOperation)
     {
-        return Argument{p_a,
-                        p_b,
-                        p_c,
-                        MRaw,
-                        NRaw,
-                        KRaw,
-                        StrideA,
-                        StrideB,
-                        StrideC,
-                        a_element_op,
-                        b_element_op,
-                        c_element_op};
+        return Argument{p_a, p_b, p_c, M, N, K, StrideA, StrideB, StrideC};
     }
 
     static auto MakeInvoker() { return Invoker{}; }
@@ -634,28 +233,25 @@ struct DeviceGemm_Xdl_CShuffle : public DeviceGemm<ALayout,
     std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
                                                       const void* p_b,
                                                       void* p_c,
-                                                      index_t MRaw,
-                                                      index_t NRaw,
-                                                      index_t KRaw,
+                                                      index_t M,
+                                                      index_t N,
+                                                      index_t K,
                                                       index_t StrideA,
                                                       index_t StrideB,
                                                       index_t StrideC,
-                                                      AElementwiseOperation a_element_op,
-                                                      BElementwiseOperation b_element_op,
-                                                      CElementwiseOperation c_element_op) override
+                                                      AElementwiseOperation,
+                                                      BElementwiseOperation,
+                                                      CElementwiseOperation) override
     {
         return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
                                           static_cast<const BDataType*>(p_b),
                                           static_cast<CDataType*>(p_c),
-                                          MRaw,
-                                          NRaw,
-                                          KRaw,
+                                          M,
+                                          N,
+                                          K,
                                           StrideA,
                                           StrideB,
-                                          StrideC,
-                                          a_element_op,
-                                          b_element_op,
-                                          c_element_op);
+                                          StrideC);
     }
 
     // polymorphic
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp
index 8ee138f8277295cc2d87c17b9a087c54eed4bc64..14ac5420ae5e4caec36bdb5897adcda753997577 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_skip_b_lds.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_skip_b_lds.hpp
index 36b01f677f080e4f2f515738af6171d6ed4dbbcd..ef5e267819c7fd8ca3868f689fd7bf16dd50fbb8 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_skip_b_lds.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_skip_b_lds.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp
index 776f96e8e6d98aba3d59a46eafbc681a40e7b615..3801993412e7bf9e58c4ed14249d7db52606a464 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -73,6 +73,11 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
     static constexpr auto I2 = Number<2>{};
     static constexpr auto I3 = Number<3>{};
 
+    // TODO: should be exposed as Tparams.
+    static constexpr index_t NumGemmKPrefetchStage = 1;
+    static constexpr LoopScheduler LoopSched       = make_default_loop_scheduler();
+    static constexpr PipelineVersion PipelineVer   = PipelineVersion::v2;
+
     using GridwiseGemm = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2<
         BlockSize,
         ADataType, // TODO: distinguish A/B datatype
@@ -85,6 +90,7 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
         BElementwiseOperation,
         CElementwiseOperation,
         GemmSpec,
+        NumGemmKPrefetchStage,
         MPerBlock,
         NPerBlock,
         K0PerBlock,
@@ -112,7 +118,9 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
         CShuffleMRepeatPerShuffle,
         CShuffleNRepeatPerShuffle,
         CBlockTransferScalarPerVector_NWaveNPerXDL,
-        CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock>;
+        CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+        LoopSched,
+        PipelineVer>;
 
     using Argument              = typename GridwiseGemm::Argument;
     using DefaultBlock2CTileMap = typename GridwiseGemm::DefaultBlock2CTileMap;
@@ -257,7 +265,7 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                         StrideC,
                         GridwiseGemm::CalculateMPadded(M),
                         GridwiseGemm::CalculateNPadded(N),
-                        GridwiseGemm::CalculateKPadded(K),
+                        GridwiseGemm::CalculateKPadded(K, KBatch),
                         GridwiseGemm::CalculateK0(K, KBatch),
                         KBatch};
     }
@@ -290,7 +298,7 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                                           StrideC,
                                           GridwiseGemm::CalculateMPadded(M),
                                           GridwiseGemm::CalculateNPadded(N),
-                                          GridwiseGemm::CalculateKPadded(K),
+                                          GridwiseGemm::CalculateKPadded(K, KBatch),
                                           GridwiseGemm::CalculateK0(K, KBatch),
                                           KBatch);
     }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp
index 76dd5a36664eea74835146590faa8956d33f4a46..5775ff3971982e3098c64df2a1fe9c5a1c09bcef 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -652,11 +652,12 @@ struct DeviceGroupedContractionMultipleD_Xdl_CShuffle
                 }
             }
 
-            hipGetErrorString(hipMemcpy(arg.p_workspace_,
-                                        arg.contraction_multi_d_kernel_args_.data(),
-                                        arg.contraction_multi_d_kernel_args_.size() *
-                                            sizeof(ContractionMultiDKernelArg),
-                                        hipMemcpyHostToDevice));
+            hipGetErrorString(hipMemcpyWithStream(arg.p_workspace_,
+                                                  arg.contraction_multi_d_kernel_args_.data(),
+                                                  arg.contraction_multi_d_kernel_args_.size() *
+                                                      sizeof(ContractionMultiDKernelArg),
+                                                  hipMemcpyHostToDevice,
+                                                  stream_config.stream_id_));
 
             float ave_time = 0;
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
index 92c20a308f9bc3b8c5f98581ce3e627d30420ebb..3f6238d21f5698c1248b74b27cb4fa5d02636370 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp
index c921c9f1b400f76627376d4865498aa27e7b49c3..3bcc7bd64b83d09df88999c54cdcb35cbcfc70ad 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r.hpp
index 03185d5b1d29186f2b2814257eacfe90da6c5e02..face627e1fb85d0621823803eabc5409b0e73d61 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp
index de40d71293038b281d1b4c119140883e1c38ac7f..71e4e28bf29c1445ac31fee5cb9d43266568b02f 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp
index 9d4b68c0b6429aacd16106d1088db5e1c1f74d4f..526108b87442b4077bc9723820edcd09b7fd76f3 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp
index e88bf8ed758cb685feb0b51ee62cdbf7c7189ad7..22be58259adfc839c122a7d8a1769f72897ad8e9 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp
@@ -1,6 +1,6 @@
 #pragma once
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -597,10 +597,12 @@ struct DeviceGroupedGemmMultipleD_Dl : public DeviceGroupedGemm<ALayout,
                 }
             }
 
-            hipGetErrorString(hipMemcpy(arg.p_workspace_,
-                                        arg.gemm_desc_kernel_arg_.data(),
-                                        arg.gemm_desc_kernel_arg_.size() * sizeof(GemmKernelArg),
-                                        hipMemcpyHostToDevice));
+            hipGetErrorString(
+                hipMemcpyWithStream(arg.p_workspace_,
+                                    arg.gemm_desc_kernel_arg_.data(),
+                                    arg.gemm_desc_kernel_arg_.size() * sizeof(GemmKernelArg),
+                                    hipMemcpyHostToDevice,
+                                    stream_config.stream_id_));
 
             auto launch_kernel = [&](auto has_main_k_block_loop,
                                      auto has_double_tail_k_block_loop) {
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp
index e3795060be140b9e6a53cac1dceff19a2d132ff1..3900047567021b9f1743e9645466e6357fdbf534 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp
@@ -1,6 +1,6 @@
 #pragma once
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -548,11 +548,12 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm<ALayout,
                 }
             }
 
-            hipGetErrorString(
-                hipMemcpy(arg.p_workspace_,
-                          arg.gemm_desc_kernel_arg_.data(),
-                          arg.gemm_desc_kernel_arg_.size() * sizeof(GemmBiasTransKernelArg),
-                          hipMemcpyHostToDevice));
+            hipGetErrorString(hipMemcpyWithStream(arg.p_workspace_,
+                                                  arg.gemm_desc_kernel_arg_.data(),
+                                                  arg.gemm_desc_kernel_arg_.size() *
+                                                      sizeof(GemmBiasTransKernelArg),
+                                                  hipMemcpyHostToDevice,
+                                                  stream_config.stream_id_));
 
             float ave_time = 0;
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
index 467a8429ab4812fd377bc51f21eb9e13d03a064b..1ac9969f86d91ef549664433724ebbd57dae3bc8 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -85,7 +85,7 @@ template <typename ALayout,
           typename BElementwiseOperation,
           typename CDEElementwiseOperation,
           GemmSpecialization GemmSpec,
-          ck::index_t NumPrefetch,
+          ck::index_t NumGemmKPrefetchStage,
           ck::index_t BlockSize,
           ck::index_t MPerBlock,
           ck::index_t NPerBlock,
@@ -152,6 +152,7 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
         BElementwiseOperation,
         CDEElementwiseOperation,
         GemmSpec,
+        NumGemmKPrefetchStage,
         MPerBlock,
         NPerBlock,
         K0PerBlock,
@@ -179,7 +180,9 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
         CShuffleMXdlPerWavePerShuffle,
         CShuffleNXdlPerWavePerShuffle,
         CDEBlockTransferScalarPerVector_NPerBlock,
-        CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock>;
+        CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+        LoopSched,
+        PipelineVersion::v2>;
 
     using CGridDesc_M_N = typename GridwiseGemm::CGridDesc_M_N;
     using Block2ETileMapKSplit =
@@ -265,8 +268,7 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
                 const index_t k_padded = GridwiseGemm::CalculateKPadded(K, K_BATCH);
                 const index_t k0       = GridwiseGemm::CalculateK0(K, K_BATCH);
 
-                const auto c_grid_desc_m_n =
-                    GridwiseGemm::MakeCGridDescriptor_M_N(M, N, m_padded, n_padded, stride_c);
+                const auto c_grid_desc_m_n = GridwiseGemm::MakeCGridDescriptor_M_N(M, N, stride_c);
 
                 const auto local_b2c_tile_map =
                     Block2ETileMapKSplit{c_grid_desc_m_n, B2E_M01, K_BATCH};
@@ -319,8 +321,8 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
                 const index_t k_padded = GridwiseGemm::CalculateKPadded(karg.K, K_BATCH);
                 const index_t k0       = GridwiseGemm::CalculateK0(karg.K, K_BATCH);
 
-                const auto c_grid_desc_m_n = GridwiseGemm::MakeCGridDescriptor_M_N(
-                    karg.M, karg.N, karg.MPadded, karg.NPadded, karg.StrideC);
+                const auto c_grid_desc_m_n =
+                    GridwiseGemm::MakeCGridDescriptor_M_N(karg.M, karg.N, karg.StrideC);
 
                 const auto local_b2c_tile_map =
                     Block2ETileMapKSplit{c_grid_desc_m_n, B2E_M01, K_BATCH};
@@ -404,10 +406,12 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
                 }
             }
 
-            hip_check_error(hipMemcpy(arg.p_workspace_,
-                                      arg.gemm_kernel_args_.data(),
-                                      arg.gemm_kernel_args_.size() * sizeof(GemmTransKernelArg),
-                                      hipMemcpyHostToDevice));
+            hip_check_error(
+                hipMemcpyWithStream(arg.p_workspace_,
+                                    arg.gemm_kernel_args_.data(),
+                                    arg.gemm_kernel_args_.size() * sizeof(GemmTransKernelArg),
+                                    hipMemcpyHostToDevice,
+                                    stream_config.stream_id_));
 
             float ave_time = 0;
 
@@ -501,6 +505,11 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
         if((ck::type_convert<ck::index_t>(arg.gemm_kernel_args_.size()) +
             arg.skipped_group_count_) != arg.group_count_)
         {
+#if DEBUG_LOG
+            std::cout << "The group count is not equal to sum of skipped groups "
+                         "and kernel args size!"
+                      << std::endl;
+#endif // DEBUG_LOG
             return false;
         }
 
@@ -509,14 +518,15 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
         {
             const auto& a        = arg.gemm_kernel_args_[i].karg_;
             bool group_arg_valid = GridwiseGemm::CheckValidity(a);
-#if DEBUG_LOG
             if(not group_arg_valid)
             {
-                std::cout << "[" << __func__ << "] group id: " << i << " is not supported!\n";
+#if DEBUG_LOG
+                std::cout << "[" << __func__ << "] group id: " << i
+                          << " has invalid GridwiseGemm settings!" << std::endl;
                 a.Print();
-            }
 #endif // DEBUG_LOG
-            supported &= group_arg_valid;
+            }
+            supported = supported && group_arg_valid;
         }
         return supported;
     }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_multiblock.hpp b/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_multiblock.hpp
index b49e1096829695f7f7d5b0f651a20f7aebc03f6b..aec5a65ccf41eafcbe6ebdf790cc75b0adbf2966 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_multiblock.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_multiblock.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_threadwise.hpp b/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_threadwise.hpp
index 17a96e9f6f6c8039485afcab82d5908d92a59a24..6d1d5c8e239de8f57df1baa82d4981e8f30e54b6 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_threadwise.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_threadwise.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp
index 6a8037a3249c4e5b8ebc6a72696623f8d4022f06..ea0d805043e07fe3d818fbb7bf7ef42e952496a9 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_normalization_splitk_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_normalization_splitk_impl.hpp
index 0026a8759363b88cbec59cf010d9cf5c94cad753..8b2b3c41bfdf3f4793263dd65fc5f6dc941eb13b 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_normalization_splitk_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_normalization_splitk_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_permute_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_permute_impl.hpp
index 7b96373c0ff1fc86a79c4248db35dd5812f2c5b1..17dab08332166e23a93d5c8f25ecfec7556519ac 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_permute_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_permute_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp b/include/ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp
index 6933db68dccd83fe4e036a0609b7eaa7517a3902..3f27c629dd6ac88a41b0072a1f51afc3b3fb3b25 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_pool3d_fwd_ndhwc_ndhwc.hpp b/include/ck/tensor_operation/gpu/device/impl/device_pool3d_fwd_ndhwc_ndhwc.hpp
index d330fda8ce4ea8dd290f3a393dd4004ae55d9cc2..0ab6c247584881bd72723b6cda2f412aea9a2e1c 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_pool3d_fwd_ndhwc_ndhwc.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_pool3d_fwd_ndhwc_ndhwc.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp b/include/ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp
index 5dc051be3cb7f23fef076921e826a7e55e017d76..2481c5c76971db6bb2ddc390510089af5f656ae6 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp b/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp
index c7868537fe8220c640ba24d346eea5dc219490fa..bf3deeb57acbdff95995f5da096ab574140bcb42 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp b/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp
index aa255da645e53e0f0a6bf9578d0c64b538b30c8e..6c5895b010659cafb98e4a78bc6587d0222e02bd 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp
index ed96b7340cfc6c78468d8fb9bfb2df21243918ef..4aa02dfd322b622ea5684fb05157e457d89a9eec 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_sparse_embeddings_forward_layernorm.hpp b/include/ck/tensor_operation/gpu/device/impl/device_sparse_embeddings_forward_layernorm.hpp
index 2f29224a754057430cd68bb551807226d40a6cb5..7a62ec04650a94d1b9f458e425ddf3b14881091d 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_sparse_embeddings_forward_layernorm.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_sparse_embeddings_forward_layernorm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/masking_specialization.hpp b/include/ck/tensor_operation/gpu/device/masking_specialization.hpp
index ea0f5897a757c408047b20e0cd4285e13f67c4f5..d6d6f74abdb5a63ef8a841d033c9f47ec59bd07e 100644
--- a/include/ck/tensor_operation/gpu/device/masking_specialization.hpp
+++ b/include/ck/tensor_operation/gpu/device/masking_specialization.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/matrix_padder.hpp b/include/ck/tensor_operation/gpu/device/matrix_padder.hpp
index 70e61bc7728f9638e5bef874ab0d5caad2db5028..c66d2fc516babe3aa5a16de053d9ada89eabf8ef 100644
--- a/include/ck/tensor_operation/gpu/device/matrix_padder.hpp
+++ b/include/ck/tensor_operation/gpu/device/matrix_padder.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp b/include/ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp
index d35318357a9bcc6fb087c92e27dcf435ff60796b..5351d4ef2439307ad5623f4172217937812ab29a 100644
--- a/include/ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp
+++ b/include/ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/tensor_layout.hpp b/include/ck/tensor_operation/gpu/device/tensor_layout.hpp
index b44427411f9a28511fd16f6b47ee37fa84a0e2d3..b2d141fd61aa4550ef8b572e548688120634b8b9 100644
--- a/include/ck/tensor_operation/gpu/device/tensor_layout.hpp
+++ b/include/ck/tensor_operation/gpu/device/tensor_layout.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/tensor_specialization.hpp b/include/ck/tensor_operation/gpu/device/tensor_specialization.hpp
index 0ec0df2c9bbbb2aac987d681a36d5b00a8349c97..713fc93ebb4eec9ce0301affd667656bf0b6c999 100644
--- a/include/ck/tensor_operation/gpu/device/tensor_specialization.hpp
+++ b/include/ck/tensor_operation/gpu/device/tensor_specialization.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/device/welford_helper.hpp b/include/ck/tensor_operation/gpu/device/welford_helper.hpp
index 6c909b767d4602914660d045b9aaf5a2307ea57b..d7772d8764ff3cc66572775f750997fae944800e 100644
--- a/include/ck/tensor_operation/gpu/device/welford_helper.hpp
+++ b/include/ck/tensor_operation/gpu/device/welford_helper.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
index 136017c6d17027a67fee3b493a20a8d3fabe2bd4..1dd96809d5f79ed4921fa6807625727279107df7 100644
--- a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
index ceb2b665b91529f0a97cfb1209b2de40744ee466..3fdb391a0215e83328905b77148b675defc27af7 100644
--- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
index ef250b8bfd1cbefabcdc3b95a0d94e0c6168c762..c3e7706ef3f55908fe165136939d9cff5c8eec01 100644
--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp b/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp
index a72a4ee068f2bbcb1a915df0368c9b05caa298f9..e73e7e681721c4558c7b58b0bce962b6e156ec58 100644
--- a/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp
+++ b/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_first_half.hpp b/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_first_half.hpp
index 08cb0dd191303813a9283b19e38e2952fa0ccf27..fc263138f9dd7ed3057e5259d015b70450abbf21 100644
--- a/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_first_half.hpp
+++ b/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_first_half.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_batchnorm_forward_final.hpp b/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_batchnorm_forward_final.hpp
index 548d7fd40ac70fe7fde5e5973a177600b0941c5c..1f8990e6df835115ce0d471134553a14494c1b82 100644
--- a/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_batchnorm_forward_final.hpp
+++ b/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_batchnorm_forward_final.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_multiblock_reduce_first_half.hpp b/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_multiblock_reduce_first_half.hpp
index 42b7e172b239e69ae0ca840dd80d8bb203a5480f..6fe78edb34df290de485f0d015c3ccd86e82b65b 100644
--- a/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_multiblock_reduce_first_half.hpp
+++ b/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_multiblock_reduce_first_half.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
index 9bd860f397d37f49fba2c99f25dddf1c94dec374..5c8b9f419ae11d62a23f83320a8659122fb5d52d 100644
--- a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
+++ b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -109,30 +109,57 @@ struct BlockToCTileMap_M00_N0_M01
 
 // Rows of column-vectors
 // This C-tile map dynamically adjusts M01 when C-tile index is out of range
-template <index_t MPerBlock, index_t NPerBlock, typename CGridDesc_M_N>
-struct BlockToCTileMap_M00_N0_M01Adapt
+template <index_t MPerBlock, index_t NPerBlock, typename CGridDesc_M_N = void>
+struct BlockToCTileMap_M00_N0_M01Adapt;
+
+template <index_t MPerBlock, index_t NPerBlock>
+struct BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, void>
 {
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};
-    static constexpr auto I2 = Number<2>{};
-    static constexpr auto I3 = Number<3>{};
 
     __host__ __device__ BlockToCTileMap_M00_N0_M01Adapt() = default;
 
+    __host__ __device__ BlockToCTileMap_M00_N0_M01Adapt(const BlockToCTileMap_M00_N0_M01Adapt&) =
+        default;
+    __host__ __device__ BlockToCTileMap_M00_N0_M01Adapt(BlockToCTileMap_M00_N0_M01Adapt&&) =
+        default;
+    __host__ __device__ BlockToCTileMap_M00_N0_M01Adapt&
+    operator=(const BlockToCTileMap_M00_N0_M01Adapt&) = default;
+    __host__ __device__ BlockToCTileMap_M00_N0_M01Adapt&
+    operator=(BlockToCTileMap_M00_N0_M01Adapt&&) = default;
+
+    __host__ __device__ BlockToCTileMap_M00_N0_M01Adapt(index_t M, index_t N, index_t M01 = 8)
+        : M_(M), N_(N), M01_(M01)
+    {
+    }
+
+    template <typename CGridDesc_M_N>
     __host__ __device__ BlockToCTileMap_M00_N0_M01Adapt(const CGridDesc_M_N& c_grid_desc_m_n,
                                                         index_t M01 = 8)
-        : M01_(M01), c_grid_desc_m_n_(c_grid_desc_m_n)
+        : BlockToCTileMap_M00_N0_M01Adapt(
+              c_grid_desc_m_n.GetLength(I0), c_grid_desc_m_n.GetLength(I1), M01)
     {
     }
 
-    __host__ constexpr index_t CalculateGridSize(const CGridDesc_M_N& c_grid_desc_m_n) const
+    __host__ static constexpr index_t CalculateGridSize(index_t M, index_t N)
     {
-        const auto M0 = math::integer_divide_ceil(c_grid_desc_m_n.GetLength(I0), MPerBlock);
-        const auto N0 = math::integer_divide_ceil(c_grid_desc_m_n.GetLength(I1), NPerBlock);
+        const auto M0 = math::integer_divide_ceil(M, MPerBlock);
+        const auto N0 = math::integer_divide_ceil(N, NPerBlock);
 
-        const index_t grid_size = M0 * N0;
+        return M0 * N0;
+    }
 
-        return grid_size;
+    template <typename CGridDesc_M_N>
+    __host__ static constexpr index_t CalculateGridSize(const CGridDesc_M_N& c_grid_desc_m_n)
+    {
+        return CalculateGridSize(c_grid_desc_m_n.GetLength(I0), c_grid_desc_m_n.GetLength(I1));
+    }
+
+    template <typename CGridDesc_M_N>
+    __host__ bool CheckValidity(const CGridDesc_M_N& /* c_grid_desc_m_n */) const
+    {
+        return true;
     }
 
     template <typename TopIdx>
@@ -140,8 +167,8 @@ struct BlockToCTileMap_M00_N0_M01Adapt
     {
         auto block_1d_id = idx_top[I0];
 
-        const auto M0 = math::integer_divide_ceil(c_grid_desc_m_n_.GetLength(I0), MPerBlock);
-        const auto N0 = math::integer_divide_ceil(c_grid_desc_m_n_.GetLength(I1), NPerBlock);
+        const auto M0 = math::integer_divide_ceil(M_, MPerBlock);
+        const auto N0 = math::integer_divide_ceil(N_, NPerBlock);
 
         block_1d_id = block_1d_id % (M0 * N0); // swallow batch index
 
@@ -209,11 +236,18 @@ struct BlockToCTileMap_M00_N0_M01Adapt
         return true; // always valid provided that user gets grid size from CalculateGridSize()
     }
 
-    __host__ bool CheckValidity(const CGridDesc_M_N& /* c_grid_desc_m_n */) const { return true; }
-
     private:
+    index_t M_;
+    index_t N_;
     index_t M01_;
-    CGridDesc_M_N c_grid_desc_m_n_;
+};
+
+// keep the redundant type argument for backward compatibility
+template <index_t MPerBlock, index_t NPerBlock, typename CGridDesc_M_N>
+struct BlockToCTileMap_M00_N0_M01Adapt : BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, void>
+{
+    using BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, void>::
+        BlockToCTileMap_M00_N0_M01Adapt;
 };
 
 // 2D slices of column-vectors in 3D space
diff --git a/include/ck/tensor_operation/gpu/grid/gemm_layernorm/gridwise_gemm_multiple_d_welford_first_half_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gemm_layernorm/gridwise_gemm_multiple_d_welford_first_half_xdl_cshuffle.hpp
index aa34cfbf84ab9425a236646400c1a965c8004f74..523e7f7c539da7ac6e5729c6e87b4a21f1bd3f6b 100644
--- a/include/ck/tensor_operation/gpu/grid/gemm_layernorm/gridwise_gemm_multiple_d_welford_first_half_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gemm_layernorm/gridwise_gemm_multiple_d_welford_first_half_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gemm_layernorm/gridwise_welford_second_half_layernorm2d.hpp b/include/ck/tensor_operation/gpu/grid/gemm_layernorm/gridwise_welford_second_half_layernorm2d.hpp
index fbe89e7e5e5936c2db1470e63f922cc7f6c5ff2a..69468c25befe036123cf1796798e6a59ea47f74b 100644
--- a/include/ck/tensor_operation/gpu/grid/gemm_layernorm/gridwise_welford_second_half_layernorm2d.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gemm_layernorm/gridwise_welford_second_half_layernorm2d.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_multiple_reduction_multiblock.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_multiple_reduction_multiblock.hpp
index bdebe3816f22d6a4f10e913210abc8c8347e20d0..bd1e0585fc96ad29ee9e5336993f2ba6570fb31d 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_multiple_reduction_multiblock.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_multiple_reduction_multiblock.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_multiple_reduction_threadwise.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_multiple_reduction_threadwise.hpp
index 1313ec9435e782ad9cc9b92580464f6d5257b93e..fc4f27e33b15b6bfcc7ce942764f9aa5e26cfe1f 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_multiple_reduction_threadwise.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_multiple_reduction_threadwise.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp
index 6836a66047531d9eef09dbffc58188f7696a783a..203be3c42d9d23c935d10f34ceb44245f7875ad0 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp
index 5986641c6f72326bfa7a718c9bdfce57b1949748..910c926c7e470e0a7daa1cd25f57716f5ce4af6b 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp
index fccb127d0f342911ed6bb1f8396c9d9cd0eec921..a8a1f803fcb260e3eba6ccf301f68b7638e2108b 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp
index b9f4a3080a0e17ca0ba169bc8b83c006acb2305d..59d6bad5d97dacfda3947868aa8c14d0eaf220cc 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_softmax_gemm_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_softmax_gemm_xdl_cshuffle_v1.hpp
index 6a6f19d71ef523697d69719de75b70bce3953e7e..74171ea9d99a9079f1056199dfadb83a3ea8543f 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_softmax_gemm_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_softmax_gemm_xdl_cshuffle_v1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp
index d6d2051113ff4fdebb74689293b00c532aae9c68..9eb2bf8aa81ea2689a4a8c71c2b78b070e701a4f 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_backward_blockwise_welford.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_backward_blockwise_welford.hpp
index ede6a96dc9f53045b5fc029f4c4f4f6aa4fa4817..ed1ffdd85765e84feb2f6824a858ab842487c18c 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_backward_blockwise_welford.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_backward_blockwise_welford.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_forward_blockwise_welford.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_forward_blockwise_welford.hpp
index 33c45a0f037b4c81fbaf810cc10e55f71a2ab254..b6c83af13a18639cf7f8312dd416dbf4d9cb5297 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_forward_blockwise_welford.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_forward_blockwise_welford.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp
index 8b82b65540d1c5612002549f72708be07b649dae..d686c14b350953a48bf132694251ac677fb47c4f 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp
index 05257d16275e5f9817ec5d9cbbd06b133f0b5e05..bf0e8c186c75cef05b08c7ef9e1bf16c016830a3 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 //
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_layernorm_welford_variance.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_layernorm_welford_variance.hpp
index b09a73590239609908265cd6b72e3e5bb26127aa..3ea72b85345869a938e52f872c2c586c6e36170f 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_layernorm_welford_variance.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_layernorm_welford_variance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp
index bebcdceb43521a3b489e0b4d04e847bcc6661b3d..c5c09e909ec6c49cb080bcd27fa41eeb23ce9226 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dl_multiple_d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dl_multiple_d.hpp
index 9c68b4f5c3ffbb13cd44701121fca7cc113d76cd..27f48a84ba72fe284a964ac295540eac1c4f6766 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dl_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dl_multiple_d.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp
index 578665ea85fb723565e597bbec26457ffb449606..7289a20da03523b0496353a0cea48e0568c50de0 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp
index d3f81566e93ab6b72f4edc7acfe097ef50e72270..119c1ea596dc6c7f3cff9b20a94c9f2dbe85e4d4 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp
index 98331d854490abe4e7f478d817104218cb49c28f..9090c16dcdd08ea39d1a2a591d8c605b20a55248 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp
index 3281b910d3d8782fc18f91ee77ce42aa44fc5ba9..f54345b04f2c8da02365d3e123a660b04f51ad58 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp
index a3f5324713c2d61e94434370ee9040ab12475279..bb3e6a80bf1e9dfc83c396d91dee142b42ea4b8b 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle.hpp
index aa89bff9ee21873fdc57c839eb800be455005a9a..e7577bdcbd52b87cf986e0c9f92cc494089b9423 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_waveletmodel.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_waveletmodel.hpp
index 2d3a36fca08d7811e406a89b398f401b741a7649..de5a4241986ff64e1797f2304da117178059ea4b 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_waveletmodel.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_waveletmodel.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp
index 397ae1c1b9b7b9763b7eb39855ddf4c752db949d..740dde5c65f3d02282d6a0e5b05d41c6591de72f 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
index 1213cdc263909c6e15c134d5d7962aeb02f84a91..d5552656e39754d566c74ec904a513b5fdf30c04 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -17,17 +17,25 @@
 
 namespace ck {
 
-template <typename GridwiseGemm,
-          typename FloatAB,
-          typename FloatC,
-          typename AElementwiseOperation,
-          typename BElementwiseOperation,
-          typename CElementwiseOperation,
-          typename AGridDesc_AK0_M_AK1,
-          typename BGridDesc_BK0_N_BK1,
-          typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
-          typename Block2CTileMap,
-          bool HasMainKBlockLoop>
+template <typename GridwiseGemm, bool HasMainKBlockLoop>
+__global__ void
+#if CK_USE_LAUNCH_BOUNDS
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+#endif
+        kernel_gemm_xdl_cshuffle_v1(typename GridwiseGemm::Argument karg)
+{
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
+    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+
+    GridwiseGemm::template Run<HasMainKBlockLoop>(
+        karg.p_a_grid, karg.p_b_grid, karg.p_c_grid, p_shared, karg);
+#else
+    ignore = karg;
+#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
+}
+
+template <typename GridwiseGemm, typename FloatAB, typename FloatC, bool HasMainKBlockLoop>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
     __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
@@ -35,55 +43,33 @@ __global__ void
         kernel_gemm_xdl_cshuffle_v1(const FloatAB* __restrict__ p_a_grid,
                                     const FloatAB* __restrict__ p_b_grid,
                                     FloatC* __restrict__ p_c_grid,
-                                    const AElementwiseOperation a_element_op,
-                                    const BElementwiseOperation b_element_op,
-                                    const CElementwiseOperation c_element_op,
-                                    const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
-                                    const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
-                                    const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-                                        c_grid_desc_mblock_mperblock_nblock_nperblock,
-                                    const Block2CTileMap block_2_ctile_map)
+                                    typename GridwiseGemm::Problem problem)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
     defined(__gfx940__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
-                                                  p_b_grid,
-                                                  p_c_grid,
-                                                  p_shared,
-                                                  a_element_op,
-                                                  b_element_op,
-                                                  c_element_op,
-                                                  a_grid_desc_ak0_m_ak1,
-                                                  b_grid_desc_bk0_n_bk1,
-                                                  c_grid_desc_mblock_mperblock_nblock_nperblock,
-                                                  block_2_ctile_map);
+    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid, p_b_grid, p_c_grid, p_shared, problem);
 #else
     ignore = p_a_grid;
     ignore = p_b_grid;
     ignore = p_c_grid;
-    ignore = a_element_op;
-    ignore = b_element_op;
-    ignore = c_element_op;
-    ignore = a_grid_desc_ak0_m_ak1;
-    ignore = b_grid_desc_bk0_n_bk1;
-    ignore = c_grid_desc_mblock_mperblock_nblock_nperblock;
-    ignore = block_2_ctile_map;
+    ignore = problem;
 #endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
 }
 
-template <typename FloatAB,
+template <typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename FloatAB,
           typename FloatGemmAcc,
           typename FloatCShuffle,
           typename FloatC,
           typename AElementwiseOperation,
           typename BElementwiseOperation,
           typename CElementwiseOperation,
+          tensor_operation::device::GemmSpecialization GemmSpec,
           InMemoryDataOperationEnum CGlobalMemoryDataOperation,
-          typename AGridDesc_AK0_M_AK1,
-          typename BGridDesc_BK0_N_BK1,
-          typename CGridDesc_M_N,
           index_t NumGemmKPrefetchStage,
           index_t BlockSize,
           index_t MPerBlock,
@@ -129,35 +115,396 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
     static constexpr auto I7 = Number<7>{};
 
     // K1 should be Number<...>
-    static constexpr auto AK0 = Number<KPerBlock / AK1Value>{};
-    static constexpr auto BK0 = Number<KPerBlock / BK1Value>{};
-    static constexpr auto AK1 = Number<AK1Value>{};
-    static constexpr auto BK1 = Number<BK1Value>{};
+    static constexpr auto AK0Number = Number<KPerBlock / AK1Value>{};
+    static constexpr auto BK0Number = Number<KPerBlock / BK1Value>{};
+    static constexpr auto AK1Number = Number<AK1Value>{};
+    static constexpr auto BK1Number = Number<BK1Value>{};
 
     using ThisThreadBlock = ThisThreadBlock<BlockSize>;
 
+    __host__ static auto CalculateGridSize(index_t M, index_t N)
+    {
+        return std::make_tuple(Block2CTileMap::CalculateGridSize(M, N), 1, 1);
+    }
+
+    __host__ static auto CalculateMPadded(index_t M)
+    {
+        return math::integer_divide_ceil(M, MPerBlock) * MPerBlock;
+    }
+
+    __host__ static auto CalculateNPadded(index_t N)
+    {
+        return math::integer_divide_ceil(N, NPerBlock) * NPerBlock;
+    }
+
+    __host__ static auto CalculateKPadded(index_t K)
+    {
+        return math::integer_divide_ceil(K, KPerBlock) * KPerBlock;
+    }
+
+    __host__ static auto CalculateAK0(index_t K)
+    {
+        using GemmSpecialization = tensor_operation::device::GemmSpecialization;
+
+        if constexpr(GemmSpec == GemmSpecialization::MKPadding ||
+                     GemmSpec == GemmSpecialization::MNKPadding ||
+                     GemmSpec == GemmSpecialization::KPadding ||
+                     GemmSpec == GemmSpecialization::NKPadding)
+        {
+            return CalculateKPadded(K) / AK1Value;
+        }
+        else
+        {
+            return K / AK1Value;
+        }
+    }
+
+    __host__ static auto CalculateBK0(index_t K)
+    {
+        using GemmSpecialization = tensor_operation::device::GemmSpecialization;
+
+        if constexpr(GemmSpec == GemmSpecialization::NKPadding ||
+                     GemmSpec == GemmSpecialization::MNKPadding ||
+                     GemmSpec == GemmSpecialization::KPadding ||
+                     GemmSpec == GemmSpecialization::MKPadding)
+        {
+            return CalculateKPadded(K) / BK1Value;
+        }
+        else
+        {
+            return K / BK1Value;
+        }
+    }
+
+    __host__ static auto CalculateMBlock(index_t M)
+    {
+        return math::integer_divide_floor(M, MPerBlock);
+    }
+
+    __host__ static auto CalculateNBlock(index_t N)
+    {
+        return math::integer_divide_floor(N, NPerBlock);
+    }
+
+    __device__ static auto MakeAGridDescriptor_AK0_M_AK1(
+        index_t M, index_t MPad, index_t K, index_t KPad, index_t StrideA, index_t AK0)
+    {
+        const auto a_grid_desc_mraw_kraw = [&]() {
+            if constexpr(is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1));
+            }
+            else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA));
+            }
+        }();
+
+        using GemmSpecialization = tensor_operation::device::GemmSpecialization;
+
+        if constexpr(GemmSpec == GemmSpecialization::MKPadding ||
+                     GemmSpec == GemmSpecialization::MNKPadding)
+        {
+            // pad both M and K
+            const auto a_grid_desc_m_k =
+                transform_tensor_descriptor(a_grid_desc_mraw_kraw,
+                                            make_tuple(make_right_pad_transform(M, MPad - M),
+                                                       make_right_pad_transform(K, KPad - K)),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor(
+                a_grid_desc_m_k,
+                make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)),
+                           make_pass_through_transform(MPad)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return a_grid_desc_ak0_m_ak1;
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::MPadding ||
+                          GemmSpec == GemmSpecialization::MNPadding)
+        {
+            // pad M, but not K
+            const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor(
+                a_grid_desc_mraw_kraw,
+                make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)),
+                           make_right_pad_transform(M, MPad - M)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return a_grid_desc_ak0_m_ak1;
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::KPadding ||
+                          GemmSpec == GemmSpecialization::NKPadding)
+        {
+            // pad K, but not M
+            const auto a_grid_desc_m_k = transform_tensor_descriptor(
+                a_grid_desc_mraw_kraw,
+                make_tuple(make_pass_through_transform(M), make_right_pad_transform(K, KPad - K)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor(
+                a_grid_desc_m_k,
+                make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)),
+                           make_pass_through_transform(M)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return a_grid_desc_ak0_m_ak1;
+        }
+        else
+        {
+            // not pad M or K
+            const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor(
+                a_grid_desc_mraw_kraw,
+                make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)),
+                           make_pass_through_transform(M)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return a_grid_desc_ak0_m_ak1;
+        }
+    }
+
+    __device__ static auto MakeBGridDescriptor_BK0_N_BK1(
+        index_t K, index_t KPad, index_t N, index_t NPad, index_t StrideB, index_t BK0)
+    {
+        const auto b_grid_desc_nraw_kraw = [&]() {
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(N, K), make_tuple(I1, StrideB));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(N, K), make_tuple(StrideB, I1));
+            }
+        }();
+
+        using GemmSpecialization = tensor_operation::device::GemmSpecialization;
+
+        if constexpr(GemmSpec == GemmSpecialization::NKPadding ||
+                     GemmSpec == GemmSpecialization::MNKPadding)
+        {
+            // pad both N and K
+            const auto b_grid_desc_n_k =
+                transform_tensor_descriptor(b_grid_desc_nraw_kraw,
+                                            make_tuple(make_right_pad_transform(N, NPad - N),
+                                                       make_right_pad_transform(K, KPad - K)),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor(
+                b_grid_desc_n_k,
+                make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)),
+                           make_pass_through_transform(NPad)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return b_grid_desc_bk0_n_bk1;
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::NPadding ||
+                          GemmSpec == GemmSpecialization::MNPadding)
+        {
+            // pad N, but not K
+            const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor(
+                b_grid_desc_nraw_kraw,
+                make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)),
+                           make_right_pad_transform(N, NPad - N)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return b_grid_desc_bk0_n_bk1;
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::KPadding ||
+                          GemmSpec == GemmSpecialization::MKPadding)
+        {
+            // pad K, but not N
+            const auto b_grid_desc_n_k = transform_tensor_descriptor(
+                b_grid_desc_nraw_kraw,
+                make_tuple(make_pass_through_transform(N), make_right_pad_transform(K, KPad - K)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor(
+                b_grid_desc_n_k,
+                make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)),
+                           make_pass_through_transform(N)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return b_grid_desc_bk0_n_bk1;
+        }
+        else
+        {
+            // not pad N or K
+            const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor(
+                b_grid_desc_nraw_kraw,
+                make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)),
+                           make_pass_through_transform(N)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return b_grid_desc_bk0_n_bk1;
+        }
+    }
+
+    __host__ __device__ static auto
+    MakeCGridDescriptor_M_N(index_t M, index_t MPad, index_t N, index_t NPad, index_t StrideC)
+    {
+        const auto c_grid_desc_mraw_nraw = [&]() {
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, CLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideC));
+            }
+        }();
+
+        using GemmSpecialization = tensor_operation::device::GemmSpecialization;
+
+        if constexpr(GemmSpec == GemmSpecialization::MNPadding ||
+                     GemmSpec == GemmSpecialization::MNKPadding)
+        {
+            // pad M and N
+            return transform_tensor_descriptor(c_grid_desc_mraw_nraw,
+                                               make_tuple(make_right_pad_transform(M, MPad - M),
+                                                          make_right_pad_transform(N, NPad - N)),
+                                               make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                               make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::MPadding ||
+                          GemmSpec == GemmSpecialization::MKPadding)
+        {
+            // pad M, but not N
+            return transform_tensor_descriptor(
+                c_grid_desc_mraw_nraw,
+                make_tuple(make_right_pad_transform(M, MPad - M), make_pass_through_transform(N)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::NPadding ||
+                          GemmSpec == GemmSpecialization::NKPadding)
+        {
+            // pad N, but not M
+            return transform_tensor_descriptor(
+                c_grid_desc_mraw_nraw,
+                make_tuple(make_pass_through_transform(M), make_right_pad_transform(N, NPad - N)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+        else
+        {
+            // not pad M or N
+            return c_grid_desc_mraw_nraw;
+        }
+    }
+
+    struct Problem
+    {
+        __host__ Problem(index_t M_,
+                         index_t N_,
+                         index_t K_,
+                         index_t StrideA_,
+                         index_t StrideB_,
+                         index_t StrideC_)
+            : M{M_},
+              N{N_},
+              K{K_},
+              StrideA{StrideA_},
+              StrideB{StrideB_},
+              StrideC{StrideC_},
+              MPadded{CalculateMPadded(M_)},
+              NPadded{CalculateNPadded(N_)},
+              KPadded{CalculateKPadded(K_)},
+              AK0{CalculateAK0(K_)},
+              BK0{CalculateBK0(K_)},
+              MBlock{CalculateMBlock(M_)},
+              NBlock{CalculateNBlock(N_)}
+        {
+        }
+
+        __host__ void Print() const
+        {
+            std::cout << "problem {"
+                      << "M:" << M << ", "
+                      << "N:" << N << ", "
+                      << "K:" << K << ", "
+                      << "SA:" << StrideA << ", "
+                      << "SB:" << StrideB << ", "
+                      << "SC:" << StrideC << ", "
+                      << "MP:" << MPadded << ", "
+                      << "NP:" << NPadded << ", "
+                      << "KP:" << KPadded << ", "
+                      << "AK0:" << AK0 << ", "
+                      << "BK0:" << BK0 << ", "
+                      << "MBlock: " << MBlock << ", "
+                      << "NBlock: " << NBlock << "}" << std::endl;
+        }
+
+        index_t M;
+        index_t N;
+        index_t K;
+        index_t StrideA;
+        index_t StrideB;
+        index_t StrideC;
+        index_t MPadded;
+        index_t NPadded;
+        index_t KPadded;
+        index_t AK0;
+        index_t BK0;
+        index_t MBlock;
+        index_t NBlock;
+    };
+
+    // Argument
+    struct Argument : public tensor_operation::device::BaseArgument, public Problem
+    {
+        __host__ Argument(const FloatAB* p_a_grid_,
+                          const FloatAB* p_b_grid_,
+                          FloatC* p_c_grid_,
+                          index_t M_,
+                          index_t N_,
+                          index_t K_,
+                          index_t StrideA_,
+                          index_t StrideB_,
+                          index_t StrideC_)
+            : Problem{M_, N_, K_, StrideA_, StrideB_, StrideC_},
+              p_a_grid{p_a_grid_},
+              p_b_grid{p_b_grid_},
+              p_c_grid{p_c_grid_}
+        {
+        }
+
+        const FloatAB* p_a_grid;
+        const FloatAB* p_b_grid;
+        FloatC* p_c_grid;
+    };
+
     // FIXME: pass GridwiseGemmPipe as a template arguement into GridwiseGemm
     using GridwiseGemmPipe = remove_cvref_t<decltype(
         GridwiseGemmPipeline_Selector<PipelineVer, NumGemmKPrefetchStage, LoopSched>())>;
 
-    __host__ __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
+    __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
     {
         // A matrix in LDS memory, dst of blockwise copy
         return make_naive_tensor_descriptor(
-            make_tuple(AK0, Number<MPerBlock>{}, AK1),
-            make_tuple(Number<MPerBlock + ABlockLdsExtraM>{} * AK1, AK1, I1));
+            make_tuple(AK0Number, Number<MPerBlock>{}, AK1Number),
+            make_tuple(Number<MPerBlock + ABlockLdsExtraM>{} * AK1Number, AK1Number, I1));
     }
 
-    __host__ __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1()
+    __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1()
     {
         // B matrix in LDS memory, dst of blockwise copy
         return make_naive_tensor_descriptor(
-            make_tuple(BK0, Number<NPerBlock>{}, BK1),
-            make_tuple(Number<NPerBlock + BBlockLdsExtraN>{} * BK1, BK1, I1));
+            make_tuple(BK0Number, Number<NPerBlock>{}, BK1Number),
+            make_tuple(Number<NPerBlock + BBlockLdsExtraN>{} * BK1Number, BK1Number, I1));
     }
 
-    __host__ __device__ static constexpr auto
-    GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock()
+    __device__ static constexpr auto GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock()
     {
         constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl);
         constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl);
@@ -172,14 +519,14 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
         return c_shuffle_block_desc_mblock_mperblock_nblock_nperblock;
     }
 
-    __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
+    __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
     {
         // LDS allocation for A and B: be careful of alignment
         constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1();
         constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1();
 
         // lds max alignment
-        constexpr auto max_lds_align = math::lcm(AK1, BK1);
+        constexpr auto max_lds_align = math::lcm(AK1Number, BK1Number);
 
         constexpr auto a_block_space_size_aligned = math::integer_least_multiple(
             a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);
@@ -200,36 +547,102 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
     }
 
     // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
-    template <typename Block2CTileMap>
-    __host__ __device__ static constexpr bool
-    CheckValidity(const AGridDesc_AK0_M_AK1& a_grid_desc_ak0_m_ak1,
-                  const BGridDesc_BK0_N_BK1& b_grid_desc_bk0_n_bk1,
-                  const CGridDesc_M_N& c_grid_desc_m_n,
-                  const Block2CTileMap& block_2_ctile_map)
+    __host__ static constexpr bool CheckValidity(const Problem& problem)
     {
         static_assert((MPerBlock % (MPerXdl * MXdlPerWave) == 0) &&
                           (NPerBlock % (NXdlPerWave * NPerXdl)) == 0,
                       "Invalid tuning param!");
 
-        const auto M = a_grid_desc_ak0_m_ak1.GetLength(I1);
-        const auto N = b_grid_desc_bk0_n_bk1.GetLength(I1);
-        const auto K = a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2);
+        if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::MPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding))
+        {
+            if(!(problem.M % MPerBlock == 0))
+            {
+                return false;
+            }
+        }
 
-        if(!(M == c_grid_desc_m_n.GetLength(I0) && N == c_grid_desc_m_n.GetLength(I1)))
-            return false;
+        if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::NPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::NKPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding))
+        {
+            if(!(problem.N % NPerBlock == 0))
+            {
+                return false;
+            }
+        }
 
-        if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K % KPerBlock == 0))
-            return false;
+        if constexpr(GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding ||
+                     GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding ||
+                     GemmSpec == tensor_operation::device::GemmSpecialization::KPadding ||
+                     GemmSpec == tensor_operation::device::GemmSpecialization::NKPadding)
+        {
+            if(!(CalculateKPadded(problem.K) % AK1Value == 0) ||
+               !(CalculateKPadded(problem.K) % BK1Value == 0))
+            {
+                return false;
+            }
+        }
+        else
+        {
+            if(!(problem.K % AK1Value == 0) || !(problem.K % BK1Value == 0))
+            {
+                return false;
+            }
+        }
 
-        // check gridwise gemm pipeline
-        const auto num_k_loop = K / KPerBlock;
+        if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
+        {
+            if(problem.K % ABlockTransferSrcScalarPerVector != 0)
+            {
+                return false;
+            }
+        }
+        else
+        {
+            if(problem.M % ABlockTransferSrcScalarPerVector != 0)
+            {
+                return false;
+            }
+        }
 
-        if(!GridwiseGemmPipe::IsSupported(num_k_loop))
+        if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
         {
-            return false;
+            if(problem.N % BBlockTransferSrcScalarPerVector != 0)
+            {
+                return false;
+            }
+        }
+        else
+        {
+            if(problem.K % BBlockTransferSrcScalarPerVector != 0)
+            {
+                return false;
+            }
         }
 
-        if(!block_2_ctile_map.CheckValidity(c_grid_desc_m_n))
+        if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
+        {
+            if(problem.N % CShuffleBlockTransferScalarPerVector_NPerBlock != 0)
+            {
+                return false;
+            }
+        }
+        else
+        {
+            if(problem.M % CShuffleBlockTransferScalarPerVector_NPerBlock != 0)
+            {
+                return false;
+            }
+        }
+
+        // check gridwise gemm pipeline
+        const auto num_k_loop = (CalculateAK0(problem.K) * AK1Value) / KPerBlock;
+
+        if(!GridwiseGemmPipe::IsSupported(num_k_loop))
         {
             return false;
         }
@@ -238,22 +651,17 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
         return true;
     }
 
-    __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K)
+    __host__ static constexpr bool CalculateHasMainKBlockLoop(index_t K)
     {
         const index_t num_loop = K / KPerBlock;
 
         return GridwiseGemmPipe::CalculateHasMainLoop(num_loop);
     }
 
-    __host__ __device__ static constexpr auto
-    MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const CGridDesc_M_N& c_grid_desc_m_n)
+    template <typename CGridDesc>
+    __device__ static constexpr auto MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+        const CGridDesc& c_grid_desc_m_n, index_t MBlock, index_t NBlock)
     {
-        const auto M = c_grid_desc_m_n.GetLength(I0);
-        const auto N = c_grid_desc_m_n.GetLength(I1);
-
-        const auto MBlock = M / MPerBlock;
-        const auto NBlock = N / NPerBlock;
-
         const auto c_grid_desc_mblock_mperblock_nblock_nperblock = transform_tensor_descriptor(
             c_grid_desc_m_n,
             make_tuple(make_unmerge_transform(make_tuple(MBlock, Number<MPerBlock>{})),
@@ -265,33 +673,26 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
     }
 
     // return block_id to C matrix tile idx (m0, n0) mapping
-    __host__ __device__ static constexpr auto
-    MakeDefaultBlock2CTileMap(const CGridDesc_M_N& c_grid_desc_m_n)
-    {
-        return BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, CGridDesc_M_N>(
-            c_grid_desc_m_n);
-    }
-
-    using CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<decltype(
-        MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(CGridDesc_M_N{}))>;
-
-    using DefaultBlock2CTileMap =
-        remove_cvref_t<decltype(MakeDefaultBlock2CTileMap(CGridDesc_M_N{}))>;
+    using Block2CTileMap = BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock>;
 
-    template <bool HasMainKBlockLoop, typename Block2CTileMap>
+    template <bool HasMainKBlockLoop>
     __device__ static void Run(const FloatAB* __restrict__ p_a_grid,
                                const FloatAB* __restrict__ p_b_grid,
                                FloatC* __restrict__ p_c_grid,
                                void* __restrict__ p_shared,
-                               const AElementwiseOperation& a_element_op,
-                               const BElementwiseOperation& b_element_op,
-                               const CElementwiseOperation& c_element_op,
-                               const AGridDesc_AK0_M_AK1& a_grid_desc_ak0_m_ak1,
-                               const BGridDesc_BK0_N_BK1& b_grid_desc_bk0_n_bk1,
-                               const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock&
-                                   c_grid_desc_mblock_mperblock_nblock_nperblock,
-                               const Block2CTileMap& block_2_ctile_map)
+                               const Problem& problem)
     {
+        const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(
+            problem.M, problem.MPadded, problem.K, problem.KPadded, problem.StrideA, problem.AK0);
+        const auto b_grid_desc_bk0_n_bk1 = MakeBGridDescriptor_BK0_N_BK1(
+            problem.K, problem.KPadded, problem.N, problem.NPadded, problem.StrideB, problem.BK0);
+        const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N(
+            problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideC);
+
+        const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
+            MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                c_grid_desc_m_n, problem.MBlock, problem.NBlock);
+
         const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
         const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
@@ -299,7 +700,13 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
         auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
 
+        const AElementwiseOperation a_element_op{};
+        const BElementwiseOperation b_element_op{};
+        const CElementwiseOperation c_element_op{};
+
         // divide block work by [M, N]
+        const auto block_2_ctile_map = Block2CTileMap{problem.M, problem.N};
+
         const auto block_work_idx =
             block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id()));
 
@@ -319,7 +726,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
             __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock);
 
         // lds max alignment
-        constexpr auto max_lds_align = math::lcm(AK1, BK1);
+        constexpr auto max_lds_align = math::lcm(AK1Number, BK1Number);
 
         // A matrix in LDS memory, dst of blockwise copy
         constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1();
@@ -333,7 +740,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
                                                 AElementwiseOperation,
                                                 ck::tensor_operation::element_wise::PassThrough,
                                                 InMemoryDataOperationEnum::Set,
-                                                Sequence<AK0, MPerBlock, AK1>,
+                                                Sequence<AK0Number, MPerBlock, AK1Number>,
                                                 ABlockTransferThreadClusterLengths_AK0_M_AK1,
                                                 ABlockTransferThreadClusterArrangeOrder,
                                                 FloatAB,
@@ -364,7 +771,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
                                                 BElementwiseOperation,
                                                 ck::tensor_operation::element_wise::PassThrough,
                                                 InMemoryDataOperationEnum::Set,
-                                                Sequence<BK0, NPerBlock, BK1>,
+                                                Sequence<BK0Number, NPerBlock, BK1Number>,
                                                 BBlockTransferThreadClusterLengths_BK0_N_BK1,
                                                 BBlockTransferThreadClusterArrangeOrder,
                                                 FloatAB,
@@ -396,8 +803,9 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
         //     c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
         //       register
         // sanity check
-        constexpr index_t KPack = math::max(
-            math::lcm(AK1, BK1), MfmaSelector<FloatAB, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
+        constexpr index_t KPack =
+            math::max(math::lcm(AK1Number, BK1Number),
+                      MfmaSelector<FloatAB, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
 
         auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<
             BlockSize,
@@ -425,8 +833,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
             static_cast<FloatAB*>(p_shared) + a_block_space_size_aligned,
             b_block_desc_bk0_n_bk1.GetElementSpaceSize());
 
-        constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1, 0, 0);
-        constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1, 0, 0);
+        constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1Number, 0, 0);
+        constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1Number, 0, 0);
 
         // gridwise GEMM pipeline
         static_assert(std::is_default_constructible_v<GridwiseGemmPipe>);
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp
index 2d4ebe7076408b29cffb863630e903a195ceb6de..d805c9fa2b167c84ffe562f4d109f38ae2355acc 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_waveletmodel_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_waveletmodel_cshuffle.hpp
index acece0fbba42bc322cb3102aaf8898a77b41b9d0..31c59d14eddb6983c43ff97b9999f78aa7f980b2 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_waveletmodel_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_waveletmodel_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_skip_b_lds_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_skip_b_lds_v1.hpp
index 8d86f3c1d75190bd2c285044c234d17f41e7f7d2..e9881d645b33e292c46f41542c97929b8a914e8d 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_skip_b_lds_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_skip_b_lds_v1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
index f4504a940220bd8dc16814526844f67154097836..bd18fdb10d0e1767d39b75c5eacbe8dfceb6dc95 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
@@ -7,6 +7,7 @@
 #include "ck/tensor_description/multi_index_transform_helper.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp"
 #include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp"
@@ -21,27 +22,18 @@ template <typename GridwiseGemm,
           typename FloatC,
           typename AGridDesc_K0_M_K1,
           typename BGridDesc_K0_N_K1,
-          typename CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2,
-          typename AElementwiseOperation,
-          typename BElementwiseOperation,
-          typename CElementwiseOperation,
-          typename Block2CTileMap,
+          typename CGridDesc_M_N,
           bool HasMainKBlockLoop>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
     __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
-        kernel_gemm_xdlops_v2r3(
-            const FloatAB* __restrict__ p_a_grid,
-            const FloatAB* __restrict__ p_b_grid,
-            FloatC* __restrict__ p_c_grid,
-            const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1,
-            const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1,
-            const CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2,
-            const AElementwiseOperation a_element_op,
-            const BElementwiseOperation b_element_op,
-            const CElementwiseOperation c_element_op,
-            const Block2CTileMap block_2_ctile_map)
+        kernel_gemm_xdlops_v2r3(const FloatAB* __restrict__ p_a_grid,
+                                const FloatAB* __restrict__ p_b_grid,
+                                FloatC* __restrict__ p_c_grid,
+                                const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1,
+                                const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1,
+                                const CGridDesc_M_N c_grid_desc_m_n)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
     defined(__gfx940__))
@@ -53,22 +45,46 @@ __global__ void
                                                   p_shared,
                                                   a_grid_desc_k0_m_k1,
                                                   b_grid_desc_k0_n_k1,
-                                                  c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2,
-                                                  a_element_op,
-                                                  b_element_op,
-                                                  c_element_op,
-                                                  block_2_ctile_map);
+                                                  c_grid_desc_m_n);
 #else
     ignore                = p_a_grid;
     ignore                = p_b_grid;
     ignore                = p_c_grid;
     ignore                = a_grid_desc_k0_m_k1;
     ignore                = b_grid_desc_k0_n_k1;
-    ignore                = c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2;
-    ignore                = a_element_op;
-    ignore                = b_element_op;
-    ignore                = c_element_op;
-    ignore                = block_2_ctile_map;
+    ignore                = c_grid_desc_m_n;
+#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
+}
+
+template <typename GridwiseGemm, bool HasMainKBlockLoop>
+__global__ void
+#if CK_USE_LAUNCH_BOUNDS
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+#endif
+        kernel_gemm_xdlops_v2r3(const typename GridwiseGemm::Argument karg)
+{
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
+    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+
+    const auto a_grid_desc_k0_m_k1 =
+        amd_wave_read_first_lane(GridwiseGemm::MakeAGridDescriptor_K0_M_K1(
+            karg.M, karg.MPadded, karg.K, karg.K0, karg.StrideA));
+    const auto b_grid_desc_k0_n_k1 =
+        amd_wave_read_first_lane(GridwiseGemm::MakeBGridDescriptor_K0_N_K1(
+            karg.K, karg.N, karg.NPadded, karg.K0, karg.StrideB));
+    const auto c_grid_desc_m_n = amd_wave_read_first_lane(GridwiseGemm::MakeCGridDescriptor_M_N(
+        karg.M, karg.MPadded, karg.N, karg.NPadded, karg.StrideC));
+
+    GridwiseGemm::template Run<HasMainKBlockLoop>(karg.p_a_grid,
+                                                  karg.p_b_grid,
+                                                  karg.p_c_grid,
+                                                  p_shared,
+                                                  a_grid_desc_k0_m_k1,
+                                                  b_grid_desc_k0_n_k1,
+                                                  c_grid_desc_m_n);
+#else
+    ignore                = karg;
 #endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
 }
 
@@ -77,9 +93,6 @@ template <index_t BlockSize,
           typename FloatAcc,
           typename FloatC,
           InMemoryDataOperationEnum CGlobalMemoryDataOperation,
-          typename AGridDesc_K0_M_K1,
-          typename BGridDesc_K0_N_K1,
-          typename CGridDesc_M_N,
           typename AElementwiseOperation,
           typename BElementwiseOperation,
           typename CElementwiseOperation,
@@ -129,6 +142,105 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
 
     using ThisThreadBlock = ThisThreadBlock<BlockSize>;
 
+    __host__ static auto CalculateGridSize(index_t M, index_t N)
+    {
+        return std::make_tuple(Block2CTileMap::CalculateGridSize(M, N), 1, 1);
+    }
+
+    template <typename CGridDesc_M_N>
+    __host__ static auto CalculateGridSize(const CGridDesc_M_N& c_grid_desc_m_n)
+    {
+        return std::make_tuple(Block2CTileMap::CalculateGridSize(c_grid_desc_m_n), 1, 1);
+    }
+
+    template <typename>
+    __host__ static auto CalculateGridSize(index_t M, index_t N)
+    {
+        return std::make_tuple(Block2CTileMap::CalculateGridSize(M, N), 1, 1);
+    }
+
+    __host__ static auto CalculateMPadded(index_t M)
+    {
+        return math::integer_divide_ceil(M, MPerBlock) * MPerBlock;
+    }
+
+    __host__ static auto CalculateNPadded(index_t N)
+    {
+        return math::integer_divide_ceil(N, NPerBlock) * NPerBlock;
+    }
+
+    __host__ static auto CalculateK0(index_t K) { return math::integer_divide_floor(K, K1Value); }
+
+    // Argument
+    struct Problem
+    {
+        __host__ Problem(index_t M_,
+                         index_t N_,
+                         index_t K_,
+                         index_t StrideA_,
+                         index_t StrideB_,
+                         index_t StrideC_)
+            : M{M_},
+              N{N_},
+              K{K_},
+              StrideA{StrideA_},
+              StrideB{StrideB_},
+              StrideC{StrideC_},
+              MPadded{CalculateMPadded(M_)},
+              NPadded{CalculateNPadded(N_)},
+              K0{CalculateK0(K)}
+        {
+        }
+
+        __host__ void Print() const
+        {
+            std::cout << "problem {"
+                      << "M:" << M << ", "
+                      << "N:" << N << ", "
+                      << "K:" << K << ", "
+                      << "SA:" << StrideA << ", "
+                      << "SB:" << StrideB << ", "
+                      << "SC:" << StrideC << ", "
+                      << "MP:" << MPadded << ", "
+                      << "NP:" << NPadded << ", "
+                      << "K0:" << K0 << "}" << std::endl;
+        }
+
+        index_t M;
+        index_t N;
+        index_t K;
+        index_t StrideA;
+        index_t StrideB;
+        index_t StrideC;
+        index_t MPadded;
+        index_t NPadded;
+        index_t K0;
+    };
+
+    // Argument
+    struct Argument : public Problem, public tensor_operation::device::BaseArgument
+    {
+        __host__ Argument(const FloatAB* p_a_grid_,
+                          const FloatAB* p_b_grid_,
+                          FloatC* p_c_grid_,
+                          index_t M_,
+                          index_t N_,
+                          index_t K_,
+                          index_t StrideA_,
+                          index_t StrideB_,
+                          index_t StrideC_)
+            : Problem{M_, N_, K_, StrideA_, StrideB_, StrideC_},
+              p_a_grid{p_a_grid_},
+              p_b_grid{p_b_grid_},
+              p_c_grid{p_c_grid_}
+        {
+        }
+
+        const FloatAB* p_a_grid;
+        const FloatAB* p_b_grid;
+        FloatC* p_c_grid;
+    };
+
     using GridwiseGemmPipe = remove_cvref_t<decltype(
         GridwiseGemmPipeline_Selector<PipelineVer, NumGemmKPrefetchStage, LoopSched>())>;
 
@@ -204,13 +316,11 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
         return (a_block_space_size_aligned + b_block_space_size_aligned) * sizeof(FloatAB);
     }
 
-    // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
-    template <typename Block2CTileMap>
+    template <typename AGridDesc_K0_M_K1, typename BGridDesc_K0_N_K1, typename CGridDesc_M_N>
     __host__ __device__ static constexpr bool
     CheckValidity(const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1,
                   const BGridDesc_K0_N_K1& b_grid_desc_k0_n_k1,
-                  const CGridDesc_M_N& c_grid_desc_m_n,
-                  const Block2CTileMap& block_2_ctile_map)
+                  const CGridDesc_M_N& c_grid_desc_m_n)
     {
         static_assert(is_known_at_compile_time<remove_cv_t<decltype(K1)>>::value,
                       "wrong! K1 need to be known at compile-time");
@@ -239,7 +349,24 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
             return false;
         }
 
-        if(!block_2_ctile_map.CheckValidity(c_grid_desc_m_n))
+        // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
+        return true;
+    }
+
+    __host__ static constexpr bool CheckValidity(const Problem& problem)
+    {
+        static_assert(is_known_at_compile_time<remove_cv_t<decltype(K1)>>::value,
+                      "wrong! K1 need to be known at compile-time");
+
+        static_assert((MPerBlock % (MPerXDL * MXdlPerWave) == 0) &&
+                          (NPerBlock % (NXdlPerWave * NPerXDL)) == 0,
+                      "Invalid tuning param!");
+
+        // check gridwise gemm pipeline
+        const index_t K0      = problem.K / K1Value;
+        const auto num_k_loop = K0 / K0PerBlock;
+
+        if(!GridwiseGemmPipe::IsSupported(num_k_loop))
         {
             return false;
         }
@@ -248,15 +375,16 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
         return true;
     }
 
-    __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K)
+    __host__ static constexpr bool CalculateHasMainKBlockLoop(index_t K)
     {
         const index_t num_loop = K / (K0PerBlock * K1);
 
         return GridwiseGemmPipe::CalculateHasMainLoop(num_loop);
     }
 
+    template <typename CGridDesc>
     __host__ __device__ static constexpr auto
-    MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(const CGridDesc_M_N& c_grid_desc_m_n)
+    MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(const CGridDesc& c_grid_desc_m_n)
     {
         constexpr auto max_lds_align = K1;
 
@@ -306,31 +434,23 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
     }
 
     // return block_id to C matrix tile idx (m0, n0) mapping
-    __host__ __device__ static constexpr auto MakeDefaultBlock2CTileMap(
-        const CGridDesc_M_N& c_grid_desc_m_n, index_t /* M01 */, index_t /* N01 */)
+    using Block2CTileMap = BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock>;
+
+    template <bool HasMainKBlockLoop,
+              typename AGridDesc_K0_M_K1,
+              typename BGridDesc_K0_N_K1,
+              typename CGridDesc_M_N>
+    __device__ static void Run(const FloatAB* p_a_grid,
+                               const FloatAB* p_b_grid,
+                               FloatC* p_c_grid,
+                               void* __restrict__ p_shared,
+                               const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1,
+                               const BGridDesc_K0_N_K1& b_grid_desc_k0_n_k1,
+                               const CGridDesc_M_N& c_grid_desc_m_n)
     {
-        return BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, CGridDesc_M_N>(
-            c_grid_desc_m_n);
-    }
+        const auto c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2 =
+            MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_grid_desc_m_n);
 
-    using CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 =
-        decltype(MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(CGridDesc_M_N{}));
-    using DefaultBlock2CTileMap = decltype(MakeDefaultBlock2CTileMap(CGridDesc_M_N{}, 1, 1));
-
-    template <bool HasMainKBlockLoop, typename Block2CTileMap = DefaultBlock2CTileMap>
-    __device__ static void
-    Run(const FloatAB* __restrict__ p_a_grid,
-        const FloatAB* __restrict__ p_b_grid,
-        FloatC* __restrict__ p_c_grid,
-        void* __restrict__ p_shared,
-        const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1,
-        const BGridDesc_K0_N_K1& b_grid_desc_k0_n_k1,
-        const CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2& c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2,
-        const AElementwiseOperation& a_element_op,
-        const BElementwiseOperation& b_element_op,
-        const CElementwiseOperation& c_element_op,
-        const Block2CTileMap& block_2_ctile_map)
-    {
         const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_a_grid, a_grid_desc_k0_m_k1.GetElementSpaceSize());
         const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
@@ -338,7 +458,12 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
         auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_c_grid, c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetElementSpaceSize());
 
-        const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0);
+        const AElementwiseOperation a_element_op{};
+        const BElementwiseOperation b_element_op{};
+        const CElementwiseOperation c_element_op{};
+
+        const auto block_2_ctile_map =
+            Block2CTileMap{c_grid_desc_m_n.GetLength(I0), c_grid_desc_m_n.GetLength(I1)};
 
         // divide block work by [M, N]
         const auto block_work_idx =
@@ -467,6 +592,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
         constexpr auto b_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0);
 
         // gridwise GEMM pipeline
+        const auto K0                       = a_grid_desc_k0_m_k1.GetLength(I0);
         const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(K0 / K0PerBlock);
 
         GridwiseGemmPipe::template Run<HasMainKBlockLoop>(a_grid_desc_k0_m_k1,
@@ -565,4 +691,309 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
     }
 };
 
+template <index_t BlockSize,
+          typename FloatAB,
+          typename FloatAcc,
+          typename FloatC,
+          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation,
+          tensor_operation::device::GemmSpecialization GemmSpec,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t K0PerBlock,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t K1Value,
+          index_t MXdlPerWave,
+          index_t NXdlPerWave,
+          typename ABlockTransferThreadClusterLengths_K0_M_K1,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          index_t ABlockTransferSrcVectorDim,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t ABlockTransferDstScalarPerVector_K1,
+          bool AThreadTransferSrcResetCoordinateAfterRun,
+          bool ABlockLdsExtraM,
+          typename BBlockTransferThreadClusterLengths_K0_N_K1,
+          typename BBlockTransferThreadClusterArrangeOrder,
+          typename BBlockTransferSrcAccessOrder,
+          index_t BBlockTransferSrcVectorDim,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t BBlockTransferDstScalarPerVector_K1,
+          bool BThreadTransferSrcResetCoordinateAfterRun,
+          bool BBlockLdsExtraN,
+          typename CThreadTransferSrcDstAccessOrder,
+          index_t CThreadTransferSrcDstVectorDim,
+          index_t CThreadTransferDstScalarPerVector,
+          index_t NumGemmKPrefetchStage = 1,
+          LoopScheduler LoopSched       = make_default_loop_scheduler(),
+          PipelineVersion PipelineVer   = PipelineVersion::v1>
+struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3_ext
+    : GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<BlockSize,
+                                              FloatAB,
+                                              FloatAcc,
+                                              FloatC,
+                                              CGlobalMemoryDataOperation,
+                                              AElementwiseOperation,
+                                              BElementwiseOperation,
+                                              CElementwiseOperation,
+                                              MPerBlock,
+                                              NPerBlock,
+                                              K0PerBlock,
+                                              MPerXDL,
+                                              NPerXDL,
+                                              K1Value,
+                                              MXdlPerWave,
+                                              NXdlPerWave,
+                                              ABlockTransferThreadClusterLengths_K0_M_K1,
+                                              ABlockTransferThreadClusterArrangeOrder,
+                                              ABlockTransferSrcAccessOrder,
+                                              ABlockTransferSrcVectorDim,
+                                              ABlockTransferSrcScalarPerVector,
+                                              ABlockTransferDstScalarPerVector_K1,
+                                              AThreadTransferSrcResetCoordinateAfterRun,
+                                              ABlockLdsExtraM,
+                                              BBlockTransferThreadClusterLengths_K0_N_K1,
+                                              BBlockTransferThreadClusterArrangeOrder,
+                                              BBlockTransferSrcAccessOrder,
+                                              BBlockTransferSrcVectorDim,
+                                              BBlockTransferSrcScalarPerVector,
+                                              BBlockTransferDstScalarPerVector_K1,
+                                              BThreadTransferSrcResetCoordinateAfterRun,
+                                              BBlockLdsExtraN,
+                                              CThreadTransferSrcDstAccessOrder,
+                                              CThreadTransferSrcDstVectorDim,
+                                              CThreadTransferDstScalarPerVector,
+                                              NumGemmKPrefetchStage,
+                                              LoopSched,
+                                              PipelineVer>
+{
+    using Parent =
+        GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<BlockSize,
+                                                FloatAB,
+                                                FloatAcc,
+                                                FloatC,
+                                                CGlobalMemoryDataOperation,
+                                                AElementwiseOperation,
+                                                BElementwiseOperation,
+                                                CElementwiseOperation,
+                                                MPerBlock,
+                                                NPerBlock,
+                                                K0PerBlock,
+                                                MPerXDL,
+                                                NPerXDL,
+                                                K1Value,
+                                                MXdlPerWave,
+                                                NXdlPerWave,
+                                                ABlockTransferThreadClusterLengths_K0_M_K1,
+                                                ABlockTransferThreadClusterArrangeOrder,
+                                                ABlockTransferSrcAccessOrder,
+                                                ABlockTransferSrcVectorDim,
+                                                ABlockTransferSrcScalarPerVector,
+                                                ABlockTransferDstScalarPerVector_K1,
+                                                AThreadTransferSrcResetCoordinateAfterRun,
+                                                ABlockLdsExtraM,
+                                                BBlockTransferThreadClusterLengths_K0_N_K1,
+                                                BBlockTransferThreadClusterArrangeOrder,
+                                                BBlockTransferSrcAccessOrder,
+                                                BBlockTransferSrcVectorDim,
+                                                BBlockTransferSrcScalarPerVector,
+                                                BBlockTransferDstScalarPerVector_K1,
+                                                BThreadTransferSrcResetCoordinateAfterRun,
+                                                BBlockLdsExtraN,
+                                                CThreadTransferSrcDstAccessOrder,
+                                                CThreadTransferSrcDstVectorDim,
+                                                CThreadTransferDstScalarPerVector,
+                                                NumGemmKPrefetchStage,
+                                                LoopSched,
+                                                PipelineVer>;
+
+    using typename Parent::GridwiseGemmPipe;
+    using typename Parent::Problem;
+
+    using Parent::I1;
+
+    using Parent::K1;
+
+    __device__ static auto
+    MakeAGridDescriptor_K0_M_K1(index_t M, index_t MPad, index_t K, index_t K0, index_t StrideA)
+    {
+        const auto a_grid_desc_m_k = [&]() {
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, ALayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA));
+            }
+        }();
+
+        if constexpr(GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding)
+        {
+            return transform_tensor_descriptor(
+                a_grid_desc_m_k,
+                make_tuple(make_unmerge_transform(make_tuple(K0, K1Value)),
+                           make_right_pad_transform(M, MPad - M)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+        }
+        else
+        {
+            return transform_tensor_descriptor(
+                a_grid_desc_m_k,
+                make_tuple(make_unmerge_transform(make_tuple(K0, K1Value)),
+                           make_pass_through_transform(M)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+        }
+    }
+
+    __device__ static auto
+    MakeBGridDescriptor_K0_N_K1(index_t K, index_t N, index_t NPad, index_t K0, index_t StrideB)
+    {
+        const auto b_grid_desc_k_n = [&]() {
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(StrideB, I1));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(I1, StrideB));
+            }
+        }();
+
+        if constexpr(GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding)
+        {
+            return transform_tensor_descriptor(
+                b_grid_desc_k_n,
+                make_tuple(make_unmerge_transform(make_tuple(K0, K1Value)),
+                           make_right_pad_transform(N, NPad - N)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+        }
+        else
+        {
+            return transform_tensor_descriptor(
+                b_grid_desc_k_n,
+                make_tuple(make_unmerge_transform(make_tuple(K0, K1Value)),
+                           make_pass_through_transform(N)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+        }
+    }
+
+    __device__ static auto
+    MakeCGridDescriptor_M_N(index_t M, index_t MPad, index_t N, index_t NPad, index_t StrideC)
+    {
+        const auto c_grid_desc_m_n = [&]() {
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, CLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideC));
+            }
+        }();
+
+        if constexpr(GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding)
+        {
+            return transform_tensor_descriptor(c_grid_desc_m_n,
+                                               make_tuple(make_right_pad_transform(M, MPad - M),
+                                                          make_right_pad_transform(N, NPad - N)),
+                                               make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                               make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+        else
+        {
+
+            return transform_tensor_descriptor(
+                c_grid_desc_m_n,
+                make_tuple(make_pass_through_transform(M), make_pass_through_transform(N)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+    }
+
+    __host__ static constexpr bool CheckValidity(const Problem& problem)
+    {
+        static_assert(is_known_at_compile_time<remove_cv_t<decltype(K1)>>::value,
+                      "wrong! K1 need to be known at compile-time");
+
+        static_assert((MPerBlock % (MPerXDL * MXdlPerWave) == 0) &&
+                          (NPerBlock % (NXdlPerWave * NPerXDL)) == 0,
+                      "Invalid tuning param!");
+
+        if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::MPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding))
+        {
+            if(!(problem.M % MPerBlock == 0))
+            {
+                return false;
+            }
+        }
+
+        if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::NPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::NKPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding))
+        {
+            if(!(problem.N % NPerBlock == 0))
+            {
+                return false;
+            }
+        }
+
+        if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
+        {
+            if(problem.K % ABlockTransferSrcScalarPerVector != 0)
+            {
+                return false;
+            }
+        }
+        else
+        {
+            if(problem.M % ABlockTransferSrcScalarPerVector != 0)
+            {
+                return false;
+            }
+        }
+
+        if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
+        {
+            if(problem.N % BBlockTransferSrcScalarPerVector != 0)
+            {
+                return false;
+            }
+        }
+        else
+        {
+            if(problem.K % BBlockTransferSrcScalarPerVector != 0)
+            {
+                return false;
+            }
+        }
+
+        // check gridwise gemm pipeline
+        const index_t K0      = problem.K / K1;
+        const auto num_k_loop = K0 / K0PerBlock;
+
+        if(!GridwiseGemmPipe::IsSupported(num_k_loop))
+        {
+            return false;
+        }
+
+        // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
+        return true;
+    }
+};
+
 } // namespace ck
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp
index 55f465a037f438287d176cb131b3ae96c0d5138d..33a4f2b2c52de5f8f44ea4017058b6f89d58e917 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
index b393c4897e2eef60a274c9932b6e68bde34d55cd..a4b320ddfa82d5b163c615183678b5b7cb46a662 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -8,14 +8,14 @@
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp"
 #include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp"
 #include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp"
 #include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp"
 #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-
-#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp"
+#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
 
 namespace ck {
 
@@ -55,6 +55,7 @@ template <index_t BlockSize,
           typename BElementwiseOperation,
           typename CElementwiseOperation,
           tensor_operation::device::GemmSpecialization GemmSpec,
+          index_t NumGemmKPrefetchStage,
           index_t MPerBlock,
           index_t NPerBlock,
           index_t K0PerBlock,
@@ -82,7 +83,9 @@ template <index_t BlockSize,
           index_t CShuffleMRepeatPerShuffle,
           index_t CShuffleNRepeatPerShuffle,
           index_t CBlockTransferScalarPerVector_NWaveNPerXDL,
-          typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock>
+          typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          LoopScheduler LoopSched     = make_default_loop_scheduler(),
+          PipelineVersion PipelineVer = PipelineVersion::v1>
 struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
 {
     static constexpr auto I0 = Number<0>{};
@@ -99,8 +102,15 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
     static constexpr auto M01 = 1;
     static constexpr auto N01 = 1;
 
+    static constexpr auto gemm_padder =
+        tensor_operation::device::GemmPadder<GemmSpec, index_t, index_t, index_t>{
+            MPerBlock, NPerBlock, K1* K0PerBlock};
+
     using ThisThreadBlock = ThisThreadBlock<BlockSize>;
 
+    using GridwiseGemmPipe = remove_cvref_t<decltype(
+        GridwiseGemmPipeline_Selector<PipelineVer, NumGemmKPrefetchStage, LoopSched>())>;
+
     struct Argument : public ck::tensor_operation::device::BaseArgument
     {
         const FloatAB* p_a_grid;
@@ -176,12 +186,12 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
     // prefer this to be called on host
     __host__ __device__ static auto CalculateMPadded(index_t M)
     {
-        return (M + MPerBlock - 1) / MPerBlock * MPerBlock;
+        return math::integer_least_multiple(M, MPerBlock);
     }
 
     __host__ __device__ static auto CalculateNPadded(index_t N)
     {
-        return (N + NPerBlock - 1) / NPerBlock * NPerBlock;
+        return math::integer_least_multiple(N, NPerBlock);
     }
 
     __host__ __device__ static auto CalculateK0(index_t K, index_t K_Batch = 1)
@@ -295,8 +305,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
         }
     }
 
-    __host__ __device__ static auto
-    MakeCGridDescriptor_M_N(index_t M, index_t N, index_t MPad, index_t NPad, index_t StrideC)
+    __host__ __device__ static auto MakeCGridDescriptor_M_N(index_t M, index_t N, index_t StrideC)
     {
         const auto c_grid_desc_m_n = [&]() {
             if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
@@ -309,22 +318,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
             }
         }();
 
-        if constexpr(GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding)
-        {
-            return transform_tensor_descriptor(c_grid_desc_m_n,
-                                               make_tuple(make_right_pad_transform(M, MPad - M),
-                                                          make_right_pad_transform(N, NPad - N)),
-                                               make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                               make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else
-        {
-            return transform_tensor_descriptor(
-                c_grid_desc_m_n,
-                make_tuple(make_pass_through_transform(M), make_pass_through_transform(N)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
+        return gemm_padder.PadCDescriptor_M_N(c_grid_desc_m_n);
     }
 
     __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
@@ -383,7 +377,15 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
                        GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding))
         {
             if(!(karg.M % MPerBlock == 0))
+            {
+#if DEBUG_LOG
+                std::cout << "Arg M value is not a multiple of MPerBlock! M: " << karg.M << " "
+                          << __FILE__ << ":" << __LINE__ << ", in function: " << __func__
+                          << std::endl;
+
+#endif // DEBUG_LOG
                 return false;
+            }
         }
         if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::NPadding ||
                        GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
@@ -391,40 +393,116 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
                        GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding))
         {
             if(!(karg.N % NPerBlock == 0))
+            {
+#if DEBUG_LOG
+                std::cout << "Arg N value is not a multiple of NPerBlock! N: " << karg.N << " "
+                          << __FILE__ << ":" << __LINE__ << ", in function: " << __func__
+                          << std::endl;
+
+#endif // DEBUG_LOG
                 return false;
+            }
         }
 
         if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
         {
             if(karg.K % ABlockTransferSrcScalarPerVector != 0)
+            {
+#if DEBUG_LOG
+                std::cout << "Arg K (" << karg.K
+                          << ") value is not a multiple of ABlockTransferSrcScalarPerVector ("
+                          << ABlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
+                          << __LINE__ << ", in function: " << __func__ << std::endl;
+
+#endif // DEBUG_LOG
                 return false;
+            }
         }
         else
         {
             if(karg.M % ABlockTransferSrcScalarPerVector != 0)
+            {
+#if DEBUG_LOG
+                std::cout << "Arg M (" << karg.M
+                          << ") value is not a multiple of ABlockTransferSrcScalarPerVector ("
+                          << ABlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
+                          << __LINE__ << ", in function: " << __func__ << std::endl;
+
+#endif // DEBUG_LOG
                 return false;
+            }
         }
 
         if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
         {
             if(karg.N % BBlockTransferSrcScalarPerVector != 0)
+            {
+#if DEBUG_LOG
+                std::cout << "Arg N (" << karg.N
+                          << ") value is not a multiple of BBlockTransferSrcScalarPerVector ("
+                          << BBlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
+                          << __LINE__ << ", in function: " << __func__ << std::endl;
+
+#endif // DEBUG_LOG
                 return false;
+            }
         }
         else
         {
             if(karg.K % BBlockTransferSrcScalarPerVector != 0)
+            {
+#if DEBUG_LOG
+                std::cout << "Arg K (" << karg.K
+                          << ") value is not a multiple of BBlockTransferSrcScalarPerVector ("
+                          << BBlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
+                          << __LINE__ << ", in function: " << __func__ << std::endl;
+
+#endif // DEBUG_LOG
                 return false;
+            }
         }
 
         if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
         {
             if(karg.N % CBlockTransferScalarPerVector_NWaveNPerXDL != 0)
+            {
+#if DEBUG_LOG
+                std::cout
+                    << "Arg N (" << karg.N
+                    << ") value is not a multiple of CBlockTransferScalarPerVector_NWaveNPerXDL ("
+                    << CBlockTransferScalarPerVector_NWaveNPerXDL << " )! " << __FILE__ << ":"
+                    << __LINE__ << ", in function: " << __func__ << std::endl;
+
+#endif // DEBUG_LOG
                 return false;
+            }
         }
         else
         {
             if(karg.M % CBlockTransferScalarPerVector_NWaveNPerXDL != 0)
+            {
+#if DEBUG_LOG
+                std::cout
+                    << "Arg M (" << karg.M
+                    << ") value is not a multiple of CBlockTransferScalarPerVector_NWaveNPerXDL ("
+                    << CBlockTransferScalarPerVector_NWaveNPerXDL << " )! " << __FILE__ << ":"
+                    << __LINE__ << ", in function: " << __func__ << std::endl;
+
+#endif // DEBUG_LOG
                 return false;
+            }
+        }
+
+        const auto num_k_loop = karg.K0 / K0PerBlock;
+        if(!GridwiseGemmPipe::IsSupported(num_k_loop))
+        {
+#if DEBUG_LOG
+            std::cout << "The number of k loops (" << num_k_loop
+                      << ") value is not supported by GridwiseGemm Pipeline."
+                      << " K0: " << karg.K0 << ", K0PerBlock: " << K0PerBlock << " " << __FILE__
+                      << ":" << __LINE__ << ", in function: " << __func__ << std::endl;
+#endif // DEBUG_LOG
+            return false;
         }
 
         return true;
@@ -439,9 +517,8 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
 
     __host__ __device__ static constexpr bool CalculateHasMainK0BlockLoop(index_t K0)
     {
-        const bool has_main_k0_block_loop = K0 > K0PerBlock;
-
-        return has_main_k0_block_loop;
+        const index_t num_loop = K0 / K0PerBlock;
+        return GridwiseGemmPipe::CalculateHasMainLoop(num_loop);
     }
 
     template <typename CGridDesc>
@@ -490,7 +567,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
         return BlockToCTileMap_3DGrid_KSplit<MPerBlock, NPerBlock>();
     }
 
-    using CGridDesc_M_N         = remove_cvref_t<decltype(MakeCGridDescriptor_M_N(1, 1, 1, 1, 1))>;
+    using CGridDesc_M_N         = remove_cvref_t<decltype(MakeCGridDescriptor_M_N(1, 1, 1))>;
     using DefaultBlock2CTileMap = remove_cvref_t<decltype(MakeDefaultBlock2CTileMap())>;
 
     template <bool HasMainKBlockLoop,
@@ -507,8 +584,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
             karg.M, karg.MPadded, karg.K, karg.StrideA, karg.k_batch, karg.K0, karg.KPadded);
         const auto b_b_k0_n_k1_grid_desc = MakeBGridDescriptor_KBatch_K0_N_K1(
             karg.K, karg.NPadded, karg.N, karg.StrideB, karg.k_batch, karg.K0, karg.KPadded);
-        const auto c_grid_desc_m_n =
-            MakeCGridDescriptor_M_N(karg.M, karg.N, karg.MPadded, karg.NPadded, karg.StrideC);
+        const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N(karg.M, karg.N, karg.StrideC);
 
         const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
             MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(c_grid_desc_m_n);
@@ -680,20 +756,8 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
         //     c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
         //       register
         // sanity check
-#if 1
-        auto blockwise_gemm =
-            BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
-                                                                FloatAB,
-                                                                FloatAcc,
-                                                                decltype(a_k0_m_k1_block_desc),
-                                                                decltype(b_k0_n_k1_block_desc),
-                                                                MPerXDL,
-                                                                NPerXDL,
-                                                                MRepeat,
-                                                                NRepeat,
-                                                                K1>{};
-#else
-        auto blockwise_gemm = BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<
+
+        auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<
             BlockSize,
             FloatAB,
             FloatAcc,
@@ -703,9 +767,8 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
             NPerXDL,
             MRepeat,
             NRepeat,
-            K1>{};
-
-#endif
+            K1,
+            LoopSched>();
 
         auto c_thread_buf = blockwise_gemm.GetCThreadBuffer();
 
@@ -761,7 +824,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
                 b_blockwise_copy.RunWrite(b_b_k0_n_k1_block_desc, b_block_buf);
 
                 k0_block_data_begin += K0PerBlock;
-            } while(k0_block_data_begin < (K0 - K0PerBlock));
+            } while(k0_block_data_begin < (karg.K0 - K0PerBlock));
         }
 
         // tail
@@ -772,13 +835,12 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
         }
 #else
         // gridwise GEMM pipeline
-        const auto gridwise_gemm_pipeline =
-            GridwiseGemmPipeline_Selector<PipelineVersion::v2, 1, LoopScheduler::Default>();
-
         const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(
             (a_b_k0_m_k1_grid_desc.GetLength(I1) * a_b_k0_m_k1_grid_desc.GetLength(I3)) /
             (K0PerBlock * K1));
 
+        const auto gridwise_gemm_pipeline = GridwiseGemmPipe{};
+
         gridwise_gemm_pipeline.template Run<HasMainKBlockLoop>(a_b_k0_m_k1_grid_desc,
                                                                a_b_k0_m_k1_block_desc,
                                                                a_blockwise_copy,
@@ -993,24 +1055,6 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
         }
     }
 
-    template <typename Layout>
-    struct LStr
-    {
-        static std::string Get() { return ""; }
-    };
-
-    template <>
-    struct LStr<ck::tensor_layout::gemm::RowMajor>
-    {
-        static std::string Get() { return "R"; }
-    };
-
-    template <>
-    struct LStr<ck::tensor_layout::gemm::ColumnMajor>
-    {
-        static std::string Get() { return "C"; }
-    };
-
     static std::string GetTypeString()
     {
         auto str = std::stringstream();
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp
index 8259927fecefa8bacd416289530c5b359b9aa5a4..df543c063fa24fec4c4c8e5448840e28e7cc9615 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp
index 5d5fdae170b17970b6105406b8e19889dd766549..ec98fc9c934321d0fccc8998b0bc19d560bdbffc 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp
index dc83f8e98493d209104fd4cbdd7f1254b2de72a2..3a752dd749693b685f6a3c391d319ed98fa58720 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_permute.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_permute.hpp
index de1ae915920503a086d2cf939ae6e41d27a28838..61d0f9e0d5573c774bbb1dcba707e8c2e775f075 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_permute.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_permute.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp
index 901e7aee98a7b6ab1c37b7dd4dc36f796632fef6..41352fabeb361b017949738bda31649a3ce0dd40 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_set_multiple_buffer_value.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_set_multiple_buffer_value.hpp
index 88c7b6acfeb9c43ec0e8b5a5ab557f834430fb40..0ad36b418a42bff56eb87ba35bb5150808e9cac4 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_set_multiple_buffer_value.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_set_multiple_buffer_value.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_softmax.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_softmax.hpp
index 0344e68305b1a951cfc41a53666090bd1e0113b6..5f56ac6fc4664be1685a950dc6e242b3e256e178 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_softmax.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_softmax.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_sparse_embeddings_forward_layernorm.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_sparse_embeddings_forward_layernorm.hpp
index ff2511fa6e61eda6c4cca976d0372cf98840ee5c..ee68660a06a2933e45ead9c05fadc674ec9d5ae7 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_sparse_embeddings_forward_layernorm.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_sparse_embeddings_forward_layernorm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_naive_variance.hpp b/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_naive_variance.hpp
index 792ffabcb90c71b4620915572c29091b67c8c4d6..c3f122106df29a471e376c453d055806c9aa2514 100644
--- a/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_naive_variance.hpp
+++ b/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_naive_variance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_selector.hpp b/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_selector.hpp
index 632690e1eff9af4cb0ab655250dca381c625776a..e50fb9813325b08576f49d321e7a5de5d51bbff2 100644
--- a/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_selector.hpp
+++ b/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_selector.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_1st.hpp b/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_1st.hpp
index 129b4e1165d34a25087722d4d0c2c0bd8b160c30..fc42e9762997d0c554130beb3d73ac0eebf16476 100644
--- a/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_1st.hpp
+++ b/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_1st.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_2nd.hpp b/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_2nd.hpp
index d796d1afc0c7960f80ffda59ef50e01f1e5bf98b..136ac94e7f0fabb81cfd5baea7e58174799f4d43 100644
--- a/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_2nd.hpp
+++ b/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_2nd.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_welford_variance.hpp b/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_welford_variance.hpp
index 3a7ae459e5f6e8d58edc49ae53c4309030e911f9..ff9712276c26d14c9a68f9df618ed4666f2dc5e9 100644
--- a/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_welford_variance.hpp
+++ b/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_welford_variance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp b/include/ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp
index 188c62d93b0d60317ac139115b79f496612d2ea6..c6eecc067d9f77fa488e7a922fe1fb711c0c0d0f 100644
--- a/include/ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp
+++ b/include/ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_contraction_dl.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_contraction_dl.hpp
index 94cdfe01087bc4005d45f11992a6a8f94d42691c..44730d551c5c8c9fb2408502ece0d45c6f37a221 100644
--- a/include/ck/tensor_operation/gpu/thread/threadwise_contraction_dl.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_contraction_dl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_gemm_dlops_v3.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_gemm_dlops_v3.hpp
index e045e3b545a9897f5df9df1948123e86424d7c7b..e97aa433a6f3b9d54fe76ed26b5b0f6bd439ae84 100644
--- a/include/ck/tensor_operation/gpu/thread/threadwise_gemm_dlops_v3.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_gemm_dlops_v3.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_THREADWISE_GEMM_DLOPS_V3_HPP
 #define CK_THREADWISE_GEMM_DLOPS_V3_HPP
diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_set.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_set.hpp
index 0a1197a1630c12c53afa9513f10fcfa907ba82d3..6774a35bcb872466d272da0bccd618709d3fb29b 100644
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_set.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_set.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
index 16484ddcc5a89792c1331a48516e8317b36f2633..605f2569c6622bb589d46da71020ea46ddba69f7 100644
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v4r1.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v4r1.hpp
index 6e8a23930bbb91677ee18bab216af6c45de72e4c..6a6c1f2ac5bfed7f02c5f5895d16002226cff56c 100644
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v4r1.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v4r1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp
index f13da341f9b2d36d992b1cd1835b43c89dcabd1f..bd01108b03ca82d4877e19806521efda8da8c6c3 100644
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1.hpp
index 9c91cd9ca8f86df37fee6dfdf597be1cefe2d683..6ec9abc41708735011acda7fcd01e54cbec5c4a6 100644
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r2.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r2.hpp
index 68bc2726f4be6fa1b08fe9213e0db9440526f975..cf2c7a2aee3d3f612fd12c0868477bfdfe49a2ea 100644
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r2.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r3.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r3.hpp
index 0f5fb88b04540dbf355fb0f02998303b7680c193..b5847e51b42ea5b567133f0307b319fa3d8fadbb 100644
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r3.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r3.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7.hpp
index 2eb1b0ee90a446ee3b14354aff018275cf7809f4..db7dee21992d0e4edaa2e7a443b5c0ed68589107 100644
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_welford.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_welford.hpp
index 12ba2c5381311ab45ffad4be793651d176f5fc59..eb6715e8ebb382e438791f60193bfc23f06d1773 100644
--- a/include/ck/tensor_operation/gpu/thread/threadwise_welford.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_welford.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp
index 0672bf8e5b2e15421f90bae642800a1b7ffcc0d3..979f3567e987ed66fd45af5deeae7bb2ac3fc761 100644
--- a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -262,12 +262,12 @@ struct wmma_type<WmmaInstr::wmma_i32_16x16x16_iu8,
 
     template <index_t MPerWmma,
               index_t NPerWmma,
-              bool neg_a,
-              bool neg_b,
-              bool clamp,
               class FloatA,
               class FloatB,
-              class FloatC>
+              class FloatC,
+              bool neg_a = false,
+              bool neg_b = false,
+              bool clamp = false>
     __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
     {
         if constexpr(wave_size == 32)
diff --git a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
index 319487bc053ccb7ae4598ca467836fd07196d7c8..faaa2c5a9537537a7d2367c87ef9ff66f8028318 100644
--- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp b/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp
index 5fc11d9158ab9a1261c1285c2cd9f5208d7afbf3..ea27a40ce3c7a48564af112a97af342c157f746d 100644
--- a/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp
+++ b/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp b/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp
index 13d0a28cfe5b2db14a83a4b81e0bb379aabbe137..505ed33d50f597e5bd2094ff1b5f9060fa2f3aa9 100644
--- a/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp
+++ b/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp b/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp
index 1b5e64b66cf292fbcf4979d95aa4a5b1fa3e3a8b..cee3d2825b13facc912cdc8de21774ce793e9a2c 100644
--- a/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp
+++ b/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp
@@ -1,6 +1,6 @@
 
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/amd_address_space.hpp b/include/ck/utility/amd_address_space.hpp
index 9f1525914cdee16ee6cbed51297f0008d46f6085..d54f70e750e8032f706ae3f7f0dfec9f7cab4c8d 100644
--- a/include/ck/utility/amd_address_space.hpp
+++ b/include/ck/utility/amd_address_space.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/amd_buffer_addressing.hpp b/include/ck/utility/amd_buffer_addressing.hpp
index bdfb4f27580e011f0bebabfeae9146374a5ecd67..38ee76d8836d15a85be36e601f574c4f3ec8e539 100644
--- a/include/ck/utility/amd_buffer_addressing.hpp
+++ b/include/ck/utility/amd_buffer_addressing.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 #include "data_type.hpp"
@@ -286,7 +286,22 @@ llvm_amdgcn_raw_buffer_atomic_max_fp64(double vdata,
                                        int soffset,    // dst_wave_addr_offset
                                        int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64");
 
-template <typename T, index_t N>
+// memory coherency bit for buffer store/load instruction
+// check ISA manual for each GFX target
+// e.g. for
+// https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf,
+// page 67~68
+enum struct AmdBufferCoherenceEnum
+{
+    DefaultCoherence = 0, // default value
+    GLC              = 1,
+    SLC              = 2,
+    GLC_SLC          = 3,
+};
+
+template <typename T,
+          index_t N,
+          AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
 __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_wave_buffer_resource,
                                                                  index_t src_thread_addr_offset,
                                                                  index_t src_wave_addr_offset)
@@ -305,28 +320,37 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w
         // use fp32 load to mimic fp64 load
         if constexpr(N == 1)
         {
-            const float2_t tmp = llvm_amdgcn_raw_buffer_load_fp32x2(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            const float2_t tmp =
+                llvm_amdgcn_raw_buffer_load_fp32x2(src_wave_buffer_resource,
+                                                   src_thread_addr_offset,
+                                                   src_wave_addr_offset,
+                                                   static_cast<index_t>(coherence));
 
             return bit_cast<double>(tmp);
         }
         else if constexpr(N == 2)
         {
-            const float4_t tmp = llvm_amdgcn_raw_buffer_load_fp32x4(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            const float4_t tmp =
+                llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
+                                                   src_thread_addr_offset,
+                                                   src_wave_addr_offset,
+                                                   static_cast<index_t>(coherence));
 
             return bit_cast<double2_t>(tmp);
         }
         else if constexpr(N == 4)
         {
-            const float4_t f32_0 = llvm_amdgcn_raw_buffer_load_fp32x4(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            const float4_t f32_0 =
+                llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
+                                                   src_thread_addr_offset,
+                                                   src_wave_addr_offset,
+                                                   static_cast<index_t>(coherence));
 
             const float4_t f32_1 =
                 llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
                                                    src_thread_addr_offset,
                                                    src_wave_addr_offset + 4 * sizeof(float),
-                                                   0);
+                                                   static_cast<index_t>(coherence));
             vector_type<double, 4> tmp;
 
             tmp.AsType<double2_t>()(Number<0>{}) = bit_cast<double2_t>(f32_0);
@@ -339,31 +363,40 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w
     {
         if constexpr(N == 1)
         {
-            return llvm_amdgcn_raw_buffer_load_fp32(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            return llvm_amdgcn_raw_buffer_load_fp32(src_wave_buffer_resource,
+                                                    src_thread_addr_offset,
+                                                    src_wave_addr_offset,
+                                                    static_cast<index_t>(coherence));
         }
         else if constexpr(N == 2)
         {
-            return llvm_amdgcn_raw_buffer_load_fp32x2(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            return llvm_amdgcn_raw_buffer_load_fp32x2(src_wave_buffer_resource,
+                                                      src_thread_addr_offset,
+                                                      src_wave_addr_offset,
+                                                      static_cast<index_t>(coherence));
         }
         else if constexpr(N == 4)
         {
-            return llvm_amdgcn_raw_buffer_load_fp32x4(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            return llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
+                                                      src_thread_addr_offset,
+                                                      src_wave_addr_offset,
+                                                      static_cast<index_t>(coherence));
         }
         else if constexpr(N == 8)
         {
             vector_type<float, 8> tmp;
 
-            tmp.AsType<float4_t>()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_fp32x4(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            tmp.AsType<float4_t>()(Number<0>{}) =
+                llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
+                                                   src_thread_addr_offset,
+                                                   src_wave_addr_offset,
+                                                   static_cast<index_t>(coherence));
 
             tmp.AsType<float4_t>()(Number<1>{}) =
                 llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
                                                    src_thread_addr_offset,
                                                    src_wave_addr_offset + 4 * sizeof(float),
-                                                   0);
+                                                   static_cast<index_t>(coherence));
 
             return tmp.AsType<float8_t>()(Number<0>{});
         }
@@ -372,24 +405,32 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w
     {
         if constexpr(N == 1)
         {
-            return llvm_amdgcn_raw_buffer_load_fp16(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            return llvm_amdgcn_raw_buffer_load_fp16(src_wave_buffer_resource,
+                                                    src_thread_addr_offset,
+                                                    src_wave_addr_offset,
+                                                    static_cast<index_t>(coherence));
         }
         else if constexpr(N == 2)
         {
-            return llvm_amdgcn_raw_buffer_load_fp16x2(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            return llvm_amdgcn_raw_buffer_load_fp16x2(src_wave_buffer_resource,
+                                                      src_thread_addr_offset,
+                                                      src_wave_addr_offset,
+                                                      static_cast<index_t>(coherence));
         }
         else if constexpr(N == 4)
         {
-            return llvm_amdgcn_raw_buffer_load_fp16x4(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            return llvm_amdgcn_raw_buffer_load_fp16x4(src_wave_buffer_resource,
+                                                      src_thread_addr_offset,
+                                                      src_wave_addr_offset,
+                                                      static_cast<index_t>(coherence));
         }
         else if constexpr(N == 8)
         {
             // use fp32 load to mimic fp16 load
-            float4_t tmp = llvm_amdgcn_raw_buffer_load_fp32x4(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            float4_t tmp = llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
+                                                              src_thread_addr_offset,
+                                                              src_wave_addr_offset,
+                                                              static_cast<index_t>(coherence));
 
             return bit_cast<half8_t>(tmp);
         }
@@ -398,23 +439,31 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w
     {
         if constexpr(N == 1)
         {
-            return llvm_amdgcn_raw_buffer_load_i16(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            return llvm_amdgcn_raw_buffer_load_i16(src_wave_buffer_resource,
+                                                   src_thread_addr_offset,
+                                                   src_wave_addr_offset,
+                                                   static_cast<index_t>(coherence));
         }
         else if constexpr(N == 2)
         {
-            return llvm_amdgcn_raw_buffer_load_i16x2(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            return llvm_amdgcn_raw_buffer_load_i16x2(src_wave_buffer_resource,
+                                                     src_thread_addr_offset,
+                                                     src_wave_addr_offset,
+                                                     static_cast<index_t>(coherence));
         }
         else if constexpr(N == 4)
         {
-            return llvm_amdgcn_raw_buffer_load_i16x4(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            return llvm_amdgcn_raw_buffer_load_i16x4(src_wave_buffer_resource,
+                                                     src_thread_addr_offset,
+                                                     src_wave_addr_offset,
+                                                     static_cast<index_t>(coherence));
         }
         else if constexpr(N == 8)
         {
-            int32x4_t tmp = llvm_amdgcn_raw_buffer_load_i32x4(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            int32x4_t tmp = llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
+                                                              src_thread_addr_offset,
+                                                              src_wave_addr_offset,
+                                                              static_cast<index_t>(coherence));
 
             return bit_cast<bhalf8_t>(tmp);
         }
@@ -423,31 +472,40 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w
     {
         if constexpr(N == 1)
         {
-            return llvm_amdgcn_raw_buffer_load_i32(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            return llvm_amdgcn_raw_buffer_load_i32(src_wave_buffer_resource,
+                                                   src_thread_addr_offset,
+                                                   src_wave_addr_offset,
+                                                   static_cast<index_t>(coherence));
         }
         else if constexpr(N == 2)
         {
-            return llvm_amdgcn_raw_buffer_load_i32x2(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            return llvm_amdgcn_raw_buffer_load_i32x2(src_wave_buffer_resource,
+                                                     src_thread_addr_offset,
+                                                     src_wave_addr_offset,
+                                                     static_cast<index_t>(coherence));
         }
         else if constexpr(N == 4)
         {
-            return llvm_amdgcn_raw_buffer_load_i32x4(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            return llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
+                                                     src_thread_addr_offset,
+                                                     src_wave_addr_offset,
+                                                     static_cast<index_t>(coherence));
         }
         else if constexpr(N == 8)
         {
             vector_type<int32_t, 8> tmp;
 
-            tmp.AsType<int32x4_t>()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_i32x4(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            tmp.AsType<int32x4_t>()(Number<0>{}) =
+                llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
+                                                  src_thread_addr_offset,
+                                                  src_wave_addr_offset,
+                                                  static_cast<index_t>(coherence));
 
             tmp.AsType<int32x4_t>()(Number<1>{}) =
                 llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
                                                   src_thread_addr_offset,
                                                   src_wave_addr_offset + 4 * sizeof(int32_t),
-                                                  0);
+                                                  static_cast<index_t>(coherence));
             return tmp.AsType<int32x8_t>()(Number<0>{});
         }
     }
@@ -455,17 +513,23 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w
     {
         if constexpr(N == 1)
         {
-            return llvm_amdgcn_raw_buffer_load_i8(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            return llvm_amdgcn_raw_buffer_load_i8(src_wave_buffer_resource,
+                                                  src_thread_addr_offset,
+                                                  src_wave_addr_offset,
+                                                  static_cast<index_t>(coherence));
         }
         else if constexpr(N == 2)
         {
 #if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
-            return llvm_amdgcn_raw_buffer_load_i8x2(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            return llvm_amdgcn_raw_buffer_load_i8x2(src_wave_buffer_resource,
+                                                    src_thread_addr_offset,
+                                                    src_wave_addr_offset,
+                                                    static_cast<index_t>(coherence));
 #else
-            int16_t tmp = llvm_amdgcn_raw_buffer_load_i16(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            int16_t tmp = llvm_amdgcn_raw_buffer_load_i16(src_wave_buffer_resource,
+                                                          src_thread_addr_offset,
+                                                          src_wave_addr_offset,
+                                                          static_cast<index_t>(coherence));
 
             return bit_cast<int8x2_t>(tmp);
 #endif
@@ -473,11 +537,15 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w
         else if constexpr(N == 4)
         {
 #if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
-            return llvm_amdgcn_raw_buffer_load_i8x4(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            return llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
+                                                    src_thread_addr_offset,
+                                                    src_wave_addr_offset,
+                                                    static_cast<index_t>(coherence));
 #else
-            int32_t tmp = llvm_amdgcn_raw_buffer_load_i32(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            int32_t tmp = llvm_amdgcn_raw_buffer_load_i32(src_wave_buffer_resource,
+                                                          src_thread_addr_offset,
+                                                          src_wave_addr_offset,
+                                                          static_cast<index_t>(coherence));
 
             return bit_cast<int8x4_t>(tmp);
 #endif
@@ -487,19 +555,24 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w
 #if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
             vector_type<int8_t, 8> tmp;
 
-            tmp.AsType<int8x4_t>()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_i8x4(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            tmp.AsType<int8x4_t>()(Number<0>{}) =
+                llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
+                                                 src_thread_addr_offset,
+                                                 src_wave_addr_offset,
+                                                 static_cast<index_t>(coherence));
 
             tmp.AsType<int8x4_t>()(Number<1>{}) =
                 llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
                                                  src_thread_addr_offset,
                                                  src_wave_addr_offset + 4 * sizeof(int8_t),
-                                                 0);
+                                                 static_cast<index_t>(coherence));
 
             return tmp.AsType<int8x8_t>()(Number<0>{});
 #else
-            int32x2_t tmp = llvm_amdgcn_raw_buffer_load_i32x2(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            int32x2_t tmp = llvm_amdgcn_raw_buffer_load_i32x2(src_wave_buffer_resource,
+                                                              src_thread_addr_offset,
+                                                              src_wave_addr_offset,
+                                                              static_cast<index_t>(coherence));
 
             return bit_cast<int8x8_t>(tmp);
 #endif
@@ -509,31 +582,36 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w
 #if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
             vector_type<int8_t, 16> tmp;
 
-            tmp.AsType<int8x4_t>()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_i8x4(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            tmp.AsType<int8x4_t>()(Number<0>{}) =
+                llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
+                                                 src_thread_addr_offset,
+                                                 src_wave_addr_offset,
+                                                 static_cast<index_t>(coherence));
 
             tmp.AsType<int8x4_t>()(Number<1>{}) =
                 llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
                                                  src_thread_addr_offset,
                                                  src_wave_addr_offset + 4 * sizeof(int8_t),
-                                                 0);
+                                                 static_cast<index_t>(coherence));
 
             tmp.AsType<int8x4_t>()(Number<2>{}) =
                 llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
                                                  src_thread_addr_offset,
                                                  src_wave_addr_offset + 8 * sizeof(int8_t),
-                                                 0);
+                                                 static_cast<index_t>(coherence));
 
             tmp.AsType<int8x4_t>()(Number<3>{}) =
                 llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
                                                  src_thread_addr_offset,
                                                  src_wave_addr_offset + 12 * sizeof(int8_t),
-                                                 0);
+                                                 static_cast<index_t>(coherence));
 
             return tmp.AsType<int8x16_t>()(Number<0>{});
 #else
-            int32x4_t tmp = llvm_amdgcn_raw_buffer_load_i32x4(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            int32x4_t tmp = llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
+                                                              src_thread_addr_offset,
+                                                              src_wave_addr_offset,
+                                                              static_cast<index_t>(coherence));
 
             return bit_cast<int8x16_t>(tmp);
 #endif
@@ -541,7 +619,9 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w
     }
 }
 
-template <typename T, index_t N>
+template <typename T,
+          index_t N,
+          AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
 __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src_thread_data,
                                       int32x4_t dst_wave_buffer_resource,
                                       index_t dst_thread_addr_offset,
@@ -565,7 +645,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                                 dst_wave_buffer_resource,
                                                 dst_thread_addr_offset,
                                                 dst_wave_addr_offset,
-                                                0);
+                                                static_cast<index_t>(coherence));
         }
         else if constexpr(N == 2)
         {
@@ -573,7 +653,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                                 dst_wave_buffer_resource,
                                                 dst_thread_addr_offset,
                                                 dst_wave_addr_offset,
-                                                0);
+                                                static_cast<index_t>(coherence));
         }
     }
     else if constexpr(is_same<T, float>::value)
@@ -584,7 +664,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset,
-                                              0);
+                                              static_cast<index_t>(coherence));
         }
         else if constexpr(N == 2)
         {
@@ -592,7 +672,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                                 dst_wave_buffer_resource,
                                                 dst_thread_addr_offset,
                                                 dst_wave_addr_offset,
-                                                0);
+                                                static_cast<index_t>(coherence));
         }
         else if constexpr(N == 4)
         {
@@ -600,7 +680,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                                 dst_wave_buffer_resource,
                                                 dst_thread_addr_offset,
                                                 dst_wave_addr_offset,
-                                                0);
+                                                static_cast<index_t>(coherence));
         }
     }
     else if constexpr(is_same<T, half_t>::value)
@@ -611,7 +691,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset,
-                                              0);
+                                              static_cast<index_t>(coherence));
         }
         else if constexpr(N == 2)
         {
@@ -619,7 +699,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                                 dst_wave_buffer_resource,
                                                 dst_thread_addr_offset,
                                                 dst_wave_addr_offset,
-                                                0);
+                                                static_cast<index_t>(coherence));
         }
         else if constexpr(N == 4)
         {
@@ -627,7 +707,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                                 dst_wave_buffer_resource,
                                                 dst_thread_addr_offset,
                                                 dst_wave_addr_offset,
-                                                0);
+                                                static_cast<index_t>(coherence));
         }
         else if constexpr(N == 8)
         {
@@ -638,19 +718,19 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                                 dst_wave_buffer_resource,
                                                 dst_thread_addr_offset,
                                                 dst_wave_addr_offset,
-                                                0);
+                                                static_cast<index_t>(coherence));
 
             llvm_amdgcn_raw_buffer_store_fp16x4(tmp.AsType<half4_t>()[Number<1>{}],
                                                 dst_wave_buffer_resource,
                                                 dst_thread_addr_offset,
                                                 dst_wave_addr_offset + 4 * sizeof(half_t),
-                                                0);
+                                                static_cast<index_t>(coherence));
 #else
             llvm_amdgcn_raw_buffer_store_fp32x4(bit_cast<float4_t>(src_thread_data),
                                                 dst_wave_buffer_resource,
                                                 dst_thread_addr_offset,
                                                 dst_wave_addr_offset,
-                                                0);
+                                                static_cast<index_t>(coherence));
 #endif
         }
     }
@@ -662,7 +742,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                              dst_wave_buffer_resource,
                                              dst_thread_addr_offset,
                                              dst_wave_addr_offset,
-                                             0);
+                                             static_cast<index_t>(coherence));
         }
         else if constexpr(N == 2)
         {
@@ -670,7 +750,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset,
-                                               0);
+                                               static_cast<index_t>(coherence));
         }
         else if constexpr(N == 4)
         {
@@ -678,7 +758,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset,
-                                               0);
+                                               static_cast<index_t>(coherence));
         }
         else if constexpr(N == 8)
         {
@@ -688,13 +768,13 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset,
-                                               0);
+                                               static_cast<index_t>(coherence));
 
             llvm_amdgcn_raw_buffer_store_i16x4(tmp.AsType<bhalf4_t>()[Number<1>{}],
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset + 4 * sizeof(bhalf_t),
-                                               0);
+                                               static_cast<index_t>(coherence));
         }
     }
     else if constexpr(is_same<T, int32_t>::value)
@@ -705,7 +785,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                              dst_wave_buffer_resource,
                                              dst_thread_addr_offset,
                                              dst_wave_addr_offset,
-                                             0);
+                                             static_cast<index_t>(coherence));
         }
         else if constexpr(N == 2)
         {
@@ -713,7 +793,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset,
-                                               0);
+                                               static_cast<index_t>(coherence));
         }
         else if constexpr(N == 4)
         {
@@ -721,7 +801,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset,
-                                               0);
+                                               static_cast<index_t>(coherence));
         }
     }
     else if constexpr(is_same<T, int8_t>::value)
@@ -732,7 +812,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                             dst_wave_buffer_resource,
                                             dst_thread_addr_offset,
                                             dst_wave_addr_offset,
-                                            0);
+                                            static_cast<index_t>(coherence));
         }
         else if constexpr(N == 2)
         {
@@ -741,13 +821,13 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset,
-                                              0);
+                                              static_cast<index_t>(coherence));
 #else
             llvm_amdgcn_raw_buffer_store_i16(bit_cast<int16_t>(src_thread_data),
                                              dst_wave_buffer_resource,
                                              dst_thread_addr_offset,
                                              dst_wave_addr_offset,
-                                             0);
+                                             static_cast<index_t>(coherence));
 #endif
         }
         else if constexpr(N == 4)
@@ -757,13 +837,13 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset,
-                                              0);
+                                              static_cast<index_t>(coherence));
 #else
             llvm_amdgcn_raw_buffer_store_i32(bit_cast<int32_t>(src_thread_data),
                                              dst_wave_buffer_resource,
                                              dst_thread_addr_offset,
                                              dst_wave_addr_offset,
-                                             0);
+                                             static_cast<index_t>(coherence));
 #endif
         }
         else if constexpr(N == 8)
@@ -772,7 +852,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset,
-                                               0);
+                                               static_cast<index_t>(coherence));
         }
         else if constexpr(N == 16)
         {
@@ -780,7 +860,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset,
-                                               0);
+                                               static_cast<index_t>(coherence));
         }
     }
 }
@@ -1012,7 +1092,9 @@ __device__ void amd_buffer_atomic_max_impl(const typename vector_type<T, N>::typ
 //   1) p_src_wave must point to global memory space
 //   2) p_src_wave must be a wavewise pointer.
 // It is user's responsibility to make sure that is true.
-template <typename T, index_t N>
+template <typename T,
+          index_t N,
+          AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
 __device__ typename vector_type_maker<T, N>::type::type
 amd_buffer_load_invalid_element_return_zero(const T* p_src_wave,
                                             index_t src_thread_element_offset,
@@ -1032,10 +1114,10 @@ amd_buffer_load_invalid_element_return_zero(const T* p_src_wave,
 #if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
     uint32_t src_addr_shift = src_thread_element_valid ? 0 : 0x80000000;
 
-    return amd_buffer_load_impl<scalar_t, vector_size>(
+    return amd_buffer_load_impl<scalar_t, vector_size, coherence>(
         src_wave_buffer_resource, src_addr_shift + src_thread_addr_offset, 0);
 #else
-    vector_t tmp = amd_buffer_load_impl<scalar_t, vector_size>(
+    vector_t tmp = amd_buffer_load_impl<scalar_t, vector_size, coherence>(
         src_wave_buffer_resource, src_thread_addr_offset, 0);
 
     return src_thread_element_valid ? tmp : vector_t(0);
@@ -1046,7 +1128,9 @@ amd_buffer_load_invalid_element_return_zero(const T* p_src_wave,
 //   1) p_src_wave must point to global memory space
 //   2) p_src_wave must be a wavewise pointer.
 // It is user's responsibility to make sure that is true.
-template <typename T, index_t N>
+template <typename T,
+          index_t N,
+          AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
 __device__ typename vector_type_maker<T, N>::type::type
 amd_buffer_load_invalid_element_return_customized_value(const T* p_src_wave,
                                                         index_t src_thread_element_offset,
@@ -1064,7 +1148,7 @@ amd_buffer_load_invalid_element_return_customized_value(const T* p_src_wave,
 
     constexpr index_t vector_size = scalar_type<vector_t>::vector_size;
 
-    vector_t tmp = amd_buffer_load_impl<scalar_t, vector_size>(
+    vector_t tmp = amd_buffer_load_impl<scalar_t, vector_size, coherence>(
         src_wave_buffer_resource, src_thread_addr_offset, 0);
 
     return src_thread_element_valid ? tmp : vector_t(customized_value);
@@ -1074,7 +1158,9 @@ amd_buffer_load_invalid_element_return_customized_value(const T* p_src_wave,
 //   1) p_dst_wave must point to global memory
 //   2) p_dst_wave must be a wavewise pointer.
 // It is user's responsibility to make sure that is true.
-template <typename T, index_t N>
+template <typename T,
+          index_t N,
+          AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
 __device__ void amd_buffer_store(const typename vector_type_maker<T, N>::type::type src_thread_data,
                                  T* p_dst_wave,
                                  const index_t dst_thread_element_offset,
@@ -1093,12 +1179,12 @@ __device__ void amd_buffer_store(const typename vector_type_maker<T, N>::type::t
 #if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
     uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
 
-    amd_buffer_store_impl<scalar_t, vector_size>(
+    amd_buffer_store_impl<scalar_t, vector_size, coherence>(
         src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
 #else
     if(dst_thread_element_valid)
     {
-        amd_buffer_store_impl<scalar_t, vector_size>(
+        amd_buffer_store_impl<scalar_t, vector_size, coherence>(
             src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
     }
 #endif
diff --git a/include/ck/utility/amd_inline_asm.hpp b/include/ck/utility/amd_inline_asm.hpp
index 1f7df70bcd565dcba4d0aee3a49aca32f835bb45..43baa817d3672c0b2eed6059a88351aeb19a5209 100644
--- a/include/ck/utility/amd_inline_asm.hpp
+++ b/include/ck/utility/amd_inline_asm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_AMD_INLINE_ASM_HPP
 #define CK_AMD_INLINE_ASM_HPP
diff --git a/include/ck/utility/amd_wave_read_first_lane.hpp b/include/ck/utility/amd_wave_read_first_lane.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..4652ce7a747dab404fad36c37439b50f1346381c
--- /dev/null
+++ b/include/ck/utility/amd_wave_read_first_lane.hpp
@@ -0,0 +1,83 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/ck.hpp"
+#include "ck/utility/functional2.hpp"
+#include "ck/utility/math.hpp"
+
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+namespace ck {
+namespace detail {
+
+template <unsigned Size>
+struct get_unsigned_int;
+
+template <>
+struct get_unsigned_int<1>
+{
+    using type = uint8_t;
+};
+
+template <>
+struct get_unsigned_int<2>
+{
+    using type = uint16_t;
+};
+
+template <>
+struct get_unsigned_int<4>
+{
+    using type = uint32_t;
+};
+
+template <unsigned Size>
+using get_unsigned_int_t = typename get_unsigned_int<Size>::type;
+
+} // namespace detail
+
+__device__ inline int32_t amd_wave_read_first_lane(int32_t value)
+{
+    return __builtin_amdgcn_readfirstlane(value);
+}
+
+template <
+    typename Object,
+    typename = std::enable_if_t<std::is_class_v<Object> && std::is_trivially_copyable_v<Object>>>
+__device__ auto amd_wave_read_first_lane(const Object& obj)
+{
+    using Size                = unsigned;
+    constexpr Size SgprSize   = 4;
+    constexpr Size ObjectSize = sizeof(Object);
+
+    auto* const from_obj = reinterpret_cast<const std::byte*>(&obj);
+    alignas(Object) std::byte to_obj[ObjectSize];
+
+    constexpr Size RemainedSize             = ObjectSize % SgprSize;
+    constexpr Size CompleteSgprCopyBoundary = ObjectSize - RemainedSize;
+    for(Size offset = 0; offset < CompleteSgprCopyBoundary; offset += SgprSize)
+    {
+        using Sgpr = detail::get_unsigned_int_t<SgprSize>;
+
+        *reinterpret_cast<Sgpr*>(to_obj + offset) =
+            amd_wave_read_first_lane(*reinterpret_cast<const Sgpr*>(from_obj + offset));
+    }
+
+    if constexpr(0 < RemainedSize)
+    {
+        using Carrier = detail::get_unsigned_int_t<RemainedSize>;
+
+        *reinterpret_cast<Carrier>(to_obj + CompleteSgprCopyBoundary) = amd_wave_read_first_lane(
+            *reinterpret_cast<const Carrier*>(from_obj + CompleteSgprCopyBoundary));
+    }
+
+    /// NOTE: Implicitly start object lifetime. It's better to use std::start_lifetime_at() in this
+    /// scenario
+    return *reinterpret_cast<Object*>(to_obj);
+}
+
+} // namespace ck
diff --git a/include/ck/utility/amd_wmma.hpp b/include/ck/utility/amd_wmma.hpp
index bf091425485d75a244399f4781e1d44c4b38dd2c..dd7f0b770a11470037b1d33164d830cb9819e8d1 100644
--- a/include/ck/utility/amd_wmma.hpp
+++ b/include/ck/utility/amd_wmma.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_AMD_WMMA_HPP
 #define CK_AMD_WMMA_HPP
diff --git a/include/ck/utility/amd_xdlops.hpp b/include/ck/utility/amd_xdlops.hpp
index a742496fc1f2fcf2ef529bbf079c481bc8d99f03..f4a04e2814c6875d7f09b582aee00dba8605f147 100644
--- a/include/ck/utility/amd_xdlops.hpp
+++ b/include/ck/utility/amd_xdlops.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_AMD_XDLOPS_HPP
 #define CK_AMD_XDLOPS_HPP
diff --git a/include/ck/utility/array.hpp b/include/ck/utility/array.hpp
index 370a457fe9d97a621e6f91e78f14733398ebf756..f63ce5e5a07a796888cb60ae8da0c855df75e7ff 100644
--- a/include/ck/utility/array.hpp
+++ b/include/ck/utility/array.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_ARRAY_HPP
 #define CK_ARRAY_HPP
diff --git a/include/ck/utility/array_multi_index.hpp b/include/ck/utility/array_multi_index.hpp
index 9b8d5b95e9f6241b574104313b5b8fa2984ab18d..c0c1ea65fc77dd625aaaf6456a6628e09df27b51 100644
--- a/include/ck/utility/array_multi_index.hpp
+++ b/include/ck/utility/array_multi_index.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_ARRAY_MULTI_INDEX_HPP
 #define CK_ARRAY_MULTI_INDEX_HPP
diff --git a/include/ck/utility/c_style_pointer_cast.hpp b/include/ck/utility/c_style_pointer_cast.hpp
index 6e8b0081587afcb2db159a05ecd5fb940def68ff..610e393a77216500448c3682b5db9b9860a077da 100644
--- a/include/ck/utility/c_style_pointer_cast.hpp
+++ b/include/ck/utility/c_style_pointer_cast.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_C_STYLE_POINTER_CAST_HPP
 #define CK_C_STYLE_POINTER_CAST_HPP
diff --git a/include/ck/utility/common_header.hpp b/include/ck/utility/common_header.hpp
index 1378bbe448e2e1862fed04596fba7f4b106d03b7..41a9d0b5852491113fb2df52477e31df37c4886b 100644
--- a/include/ck/utility/common_header.hpp
+++ b/include/ck/utility/common_header.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -33,6 +33,7 @@
 #include "ck/utility/debug.hpp"
 
 #include "ck/utility/amd_buffer_addressing.hpp"
+#include "ck/utility/amd_wave_read_first_lane.hpp"
 #include "ck/utility/generic_memory_space_atomic.hpp"
 #include "ck/utility/get_id.hpp"
 #include "ck/utility/thread_group.hpp"
diff --git a/include/ck/utility/container_element_picker.hpp b/include/ck/utility/container_element_picker.hpp
index abc5185e04a5079edb95fe15bfcd6788256d384a..838147e420cc7c95b63273b32ae3b2be2429c017 100644
--- a/include/ck/utility/container_element_picker.hpp
+++ b/include/ck/utility/container_element_picker.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_CONTAINER_ELEMENT_PICKER_HPP
 #define CK_CONTAINER_ELEMENT_PICKER_HPP
diff --git a/include/ck/utility/container_helper.hpp b/include/ck/utility/container_helper.hpp
index c8b02bc5acaf8542bf028fff39c42061db3b3296..9c7b954565d386a8fdecd21052b102e750ab7102 100644
--- a/include/ck/utility/container_helper.hpp
+++ b/include/ck/utility/container_helper.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_CONTAINER_HELPER_HPP
 #define CK_CONTAINER_HELPER_HPP
diff --git a/include/ck/utility/debug.hpp b/include/ck/utility/debug.hpp
index 593bbb711672f7dc6e3db22d877bf249b5dff56d..80346f0d9f6f9e5a6a28dcab4e8a7666639394ca 100644
--- a/include/ck/utility/debug.hpp
+++ b/include/ck/utility/debug.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef UTILITY_DEBUG_HPP
 #define UTILITY_DEBUG_HPP
diff --git a/include/ck/utility/dynamic_buffer.hpp b/include/ck/utility/dynamic_buffer.hpp
index c6f0d299ef3c35c63ad97424888545edd2874914..02d61f34ed5192480d09f4432eec30e65e7264b9 100644
--- a/include/ck/utility/dynamic_buffer.hpp
+++ b/include/ck/utility/dynamic_buffer.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -19,7 +19,8 @@ namespace ck {
 template <AddressSpaceEnum BufferAddressSpace,
           typename T,
           typename ElementSpaceSize,
-          bool InvalidElementUseNumericalZeroValue>
+          bool InvalidElementUseNumericalZeroValue,
+          AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
 struct DynamicBuffer
 {
     using type = T;
@@ -77,13 +78,16 @@ struct DynamicBuffer
 
             if constexpr(InvalidElementUseNumericalZeroValue)
             {
-                return amd_buffer_load_invalid_element_return_zero<remove_cvref_t<T>, t_per_x>(
+                return amd_buffer_load_invalid_element_return_zero<remove_cvref_t<T>,
+                                                                   t_per_x,
+                                                                   coherence>(
                     p_data_, i, is_valid_element, element_space_size_);
             }
             else
             {
                 return amd_buffer_load_invalid_element_return_customized_value<remove_cvref_t<T>,
-                                                                               t_per_x>(
+                                                                               t_per_x,
+                                                                               coherence>(
                     p_data_, i, is_valid_element, element_space_size_, invalid_element_value_);
             }
         }
@@ -173,7 +177,7 @@ struct DynamicBuffer
         {
             constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
 
-            amd_buffer_store<remove_cvref_t<T>, t_per_x>(
+            amd_buffer_store<remove_cvref_t<T>, t_per_x, coherence>(
                 x, p_data_, i, is_valid_element, element_space_size_);
         }
         else if constexpr(GetAddressSpace() == AddressSpaceEnum::Lds &&
@@ -376,14 +380,19 @@ struct DynamicBuffer
     __host__ __device__ static constexpr bool IsDynamicBuffer() { return true; }
 };
 
-template <AddressSpaceEnum BufferAddressSpace, typename T, typename ElementSpaceSize>
+template <AddressSpaceEnum BufferAddressSpace,
+          AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence,
+          typename T,
+          typename ElementSpaceSize>
 __host__ __device__ constexpr auto make_dynamic_buffer(T* p, ElementSpaceSize element_space_size)
 {
-    return DynamicBuffer<BufferAddressSpace, T, ElementSpaceSize, true>{p, element_space_size};
+    return DynamicBuffer<BufferAddressSpace, T, ElementSpaceSize, true, coherence>{
+        p, element_space_size};
 }
 
 template <
     AddressSpaceEnum BufferAddressSpace,
+    AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence,
     typename T,
     typename ElementSpaceSize,
     typename X,
@@ -391,7 +400,7 @@ template <
 __host__ __device__ constexpr auto
 make_dynamic_buffer(T* p, ElementSpaceSize element_space_size, X invalid_element_value)
 {
-    return DynamicBuffer<BufferAddressSpace, T, ElementSpaceSize, false>{
+    return DynamicBuffer<BufferAddressSpace, T, ElementSpaceSize, false, coherence>{
         p, element_space_size, invalid_element_value};
 }
 
diff --git a/include/ck/utility/enable_if.hpp b/include/ck/utility/enable_if.hpp
index 297434b0dddd5f1a680176c3bd099bedfda8dff4..c0a3c99f1fdafea9f151fe9fc319c2f7aaa0ffda 100644
--- a/include/ck/utility/enable_if.hpp
+++ b/include/ck/utility/enable_if.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/functional.hpp b/include/ck/utility/functional.hpp
index 08e730782f386cf5788c64bc04a008dc6cb37b28..91797d24092e3e32ad4a6bd40958952b124d9978 100644
--- a/include/ck/utility/functional.hpp
+++ b/include/ck/utility/functional.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/functional2.hpp b/include/ck/utility/functional2.hpp
index 6f125ca4c944777d02ae1083334bec4602ad68c4..99c65f4eb85b67231557b1916fae10c6568e676d 100644
--- a/include/ck/utility/functional2.hpp
+++ b/include/ck/utility/functional2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/functional3.hpp b/include/ck/utility/functional3.hpp
index 06b67ef7e3fdec25a1ab2e18e23c904342233ebe..97605a7adeb8ae64e4e5a32debe9386295923b6c 100644
--- a/include/ck/utility/functional3.hpp
+++ b/include/ck/utility/functional3.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/functional4.hpp b/include/ck/utility/functional4.hpp
index 6eeaf15c9b7ac283f7fcac96c195d28f179b6065..b5f3df8d7c517dfaf01320e41721da174883c2d9 100644
--- a/include/ck/utility/functional4.hpp
+++ b/include/ck/utility/functional4.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_FUNCTIONAL4_HPP
 #define CK_FUNCTIONAL4_HPP
diff --git a/include/ck/utility/generic_memory_space_atomic.hpp b/include/ck/utility/generic_memory_space_atomic.hpp
index 6a1ca966521710ecdb4ceaad9a83457031f4c508..98f40a4363aa2ddf2908fa497151f756b77d6f94 100644
--- a/include/ck/utility/generic_memory_space_atomic.hpp
+++ b/include/ck/utility/generic_memory_space_atomic.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 #include "data_type.hpp"
diff --git a/include/ck/utility/get_id.hpp b/include/ck/utility/get_id.hpp
index 44ff438155d2a9c9ba9b0925d4e9fe95c1fa6bce..77564c6130baf45bfd331e69fc437fb7c7c96d18 100644
--- a/include/ck/utility/get_id.hpp
+++ b/include/ck/utility/get_id.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/ignore.hpp b/include/ck/utility/ignore.hpp
index ac33cbf9a508f5e6402d17d6285ae34a8196f6eb..f70a182fd4e5c6cc623acb2c65b5551b2a1acd14 100644
--- a/include/ck/utility/ignore.hpp
+++ b/include/ck/utility/ignore.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/inner_product.hpp b/include/ck/utility/inner_product.hpp
index b65640bfffed31e319549b7152f0c2a93bfaded9..7828d21d7ff2420b1118881ccc0e3c11261e9ba3 100644
--- a/include/ck/utility/inner_product.hpp
+++ b/include/ck/utility/inner_product.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 #include "data_type.hpp"
diff --git a/include/ck/utility/integral_constant.hpp b/include/ck/utility/integral_constant.hpp
index 9aab4e24214a884bdfb391e0f9040bb3af2630ec..376070eb3d8ac326603b71e52e76949c168f4219 100644
--- a/include/ck/utility/integral_constant.hpp
+++ b/include/ck/utility/integral_constant.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/is_known_at_compile_time.hpp b/include/ck/utility/is_known_at_compile_time.hpp
index 8198154422e5d5228b79c6539bddcd5070d1e25c..2cafc3e6f2fafd247e378446a15a7e16c019c914 100644
--- a/include/ck/utility/is_known_at_compile_time.hpp
+++ b/include/ck/utility/is_known_at_compile_time.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/magic_division.hpp b/include/ck/utility/magic_division.hpp
index a5e8e9216519074d6542d379512c5b8d13ec21c9..f19030d4e9ca38971f3ffbfd5e4ff6b951eb16b3 100644
--- a/include/ck/utility/magic_division.hpp
+++ b/include/ck/utility/magic_division.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/math.hpp b/include/ck/utility/math.hpp
index 72071992f65d934e91a6976c2cce3084b4a2d4ca..326b0e61eff611512bd668355358de421f9f376a 100644
--- a/include/ck/utility/math.hpp
+++ b/include/ck/utility/math.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/math_v2.hpp b/include/ck/utility/math_v2.hpp
index a3732b2fe0f0d813a281cefca403f317a6daef81..1cac2cc0c7172b4b1915845d6c7f3fa853653683 100644
--- a/include/ck/utility/math_v2.hpp
+++ b/include/ck/utility/math_v2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/multi_index.hpp b/include/ck/utility/multi_index.hpp
index 1d544c0906cae3c61f4d7ce27e74e7636c3919b5..9f7ba8bff63b3f6e625b98f66accd875ee118594 100644
--- a/include/ck/utility/multi_index.hpp
+++ b/include/ck/utility/multi_index.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/number.hpp b/include/ck/utility/number.hpp
index f3ca6b61dc6ac08330a6cf1633148bb9bad8cd81..d29afd31a7ef03ef3fdbf11f8e9a88132e07fe82 100644
--- a/include/ck/utility/number.hpp
+++ b/include/ck/utility/number.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_NUMBER_HPP
 #define CK_NUMBER_HPP
diff --git a/include/ck/utility/reduction_common.hpp b/include/ck/utility/reduction_common.hpp
index aceef7b296da7466c484aad8033fe1f949662fca..3777d297c8c924cbf4a58a4fa79751be2cfa2f44 100644
--- a/include/ck/utility/reduction_common.hpp
+++ b/include/ck/utility/reduction_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/reduction_enums.hpp b/include/ck/utility/reduction_enums.hpp
index 67856331059cef628a62e4b2223f262b839bbf02..23b7149f8eb022a00d1aed81b4b69c315ca37ba0 100644
--- a/include/ck/utility/reduction_enums.hpp
+++ b/include/ck/utility/reduction_enums.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/reduction_functions_accumulate.hpp b/include/ck/utility/reduction_functions_accumulate.hpp
index 724e5599d6c878ba08bf14429c40c0958ab9f2a2..b9765ff0d2cb57bb2a1a0c4c7684a3cd2e0fa008 100644
--- a/include/ck/utility/reduction_functions_accumulate.hpp
+++ b/include/ck/utility/reduction_functions_accumulate.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/reduction_operator.hpp b/include/ck/utility/reduction_operator.hpp
index b4e770a64efdda0b42e7996e2858d0f162231fee..0f5b73cb03363e69f44a4332c00f49e345733403 100644
--- a/include/ck/utility/reduction_operator.hpp
+++ b/include/ck/utility/reduction_operator.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/sequence.hpp b/include/ck/utility/sequence.hpp
index 97b597221c2850d4b28b644266978fc56b295913..d6bfb2eba1cd6a6de2e334261379504d8448b3c7 100644
--- a/include/ck/utility/sequence.hpp
+++ b/include/ck/utility/sequence.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/sequence_helper.hpp b/include/ck/utility/sequence_helper.hpp
index db25c27e70c3653f94524367b8a6bce79480113e..8c493a28221eae4e095f051264f9323429984531 100644
--- a/include/ck/utility/sequence_helper.hpp
+++ b/include/ck/utility/sequence_helper.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/span.hpp b/include/ck/utility/span.hpp
index 1e501214547cfbcec25921d6526e77563504835a..5e7567a847f3d18e424437005d876a409cd0d99d 100644
--- a/include/ck/utility/span.hpp
+++ b/include/ck/utility/span.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/static_buffer.hpp b/include/ck/utility/static_buffer.hpp
index dd25c96203288cfc9aa0959e9a7b1d83ef4abfc2..835f5657307a8e171b116c0bec088bb84b1b7b38 100644
--- a/include/ck/utility/static_buffer.hpp
+++ b/include/ck/utility/static_buffer.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/statically_indexed_array.hpp b/include/ck/utility/statically_indexed_array.hpp
index 3438776f413cf010664cc8aa4a18c09bf161fae7..a2d70045a4b9b221651d958472aa6a2e721ccccc 100644
--- a/include/ck/utility/statically_indexed_array.hpp
+++ b/include/ck/utility/statically_indexed_array.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_STATICALLY_INDEXED_ARRAY_HPP
 #define CK_STATICALLY_INDEXED_ARRAY_HPP
diff --git a/include/ck/utility/statically_indexed_array_multi_index.hpp b/include/ck/utility/statically_indexed_array_multi_index.hpp
index 21b2941b21401ac561932a45ec304536365f2927..4a8b96ae8a6b81400f3567957f07010c26c0bf8d 100644
--- a/include/ck/utility/statically_indexed_array_multi_index.hpp
+++ b/include/ck/utility/statically_indexed_array_multi_index.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_STATICALLY_INDEXED_ARRAY_MULTI_INDEX_HPP
 #define CK_STATICALLY_INDEXED_ARRAY_MULTI_INDEX_HPP
diff --git a/include/ck/utility/synchronization.hpp b/include/ck/utility/synchronization.hpp
index 0e247ed0f8df15b890cdd44ea9f1682d5823a89f..775e7ac3a392f5edd16054e63314bc80faca5d6e 100644
--- a/include/ck/utility/synchronization.hpp
+++ b/include/ck/utility/synchronization.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/thread_group.hpp b/include/ck/utility/thread_group.hpp
index d469dec899a556c5b8062efa775b7dc694500fe8..1cd6b2f3ce26a9a89e98600f7377aac0d93af344 100644
--- a/include/ck/utility/thread_group.hpp
+++ b/include/ck/utility/thread_group.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/transpose_vectors.hpp b/include/ck/utility/transpose_vectors.hpp
index 2b0075d6005e3c5cf7d6772fda8132a6404878ed..6faf5c133ba2df0ed938aab915a272e3a7a590f3 100644
--- a/include/ck/utility/transpose_vectors.hpp
+++ b/include/ck/utility/transpose_vectors.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/tuple.hpp b/include/ck/utility/tuple.hpp
index d8664be550b79bc4de2edd224bc63350a0a1bde7..b616b3123f3b9ce3c92763cd36ecd39fc7d3e553 100644
--- a/include/ck/utility/tuple.hpp
+++ b/include/ck/utility/tuple.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/tuple_helper.hpp b/include/ck/utility/tuple_helper.hpp
index 6f5b142a5e7a765807bd9d7f556f7b8afc512d37..e39ae1c23d421e380b81b4bc7b487025cb6937be 100644
--- a/include/ck/utility/tuple_helper.hpp
+++ b/include/ck/utility/tuple_helper.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/utility/type.hpp b/include/ck/utility/type.hpp
index 90b9df2950b7d979f8cef386d155c72f6d7a39a5..9609afba43a78c05332d459b3a7cf756931a6e60 100644
--- a/include/ck/utility/type.hpp
+++ b/include/ck/utility/type.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp
index 46a1fa559a1a3d164acc99e536f7e26a83b97ed4..a1b1e0d91b472e5531097b13d57d6efe5b9591a7 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_backward.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_backward.hpp
index 0b621e88a0c6e890bf1efb026615fcc63fadf6c9..a2eabdf5c1c92375503fc8cc961a2cde8410204e 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_backward.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_backward.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward.hpp
index dd0db316804fd191ee73d9952f6df6779420910d..20c1fcd7367838022d7a4e2776f436ce796460fe 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer.hpp
index 463c655ac1d1414f80f9d8b7fcfc696be6326018..7d652fe4c41d044747c2c40a3d6e7383d4f1ff2e 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp
index b0149d88fdb062806d3b34b400f9b6d3389419f9..24f754e5987adfa82f4b0cc0e41dc85431f94e12 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp
index 225f7b7e36f3a5a0d70058ed7089fc693a9e0493..449734f43478f0b36215d9d876c4a7537675a13f 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp
index 7d62158f00c0aa28fdfe90d4a9df537367b22717..ec5df238ab2c2a292d8d44951f67e2d8d2383d1a 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp
index b8d47d218b95e3ff2b38c2fc965b6bdc6a2afc68..8f4182a2318017dc7239dcc32e34af5287e97b4d 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp
index be22003fd90db20eaea80045bd34f908ed8ebb61..71c84a1f5cd43811105a4630e31ade8ae0b3a075 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp
index f949f27fde973cc44eec3162c7625980a9818116..0b90b4b50eec00cbd4d807fe2f43b9280cd2bdf2 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
index be69f297b246798118688abed92c19cf7f988b63..9b797be92549703fff3397fe9f099ad3e76c8389 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_layernorm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_layernorm.hpp
index 28132aa1ebd78178774caba73b91ba3c947b66be..ce2a83da6163fb6b19e30121a48734096857c508 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_layernorm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_layernorm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp
index fedd4dce62cdcd31c2c582ab034291e9e098bc3c..6a48528c543b3ad1bcf94f92effcacbce40a8151 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp
index 2bac5bc5c8f5cd6f1629e8cd108253fc6e9fee01..9994a2f9f7c862a1bf870103f999634e17682ecf 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp
index 14a3b8c3e6099e0f5a744a1750d9dc21e5d7b6a3..696fb5eaf1413755fd3990f5db4e9b34f72fe654 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_reduce.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_reduce.hpp
index c04baca57491ddce5f4dd09469ddb116840bcecd..944f34007ef0e030e2017688477c27602df15391 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_reduce.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_reduce.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_softmax.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_softmax.hpp
index a4fd46c932c08f3d51df8db212e167d66b755c27..9916a03b9c8f2eabe20ef0c27d78def6f4b9d6a5 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_softmax.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_softmax.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_sparse_embedding3_forward_layernorm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_sparse_embedding3_forward_layernorm.hpp
index b6a9b0fb5ee4a3135750dad0d0818a0f6e439f3a..f949260ca4552b83ab6e8edafee9190d4f95cc64 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_sparse_embedding3_forward_layernorm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_sparse_embedding3_forward_layernorm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_fwd.hpp b/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_fwd.hpp
index df4fca6562755012200ff888677bcac1c90129a2..0b7887efb6e05f34b83a5bf684b5fedd3d78c385 100644
--- a/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_fwd.hpp
+++ b/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_fwd.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef NAIVE_CONV_FWD_HPP
 #define NAIVE_CONV_FWD_HPP
diff --git a/library/include/ck/library/tensor_operation_instance/add_device_operation_instance.hpp b/library/include/ck/library/tensor_operation_instance/add_device_operation_instance.hpp
index 20df1b3616a016029f4083bd821b4e286b7b727c..f57fed9c07c7843af0f77db7991ce077b3021245 100644
--- a/library/include/ck/library/tensor_operation_instance/add_device_operation_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/add_device_operation_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
index 18864395280e9463447781576428c9c68fa8f115..851d9e497ffe6a0f7b506b43be841ccb8554bd6e 100644
--- a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
+++ b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm.hpp
index bb5f971c779e870aa686a55981ec680933e383de..c3c8c0e5a76329071970a2ee9e81f297fbf11292 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add.hpp
index 0b025b33c887a377402171f48ca1ba4eed3cd219..73f6004257135b75ac0bb1cd79f92e0d9d78c3ef 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_bias_permute.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_bias_permute.hpp
index 59d50e1bd23448029054d64f9d7d557870ec9310..70bff278997501254d47595f28cb0d2d01c679e2 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_bias_permute.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_bias_permute.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_bias_softmax_gemm_permute.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_bias_softmax_gemm_permute.hpp
index 593ef7cb965da68456c6526b08cd6f72eb69c82d..33653a30800ebb7a22d333d0ccc1bd8ed4506df4 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_bias_softmax_gemm_permute.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_bias_softmax_gemm_permute.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_gemm.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_gemm.hpp
index e1a4391c4a7246377dee190e876aeb2800c5e3fa..28ccf61a3a1d46f3dfb6c7158192416622f7907a 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_gemm.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm.hpp
index 8a0b1b1fa7de54954a5bc375241bb7a0686b2e49..0a30c210dd818b7812a0ae52690afb5a7a991ea2 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute.hpp
index 34c86dd44076247e300d7b2ed409a65699652e9b..2ff64675c904f0f6e30c436e6c44b57211c00f89 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_backward.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_backward.hpp
index c84ffcff8cbb8d4820f88dbfa3e7de6a04961bb8..0e1f6f04e8c58812651d3d549601952050fb73dd 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_backward.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_backward.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp
index 8e40d60c17b8dc8d743e29cc1472a1e9eed9bdc1..8fd1c7665d143e9a0fb07f47f07ead18aa33aa88 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_infer.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_infer.hpp
index 342ade69cdf499c8a8a960038c288d4921a813b6..f6f4df7e2ec0e34b6ac7a4b356649289f6a30c91 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_infer.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_infer.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp b/library/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp
index 6a551c72683abaf3008ced53b11ae079cc4dc717..2ed8255f6d66b72cba17492351824a0007afe6dc 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp b/library/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp
index fc9ec8d615a8c15541dda9e3ef404cd31263b085..5d9567731dae2ea2d491cbeade861486f36a1743 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/convolution_backward_data.hpp b/library/include/ck/library/tensor_operation_instance/gpu/convolution_backward_data.hpp
index 07d5524762f676224b98754612e84ccbf0f49c36..1efe073669846e40274897ec33cfbf932cb31715 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/convolution_backward_data.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/convolution_backward_data.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/convolution_forward.hpp b/library/include/ck/library/tensor_operation_instance/gpu/convolution_forward.hpp
index 2c529e06fc55c1fd9c71d16c7c137a086924a75b..dd8c1987cfe65bfba4e792f269004f52cdd912ad 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/convolution_forward.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/convolution_forward.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp
index 8af400cb7ad4daa31095cea130fdb8e2f9b4a842..7e6267c87b4951c852239676302fbcb72f6b5eae 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/device_gemm_mean_squaremean_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/device_gemm_mean_squaremean_instance.hpp
index 3d0c34062cbc6c34bf82c40678e1fb56fc64fca5..b15139510935586a930cd5a7c5a091299dfddecc 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/device_gemm_mean_squaremean_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/device_gemm_mean_squaremean_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp b/library/include/ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp
index c87ae159bee2635c903490b9e09096be8c242c8c..a1c006cf62a063a742a8dd29365a0aefe359282d 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp
index 0a8f2215ba9a4667726b891b3b57994256452cd3..adac7d0dcf4a7bee2a0315add52d4c85251208dc 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -22,21 +22,41 @@ void add_device_gemm_dl_f16_f16_f16_km_kn_mn_instances(
         DeviceGemm<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
         instances);
 
+void add_device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
 void add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances(
     std::vector<std::unique_ptr<
         DeviceGemm<Col, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
         instances);
 
+void add_device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Col, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
 void add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances(
     std::vector<std::unique_ptr<
         DeviceGemm<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
         instances);
 
+void add_device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
 void add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances(
     std::vector<std::unique_ptr<
         DeviceGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
         instances);
 
+void add_device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
 void add_device_gemm_dl_f32_f32_f32_km_kn_mn_instances(
     std::vector<std::unique_ptr<
         DeviceGemm<Col, Row, Row, F32, F32, F32, PassThrough, PassThrough, PassThrough>>>&
@@ -63,21 +83,41 @@ void add_device_gemm_dl_i8_i8_i8_km_kn_mn_instances(
         DeviceGemm<Col, Row, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>>>&
         instances);
 
+void add_device_gemm_dl_i8_i8_i8_km_kn_mn_irregular_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Col, Row, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
 void add_device_gemm_dl_i8_i8_i8_km_nk_mn_instances(
     std::vector<std::unique_ptr<
         DeviceGemm<Col, Col, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>>>&
         instances);
 
+void add_device_gemm_dl_i8_i8_i8_km_nk_mn_irregular_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Col, Col, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
 void add_device_gemm_dl_i8_i8_i8_mk_kn_mn_instances(
     std::vector<std::unique_ptr<
         DeviceGemm<Row, Row, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>>>&
         instances);
 
+void add_device_gemm_dl_i8_i8_i8_mk_kn_mn_irregular_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Row, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
 void add_device_gemm_dl_i8_i8_i8_mk_nk_mn_instances(
     std::vector<std::unique_ptr<
         DeviceGemm<Row, Col, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>>>&
         instances);
 
+void add_device_gemm_dl_i8_i8_i8_mk_nk_mn_irregular_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Col, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
 void add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances(
     std::vector<std::unique_ptr<
         DeviceGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
@@ -295,6 +335,7 @@ struct DeviceOperationInstanceFactory<
             {
                 add_device_gemm_xdl_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
                 add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
+                add_device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instances(op_ptrs);
                 add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
             }
             else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
@@ -302,6 +343,7 @@ struct DeviceOperationInstanceFactory<
             {
                 add_device_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
                 add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
+                add_device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instances(op_ptrs);
                 add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
                 add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
             }
@@ -310,6 +352,7 @@ struct DeviceOperationInstanceFactory<
             {
                 add_device_gemm_xdl_f16_f16_f16_km_kn_mn_instances(op_ptrs);
                 add_device_gemm_dl_f16_f16_f16_km_kn_mn_instances(op_ptrs);
+                add_device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instances(op_ptrs);
                 add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances(op_ptrs);
             }
             else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
@@ -317,6 +360,7 @@ struct DeviceOperationInstanceFactory<
             {
                 add_device_gemm_xdl_f16_f16_f16_km_nk_mn_instances(op_ptrs);
                 add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances(op_ptrs);
+                add_device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instances(op_ptrs);
                 add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances(op_ptrs);
             }
         }
@@ -352,24 +396,28 @@ struct DeviceOperationInstanceFactory<
             {
                 add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances(op_ptrs);
                 add_device_gemm_dl_i8_i8_i8_mk_kn_mn_instances(op_ptrs);
+                add_device_gemm_dl_i8_i8_i8_mk_kn_mn_irregular_instances(op_ptrs);
             }
             else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
                               is_same_v<CLayout, Row>)
             {
                 add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances(op_ptrs);
                 add_device_gemm_dl_i8_i8_i8_mk_nk_mn_instances(op_ptrs);
+                add_device_gemm_dl_i8_i8_i8_mk_nk_mn_irregular_instances(op_ptrs);
             }
             else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
                               is_same_v<CLayout, Row>)
             {
                 add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances(op_ptrs);
                 add_device_gemm_dl_i8_i8_i8_km_kn_mn_instances(op_ptrs);
+                add_device_gemm_dl_i8_i8_i8_km_kn_mn_irregular_instances(op_ptrs);
             }
             else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
                               is_same_v<CLayout, Row>)
             {
                 add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances(op_ptrs);
                 add_device_gemm_dl_i8_i8_i8_km_nk_mn_instances(op_ptrs);
+                add_device_gemm_dl_i8_i8_i8_km_nk_mn_irregular_instances(op_ptrs);
             }
         }
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp
index 2d578cca4fb37cbcce9fe5fad6c79446379d40e1..99b2ad13152f0baa82872add5efdeb9b6a1988a8 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_fastgelu.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_fastgelu.hpp
index 554437f4903bcae96f68f2a53097a6eaf88367f3..fd3550c2f01b733602e6ae84dcca34f74255c9fe 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_fastgelu.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_fastgelu.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_multiply.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_multiply.hpp
index c07ca3134bb889e5d15fc5df521979d5ea2efe46..481915d00b7416a444eabdf92289d275a9f60ef9 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_multiply.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_multiply.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm.hpp
index 7beae83cdc1799814a2e7f4a1a4e7fa9e2d59a08..de21f325a6711d1fe746905dfb7686ddb4ff7e7c 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_bilinear.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_bilinear.hpp
index 8f4cd4d9685ceedab7c4a354621e26bb7ed39379..f80efd5511a379a300078bd8d2cb63bcc0ce0154 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_bilinear.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_bilinear.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_fastgelu.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_fastgelu.hpp
index fbc5df98a4ec116976d491835e8a8411b609c992..09b1c2190facdd84e09e5e1d66f32b4212ef0ec8 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_fastgelu.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_fastgelu.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp
index e88844694831cc0bc97b76ec7651cea998385947..534875151105ee04a147584e61623320ee18ce57 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp
index 81b2b4fcf37cb1f0d05415eb7d56587f0a29f380..fadfd199517dc0ac866d63aa6582f7d7f547a8bf 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp
index ef6920e52a2096a3f61e40435c6fa3ce64a17323..377ce083c74b2419a34b1c966c951d659bdaeedd 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
index fc4beb0ae9247d9ee9f573d345778d650fc55f69..a82ec543c64fd16b360b73d39647b9c9cec701c2 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp
index e97484a5add5a12c4c86be11782ed106f9f3ed61..b482e97ee0386ba8d7940481f93512e797aa2f4f 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/normalization.hpp b/library/include/ck/library/tensor_operation_instance/gpu/normalization.hpp
index 199ed73b4c233fd83c747f2abec60b977b31c1ae..778f625d84cbba947bd4d81ecc59bb94ebfe195b 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/normalization.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/normalization.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/normalization_swish.hpp b/library/include/ck/library/tensor_operation_instance/gpu/normalization_swish.hpp
index 367180dea496b5f224aad5276c0b44cc6b79fbb7..23917752997b42e7c67e667247e38a1762a50a8d 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/normalization_swish.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/normalization_swish.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/pool2d_fwd.hpp b/library/include/ck/library/tensor_operation_instance/gpu/pool2d_fwd.hpp
index 44d89cf36182ebac282e9b3a9e18cc2fd83f63ee..ccb5cb5a9eb9e74932789cfa25f299707746bce6 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/pool2d_fwd.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/pool2d_fwd.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp b/library/include/ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp
index 88523c70399854b2fb575e8476cba795b4c300d6..3a006b00a5b04c2689d2a262aecee5623f02d195 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/quantization/gemm_quantization.hpp b/library/include/ck/library/tensor_operation_instance/gpu/quantization/gemm_quantization.hpp
index 2fd7ce22f7010f9e7b738395b5f670ac14aa9924..2ed4b0d5f880d5e8f0e8ba26e7e2ba541201c63f 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/quantization/gemm_quantization.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/quantization/gemm_quantization.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp
index daec48050ca21c434d04540757cf47ee7523ff74..8a96f6707e35344762003a958bc7afb9b937c86f 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp
index b7d81021e2a8206de4b59bd9521cddba3bb49e98..e17bea5fdf05ba19bb9e36de450865b67c406ff7 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perchannel_quantization.hpp b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perchannel_quantization.hpp
index 2d54879ea6fdd54ef798e5212917b971a23746a1..9236b5c79e3079b0ee1f7c68091f15866d692a3a 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perchannel_quantization.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perchannel_quantization.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perlayer_quantization.hpp b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perlayer_quantization.hpp
index f278cfa224bf230bbaacdb7bd9eb78dba56cbc45..7c1eb4e429c727099950cb8d722051431c4064bb 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perlayer_quantization.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perlayer_quantization.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp
index 550a7b034509af43412c0faf5ac2fb52fb8b2eb3..9930b1a6fa04a5225432a6f8b0499293189b66be 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp
index 2cdbfbb0c2ee6da3ba452fe43998537dd64f4bfc..c9c1475f182578f83288a3b0c1e9db942af494a7 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.hpp
index 4e3fa81f75f41c842688f51310c6c00e50d03082..4dd5569cee34aef47aca01c967ba80aca4b9ec7c 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.hpp
index 7ca8bc258ae3d163586ca60e85f467d974e3f901..d52310a3f3b1b8aecfcb0bfc97f9641943cd9d55 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.hpp
index 37398146b873403cf68999cf4c6bae3191430a06..025500764ea6730b784cbafec1083cbc2cc0af51 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.hpp
index 5eacd358c87ae5d0a82e9c06ca82c74492c744f6..2314e94980fb927b034249ecbafa7877056e6cae 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.hpp
index 94ae02bf3dccaa6df9704cc3f114e2a41abda3c8..5c2bff16cccf6377d347992ab1224bcf27ecff0a 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.hpp
index e41e8de6a50a5d0b37177149529edd9d289e6ff4..a1279eeccbf0e38f3862b54aad87204ec44bf2da 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.hpp
index 99762aa64b320e3f6d80e6a380f8b7bff4221e41..aff6a2542bf7a195b090c3a3ee8d1ff4c42f2776 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.hpp
index 1fc557a95dbd9b12081583f8ca42c35fc415a20c..be8da2243fae00df87d2d46d34f1d709dfa32e46 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.hpp
index ca3ba4eb0beb4be251a358de6bbd8f22b1341859..652984ae22171b9e0b1941d8379df994cbc30a9c 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.hpp
index 28a85782d13971cf7b3b49210e291ea569ab27e9..be60d1b32db50457684a8059f25cc29dc34f6467 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.hpp
index ba74400793fdded7b24f82beb49810f5ac0d54b1..27e3aa53e38911afaf2d47c6900a1fb09ab4ec89 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.hpp
index f5c813de78156975f2bfe4be4312bf05041620dd..f7f4870a97e98b473773ec70e167bc4818ab469a 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.hpp
index e25b6e84938e8ebeac212e95a78211b6801db7f0..790b5a92ad7532a26d4663b37189e6d9346fea27 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.hpp
index a264d11262508751a4121f257b0837b1af7ab4a1..ec3bc852e8e97e517aa6f18be36b3a40646fa542 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.hpp
index 8b1d8c95baba5f56139ac26b0539d977f3a5e3bc..8c0c065670273a5a7ade3ef4f234236589f8ae3e 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.hpp
index 49a60d88c3fbe4bd5eb6fcadcd8120c722ab3383..8631495df12d6cf9a9154b2a976c80a719352d48 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.hpp
index 04a7c2d238f9cb97add1b6b615d7dda83f84c933..59849c2d459b6fd8fbd81adad00fac4493e19686 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.hpp
index d0feefb50d02d0e40e022207fb5200ba28dc8b43..33cbea85efefe3d7509888dd3c15d07dca28dc5c 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.hpp
index 35f35f202c3055a49652c9aa8542fac504d024da..386182957514384e6c68fbacc961ade931817392 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.hpp
index 63eb7221b5025f978c40383486a66bf45ad2caf4..f7c05b20a7ce8ff8acf13de3b33b2f52033ce2cb 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.hpp
index 1bca3c1f432b4b403c6e71a8831e9c03acc83368..2dfecc6b07b2819efd80135a0269d820bb2abaa4 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.hpp
index 1791a186f5a3fa41481f2674f1e47a0f07fdb5ab..a687938965e34a45c05d356e94983494fb8b6570 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.hpp
index 3f56c057ef636a67a66f7757c4f9dae4bbe03a43..cf9c268a6476be9b62e76742a7d35293dffcbf0d 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.hpp
index a3b8bcf9a0a12d1bd10dc9304c045af3ea66d481..aeb578f948cf2e3d809e44d2975baceb7d20112f 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.hpp
index 18e0e084d768f2f1a2a11b5f384128910021d5c6..73480262c66e99af5afd1d360189585f7fff7410 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.hpp
index 4a106463a31eb2a259b02161f29745bac9490d99..74293553e1a168b0a75592bc4f77d9325cfafe60 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.hpp
index 23e1c49fe927cee5384b0a37ad3dc26206a5c694..8a91f76b34faf0b52378ebdb3462d85c20631a12 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.hpp
index 62e2d24f0248319527e8dd0d905098252aa7f1f4..0ff2c30f346047d092d0ecd9b2e952fd99fe78d6 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.hpp
index 18a54d86862de4837c843301e53d6ff90d3ba419..932299008eba691e5b2f704ca47a00b9159ab5ae 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.hpp
index 9f408906a7c01d4e1d0eb7eb9eb12c7896b9cafe..c902e80a2c731f76f4608d4501ad711a1f1a68be 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.hpp
index c40052562f8deba18c7ef5170e2a1f3236cf217d..e7a1369277cddb95cd9da2d06a2b16fdad643423 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.hpp
index 532bfb417e5c9ae706b8d227bccaf688f50fdcf9..2d1ab69fc75f74ae9128824f842b1e7f89501db3 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp
index 8c08e5ef2f0365e36125d141f95474573dce5bf6..9f782c11b6dd306c14e1027ff083baa17821b5f5 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp
index 0d08377a226ab06f974e74d63fb9761e579918d2..bb45295ed82e8ed5d6602e3642002f6783bb31fd 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.hpp
index 4cdd45e85b3781f08ea257b73d5780fbd1aa406a..5814b84bb7b295e261b86817b4698ab01284ad41 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.hpp
index a36cafb27a02bce42b08bf70c512b2cbbef3e2b9..f1a609a6f483d979c394ce5b3bd74b19eb14a803 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.hpp
index 13b0780497fafa06281a5dd4ae196c6695844ac2..489bef1f561102eb05e9bfc58da3cf45e6bcf54a 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.hpp
index 75e1f102421f5cf916aeeea39a7bf2173341aec8..2507d82308da420c51d3c4b1e2a6071aedf186a0 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.hpp
index 00ec17fadad0527bcf0713350c189f5aef228728..df4b7b7b493516a1e01fdc2be435f255bf5cd024 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.hpp
index 7b762bc93216ba9cb525849ddd5c618fd097bfb9..2748b9b9d0434949c50c1c0e4f2e76f2b25bb8af 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.hpp
index 2a2b284b226a37d07c38db64b3ab2686bf2c30ce..26230d5456c533175b62e54ead132486c80b494b 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.hpp
index 444d8ddc86b65cce2961874ee7c64b69a9573d2f..d5e1499aefc0ada2270f6efdda5776613f9a9c5f 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.hpp
index f3c07017616adaa8bbd529569d2d951b1a9f2a4e..cbc8befb3d39c36a199159b9a1d3759e17f2ce4a 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.hpp
index c57edd0846644d1d43c2d6578fe57b3b7795f2ff..574cc3dd3e015b4194cb0f584f61dbc8afb6e4e5 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp
index 325ed1e6d400869b29b283a2af2fb072e53487cc..04bab9d283fa83f57eeae64831966fe115dde9c2 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.hpp
index 8960ba7c5b66b831b555584fe2bf4af4ee11c024..6c50d222ecaf19d1026f7cab1478b413de77af10 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.hpp
index 95d9c072657cf08dbeda045ba4904bc4b1650883..5e91f53b0f99864a874fd81f9889cf721780c980 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.hpp
index dd6734061eaadbaaff1f8134e96acddf29f22d41..171803156051497dce6d05bea27663fdc831dbbb 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.hpp
index 85f75110dfb4ce248cce75434f425bdf7c0c3e39..03bff5912dac549457c5c51b1e5b501b9ef18f50 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.hpp
index 7f62f4e01064028156e6d00dbe70ce13102c1d11..676af19336c5ca205ccabf31028bd57dfe577d49 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.hpp
index eee771b13373499d313d4c072608a86d966ebdc4..c67c0a41f85154a9ef5c230958e0e88893d31915 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.hpp
index 64f1e9c22be3cc0a8b8f5cf2f40e795a112243be..c9058a9b1d33a2f11f0da600edddb436a2941305 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.hpp
index 078561e1539bdc5113826c6220807c858186c8c2..1fbd3c6e7de827c71e01a601b7edf724cef9a6f2 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.hpp
index 5a9144186bb53fa8a182b5034e0306be2edc7bbc..89496a4ed16926eec96ae1b5ea560b3b1fd90637 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.hpp
index dc4740aa3de33e40f3c2bd8506b7d27ff92669fa..fb79ce8adf39c6fd732d59c4a19f82d8d253cf07 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.hpp
index 9ecc96797ffe267ac5d5842450ecc604b65e14b0..7915bd2630a25aab209cb42ba69087b54a1a0d76 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.hpp
index ccce78e2f1541fd948c6ac6866b747ce6d08e928..d769fe8154918b80a2fcbadea95ffe31c57cb128 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.hpp
index 6d3749d868a8fde91ee7c96c57fbf7b6f450eb6b..49938d4434ca072b789aaeac75a8903d9afed112 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.hpp
index 7594dde74dba43a2e13cb8317754563bb04efdeb..810249994f00d95cb00352eb28ba464f6ecd7856 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.hpp
index 3272e7f9af35f50dd5cae8eb200b6f70f5251678..6640ffec700bba666962436a63b62ab828cfc719 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.hpp
index 519ec8271dd82b83597ddff695ea00588dfbd0b7..441bea275c164c60ff9e73af2a30c3a021be0090 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.hpp
index 77b2fb9306bebc9cd061ec94e3f074aa5aaeb897..05912e1e960f33cef0d82f56798326e3dc65ea2b 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.hpp
index 5abb5c5eecad17c988c5b5d21189c29dfd50c0ee..b0e16cfeaaef2866134c1dfc5d2a22908404a499 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.hpp
index 23bd988b8a5d63d5aa284514459a206c7a38bf4e..0ee0f7d0fe156e6fb9a618e0bc66d2a9f4e2e7c4 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.hpp
index 7ce5577d7faf64a44569e9e97d3c7143378ab852..feb9d99eba11da1c6b2cddadffd80d3fdb4cb7c2 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.hpp
index 7e4c5b77f0855ec5fcc7e745b5ad72071b8607d5..a9f0d77cb885b3be117c0621357d12ecac0c599b 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.hpp
index 5eca5fea7f83e7a9616489af2ee3a3b5fd845b45..bd1f72250a36137736e7bdd3dc5a11d842103539 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.hpp
index b0e98411bf5ca8367283624ebe068785668ac227..60189fd3047b77a3c4ef4f34be9a75ab69dc73ac 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.hpp
index 84609a995dd9f00c8686322285842b030298437f..b5aa9ce613228d41a89c2787b8bf275f429fcba0 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.hpp
index 2f816bb11dcb7cd3d97311ccce038531271db282..0e20a17ff98e593f93c91861b47449ad6007a6aa 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.hpp
index 9cecd4a5b47afe8d0595a6aaec13b39042ed4c32..c8c1cd7e802022268d323582b7955a366ee743ef 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.hpp
index 42e9b7fc792b70ce5ee85e22cce577f7a02d2ac6..3e71b44673994b1a0fd460191cc29c36b7ac6b97 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.hpp
index 494f1c3d71809f4a1302a1933e26cf89a754ad35..c8286e56d19377ca1c4a9b4fa9a714687a0b5fcb 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.hpp
index a80abb9247d595fda58e5d1c2a461bc009831b79..36fbc52f113c43a71d6cc0c647b01a16f2a16c4f 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.hpp
index 53fd286383bf91fdaae001d34be26e3bafee8d58..0900cffa353f778e8ce544ee7efbbc1ec9e8fd0a 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.hpp
index df5a4db484953c0c7bb00d6078dc701b681d3950..726b56066b000521a697989b4f5eb807a998aeba 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.hpp
index ed78acd926882844a94e13ce6bf0d6431380efa5..0ae9838607d53897431229488e177229d88561b8 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/reduce.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/reduce.hpp
index 0038fc26da53d55648a4374af3a02cde6d86184e..a3a39b7ccc7216b371a540cb8eaebee075cbe4d6 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/reduce.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/reduce.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp
index 36eb092f0f0707b266ef9a1d7c3b5248947c1e87..c5c2d2cdd7cf9e2b26a9fdb130a501aea8cdc15d 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp
index 83f52fc3ee7a9547161ce1284c2844976819c0ec..7c6f189cb99c7966d33281f1f058a336a73c00ff 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp
index 046ff5780556331a6afc697f87dbbf02dcf1ff25..33d5cc68390d760e9020021c47f92203665958a5 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp
index 8e6a226f6a13fb1dcd7d61b56b476f38072b6a98..7668248c3fd6bbdd7df455b76183ebd24713ae3e 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp
index 518fa5f98679099189f15dfe5a57e3ac0324e3fa..20eb7bbc9476e1f102b5a0841a11ceadd9939316 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp
index 10016cdd707cbcb46282e540e15e6b79a67c6a78..e8356a929259ab41aea7413428ff771f1838531b 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp
index cdd5a3cd7b6fb6806db3de0a10b7ed87b29c5c2d..b3f7d4890b9ae7fb1ce3e6dd3d651903e6219998 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp
index a8be272e02014495cb42d0509b50075f2d7da707..4190f50a35156855dbf2311eb1e5fe88524bca22 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp
index ec8296ff22fb6591152a3dcfad63c250c70cabb8..b7f3344905f4e6a2ea068d2ba4a8e5f3d1a64508 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp
index b3877c4bb3f6217427f69c07f7073751b254c938..53c142f6120ba49d53eca5dd1970885ee3ed2200 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <tuple>
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp
index a6d9a359f4622ec9534ab090d46e363bd9101177..41c67af7ade0f35c4a5d6f017fd8cef13529e737 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp
index 6621a2c867ac90402a853e5c2065663a96220927..2d791ff979072fe966fc6548427c9f13ec5dd001 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp
index 3dfac98ed8beeee6060b6ba3f6b9b1ff19f660fe..eb9cc1ee2bdcc03ecebe36530b486569acee6d58 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp
index 6d2a0c932500e874641d8cef9fd65b4d280c89b6..68af443a54a7be417f5678c70541f77a6ef54141 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp
index 97dd3dcb18aed5c228313a961994096445948bbf..3bf8704b45b8611d316c175676b5535133463eeb 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp
index 58f8760acccd1b571df40ea9a08946c1f5f9764f..43e54aaca7e759ad8d8608ff62e3c0d31de63746 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp
index df8d31f0da76b0f027b4ee4818006dd331e38454..32c4cd74b561993ae3e6f1ddb8f9e7ec6b1f308c 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp
index 1bd773227e172278a466d7650e90fd39ec3e56f8..f8f5caddb85e164b7a3b65978ec79b1425b9554c 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp
index 16f129d2d07c19a926c75f1e878e8db3fb9bf821..a034e41a072b52e2d15233b751db7546622d5f0d 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <tuple>
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp
index f80f712ff5e501d6516d9c7210a59276a96b6541..3cd3742093f052e5a0da94c9fcf56b57c956d45e 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp
index 6f9952e7d58a448cfea0e3c6ba84af24e9de4e52..f7d4dd045a166575226c3b295090dcad9c165c42 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp
index 2cbd13a1ba5226316363cfb07dfd23adfac4aa9f..c49dd4d8581d33402722966806e187c0cc14d3a3 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp
index 7b12522a85987f85e83a6f8afb34b97844507c49..4074ee3b1f42883326897590447e148421f637b5 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp
index 54d477f80c5b222cd5a76430dae3bbd4db81068c..479fcc92fe1e8a60833ab5d9a8b1d069e9aa6135 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp
index 4ffc44e3a9217e689297bceb81bd6a38c41cd1c3..0dd644fab78185413758d5a47c63df8b68f4851d 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp
index 08cbb81272f9ff5e7ce9a9fdd9075d81f1ed7211..50f39396abd7de0335d44acaf814c0e7f9c6e900 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp
index 187d034b95ac506426da4fa1b8240b61e2517e46..defa2dbda61a1e53b8a7231b8ebf7546910f23c0 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp
index 7fc9ed69198f1e7bf447a6baa9dedb3286440a94..6ff07de2360fcd762de3ef8c9b9956953d04fe7d 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <tuple>
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp
index 03be6e2bc7c15ee350569f7ed97ebf2d3ad3b515..206980cf1046e82d3bd45685fbafa41f6e67d072 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/utility/algorithm.hpp b/library/include/ck/library/utility/algorithm.hpp
index 86f04dd362397e774c9821a16a7c412dea000937..57136f8a2a19fbdaa2e0276699c786c33cc61473 100644
--- a/library/include/ck/library/utility/algorithm.hpp
+++ b/library/include/ck/library/utility/algorithm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/utility/check_err.hpp b/library/include/ck/library/utility/check_err.hpp
index a89d03d324f33f0be95f88c328a9d7aee38e7460..7f63a81a020617a00bd91ab529add56dd2fc55a9 100644
--- a/library/include/ck/library/utility/check_err.hpp
+++ b/library/include/ck/library/utility/check_err.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/utility/conv_common.hpp b/library/include/ck/library/utility/conv_common.hpp
index 6fad9f7d77d349853e63512ae486a57e7ceb7b6f..085454f42dee006cf7c48ebd18804df92fa36a37 100644
--- a/library/include/ck/library/utility/conv_common.hpp
+++ b/library/include/ck/library/utility/conv_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/utility/convolution_host_tensor_descriptor_helper.hpp b/library/include/ck/library/utility/convolution_host_tensor_descriptor_helper.hpp
index 2b4f63b28b8d2f1e59cef67b99a577314faf4ae2..ff697fb71c0340df3f3aac6953ce76cfdef4b278 100644
--- a/library/include/ck/library/utility/convolution_host_tensor_descriptor_helper.hpp
+++ b/library/include/ck/library/utility/convolution_host_tensor_descriptor_helper.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/utility/convolution_parameter.hpp b/library/include/ck/library/utility/convolution_parameter.hpp
index f4a2b56f75a6b40a476080e74d386fab524286a4..df6efca10808beb96535ef69a3ba6116a449b9e3 100644
--- a/library/include/ck/library/utility/convolution_parameter.hpp
+++ b/library/include/ck/library/utility/convolution_parameter.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/utility/device_memory.hpp b/library/include/ck/library/utility/device_memory.hpp
index 87940e1671ab71b0e418c82fbec1f95d6627a8b5..1c16ff59165af029e2f81d8364daf07f844daaac 100644
--- a/library/include/ck/library/utility/device_memory.hpp
+++ b/library/include/ck/library/utility/device_memory.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/utility/fill.hpp b/library/include/ck/library/utility/fill.hpp
index c0bc3727641041bbd1d9fb0c5e7456c5042b6448..c01e139ea0b8f160f790bde4270640c261affa6e 100644
--- a/library/include/ck/library/utility/fill.hpp
+++ b/library/include/ck/library/utility/fill.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/utility/host_common_util.hpp b/library/include/ck/library/utility/host_common_util.hpp
index 6f4466e8da0e2b77c1ad981cfb6f730a74a92d71..20a8f234dbbf9f30582ed371c5f1a90cce2b3196 100644
--- a/library/include/ck/library/utility/host_common_util.hpp
+++ b/library/include/ck/library/utility/host_common_util.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/utility/host_gemm.hpp b/library/include/ck/library/utility/host_gemm.hpp
index 44036d0234375824ed3aa773ffe71de02b84c570..5eb7e3b8c9745c48390482622d361a2ddd491159 100644
--- a/library/include/ck/library/utility/host_gemm.hpp
+++ b/library/include/ck/library/utility/host_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/utility/host_tensor.hpp b/library/include/ck/library/utility/host_tensor.hpp
index 844c29ed14ccbc09df5f5f71e9ef2e99f6e8377b..91293d29f7f11e2ffd53389aa3ea40739761cdef 100644
--- a/library/include/ck/library/utility/host_tensor.hpp
+++ b/library/include/ck/library/utility/host_tensor.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/utility/host_tensor_generator.hpp b/library/include/ck/library/utility/host_tensor_generator.hpp
index 4259862e65e331b30377997c512459e2a21e9383..31ff13aec254a166c6439fdef3fd17907017e147 100644
--- a/library/include/ck/library/utility/host_tensor_generator.hpp
+++ b/library/include/ck/library/utility/host_tensor_generator.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/utility/iterator.hpp b/library/include/ck/library/utility/iterator.hpp
index 9fdc88ea76825e2a1cdf659f95fa2928038867aa..b44e2d8e3c4a6f8b4423b03699539ade2fbfb0d7 100644
--- a/library/include/ck/library/utility/iterator.hpp
+++ b/library/include/ck/library/utility/iterator.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/utility/literals.hpp b/library/include/ck/library/utility/literals.hpp
index a73a2ea054150665557b8e1da473f2cb9877526b..a8bd6303f12f0a36bafb36c1d7f1453eab13392b 100644
--- a/library/include/ck/library/utility/literals.hpp
+++ b/library/include/ck/library/utility/literals.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/utility/numeric.hpp b/library/include/ck/library/utility/numeric.hpp
index 70a7e87ab1c8d94569e705b3f6832c78bac05853..9ee118d4757109416bac18cb54229e93de59b75c 100644
--- a/library/include/ck/library/utility/numeric.hpp
+++ b/library/include/ck/library/utility/numeric.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/include/ck/library/utility/ranges.hpp b/library/include/ck/library/utility/ranges.hpp
index 55c322f1ace041b9c4f346a5df4250c2323759dd..f11e4204a0cec5378328f69d3bb553a6227c47a3 100644
--- a/library/include/ck/library/utility/ranges.hpp
+++ b/library/include/ck/library/utility/ranges.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instance.cpp
index cc8787458d7ffff072cf8c055c8def43ca487e51..e730e9f589faf8047dd60a218ef191eea510cb63 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instance.cpp
index 04200cfb52e26f227d89852e2845547e6624c22d..f6696ffa9f6ca2c0665ecde3f0394c0802588a46 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instance.cpp
index 7b86f3cc728fdac1117cbe927445b6830e81e9ab..32d6f258b5c9152205f8b13e0326739b567b0845 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instance.cpp
index 2afb1afbc3d2355f682af9cad21b62c0ab1ee8ea..ee246ba56edd7271d7d2f980c62003a7ab7c7f06 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp
index 68d768949c859f5ca8aedfebfa888253dc6c276c..5a9483b3099d9985f88f0121135a79d55edefd5a 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp
index 737e5bfca3bccbafcd8179d80bb43b58bd545639..0fa0719237b000b7ab9b13d94cfa951edacf3139 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp
index e09d01736083f54cf2c1b848814b2d7fb683d3b8..2f42f62b0b14b91bf94208b27684ba4b0b10c5d2 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp
index 984d66e2884273e006772530e6a978891766a8e0..10b4cea7d7cddd4e694c0de42d122bb7ff3bd505 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instance.cpp
index 12cada9c44d61d8b81ad2dfa4877dc57b26ed681..c687eb20f06918b41ca54fa72a2a577b809eb2cf 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instance.cpp
index 13f198862e5583b1f63c51efaab11fa733976cfe..b19374ca6d5635238da838808d863b19fd1b6b93 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instance.cpp
index 2ca1adc2f6d160a97a2f40b4075f07b185e68351..bbd318ba9299005910d63a83d6906f4b69571479 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instance.cpp
index fe5de52796f6c9a0c94a84d883a7b1a714c4a652..187ccb5ff62fab57a83139f340a4032fd03e8dea 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instance.cpp
index 5b55c8e15e01d879cc11e57128c3e2dfdf81890f..ec2b2646ffa1a78b058a90b0508eba7f1d65a2be 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instance.cpp
index 9517e4577e5b9af188aaf07413e8c0b3cf6c32ee..d76cd350c81807f330f7b04ae2ed2d1bf6bb5080 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instance.cpp
index 43b91244060ed90c9f2558dc99279ce0a086845b..ef65106c2cf537ae29dcc7313829ecaf3ae0ed58 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instance.cpp
index 326500fcbf013e8540fff9252202ac7af33b2938..078b241f9dfe194778806824911d791c7dd7a2de 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
index e1bfa88f49d4fb886f953661c03b8638516ba646..4db05589b153b2307d377e67147b5ee1bf8336bc 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp
index f59b742534b2c234b2b4908f5e540f591b104619..e25f903a8272ceb6bd559b8edfdb72f7ac759716 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_bias_permute/device_batched_gemm_bias_permute_m2_n3_k1_xdl_c_shuffle_f16_f16_f16_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_bias_permute/device_batched_gemm_bias_permute_m2_n3_k1_xdl_c_shuffle_f16_f16_f16_f16_instance.cpp
index 04a748f45513c56ce23781dd08bcf4c2bae03331..a0afaabbc7bbff785c7a3e6cba884a0bb1f768e4 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_bias_permute/device_batched_gemm_bias_permute_m2_n3_k1_xdl_c_shuffle_f16_f16_f16_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_bias_permute/device_batched_gemm_bias_permute_m2_n3_k1_xdl_c_shuffle_f16_f16_f16_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
index 9b96194c874c3daea7c9c4eb931368ce5645b50e..67dfc4cd35066053c7b9936a76c402454cf1e99b 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp
index 0713dfcd99ca183d8f64cd23ac6067be990b39e7..9001c901cfbecb473c6a0e4f70a9db6526e2a4cb 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instance.cpp
index 521c3d9219b1cd23e30ca194d3e22d1f38fa9316..cb89d3cefd6525d017d7f7b4685be456cb1c7e44 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp
index 231d612d781bc93c2a232f7e1478fddfb09c1b5d..91eefba0c10af87e7674cb053ee549a2799de7a9 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instance.cpp
index 165bc3957d36479b1cb3052c0085ade58fb71322..c20798f557e58e870f748d7900f1418f3a1a5779 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instance.cpp
index 832fc3b066066f9ac8faae30cebff714b73b53c9..3d9ad64b9b40a2271ed40608735bffb0133153f9 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm/device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm/device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
index 99e8712474996a6c30e8bf16ae69f5bc15925364..cf23d01bf234e2cdda3efb2b655879a4740f5c27 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm/device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm/device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp
index f73e3dea84ebe2f4e70af03df967bcd3b2bd7450..a541a2d224c316de1197d87c9e94feeda9283a03 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
index fd094719584147781d4246a508ada7104cdf12e5..5e481603ca432e0ebd4e1ecd69b8122c8b160b5b 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp
index 53ad7ba5ffa9272da26189a45a75a9453af68f48..501ea85f60598393a8a4d212a3dddeade8074688 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
index 291a127a6603bd5fe1256076355df76d133978e9..91ab541bf9cb354a5f196288fed33496aacc48a0 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_bf16_instance.cpp
index b62c8b99cbdb6cf42922e843c11a0832614b6ea7..bc95d2f1b1f1ad3734daaa19ed5dbb1faff5a6a4 100644
--- a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f16_instance.cpp
index d05b8b592c29aceef387a4fdec8c02854501995f..fbc8d0bc6097a477fe9920201167a58142aa9b44 100644
--- a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f32_instance.cpp
index e3ef95d12e17acfebc029ac6e1a878194691739f..bed38658a99c109370e6271dc94f785d94268106 100644
--- a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f64_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f64_instance.cpp
index 41be396c24a28b5a7fbc47b10d7cf6ae5e6e07e9..fc5ec77e428dfbfb09a902245f27ca25b8bc0d35 100644
--- a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f64_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f64_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_bf16_instance.cpp
index cd1e05b1133ad2a2515a936ef968667a03bc11be..4e38ee13b24dd150f1366ed7694e69c6969a2e2c 100644
--- a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f16_instance.cpp
index 073dd583f97e0eed167e4bc4ef952530e82854a2..f087eb79824e3e36c6bc07a3d42b68f38a89c16b 100644
--- a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f32_instance.cpp
index be63bd44c66c3204c66990ff6048bd895841c84f..d0f361401a500c16f1228d4df1d3de6714d1f43a 100644
--- a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f64_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f64_instance.cpp
index fe87091e8d66e32af0832c1b013abce0911673b2..710d07b8280cef467d2fac8a70034129feb13fda 100644
--- a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f64_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f64_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_bf16_instance.cpp
index 2e695afa9786d2e4fdcc6d9bf05c600c1a333181..8801c309f9138ccf321ba1dcac5f1718287e0a76 100644
--- a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
 #include "ck/utility/tuple.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f16_instance.cpp
index 9ec761e445ac30cb482c35be42846c3e46a13ea7..b674cfc423d81b5efce31169e1c81741ba156b9a 100644
--- a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
 #include "ck/utility/tuple.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f32_instance.cpp
index f0d26c36bedfa037a439e6226e8b1db4051d0fc3..05e3650887c3dfa84d1f03a8380d1144c7886acf 100644
--- a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
 #include "ck/utility/tuple.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f64_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f64_instance.cpp
index 9e4066bb060e0a58342956860356ee589bd5e9c0..15a02af021a22b4e9e43186a7b3e18b938534dd8 100644
--- a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f64_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f64_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
 #include "ck/utility/tuple.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp
index ebbff88346b9dcb1eb4716b1b36f456b806ef1ca..5587db77e08ceb40242084853ed07236cc992605 100644
--- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp
index 980383f3e71b3f1058c295c9aba55c56f596134a..26262855ea8b6e3ba2c929e527e97dc5108495dd 100644
--- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp
index 2d4b6e3489a9ccce619b0a76dc097f04d72b0d57..befc0dcd10d07735ec046e8ef4d64bb54be4756c 100644
--- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp
index 7caa469f54b0169449250f000df038820e35a32f..e45b47cf94ed53de17b862e50ed2ef7a00e3e978 100644
--- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp
index 093b2f0e98bb5e12d867bcdd58c4817e2cb33111..f437a227d50b665b999e30b2d065ac6825e7ec04 100644
--- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp
index 0f683e5c280306b8c14b461a4d46183055094afa..13fdbeb35c53a93947f739dd26f91885cff4a72b 100644
--- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp
index e384993aed76e58f11238cd2d5bad3bb4515e457..95ef8c4929b00263c65034ac1ec48195efdc8fff 100644
--- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp
index 92e39c173f55cb30d730415d44eda4eb4c1c3b32..290f81d7c9a926dd4e9c169e3515ebdbaa547788 100644
--- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp
index 5118d0d033ee733c9b3bd7bc3524824c77aae667..16fd1cb407bf81741845c41baa580f208e98508d 100644
--- a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp
index 655d4f0061ae1c417e610e9059f7e2bbd2048363..ff37bf7cceb40f75c8a54309341914539057d678 100644
--- a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp
index a9d20be18bf5c52044b12f84c58f7824f8ef8142..8a1f6f93341edbb585157cd66308ab8f781bfedc 100644
--- a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp
index a68f5c9718a1e547c62f16888a4fd0a0ed1d8197..d333f597268d55a195da671fcbcdaab2c03acfc0 100644
--- a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp
index 0aa927155a412fede87f08fde619884f3aaa3405..4c87b51a9e426945f049d471bf3591d1f222b72b 100644
--- a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp
index b84ea274c5022c173d3a396661bf78396b07619f..fd3f57c6b6058f16e9e34158fefc71e3fb9c5f3c 100644
--- a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp
index 578469997acf056af0ade6a6d92872aad9d81b3c..1e53f0b2f51f23143b1d49d1b7d9b2d3e2b7b765 100644
--- a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp
index 8e5a19313ee4262668dc4481139b6284ca898538..d02d146a9bf98b25fe7de56148c8879dd62a328b 100644
--- a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
diff --git a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp
index 5a5c8384227150dd8717433dce6c9ea2395896bd..e3e90c966dd6efd920ea0147d434f926bd07bd05 100644
--- a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp
index e0f3d6199f8939bb0f1ce7f8efc48f2a92cfd3c6..81e9122d9506eb3ae78602c7beedbd2ce1b0b745 100644
--- a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp
index 30537d9373b8175207b267666574033cae4e1a3c..dbc82168f49ad102886ce8aebaea72cd3ef5a99a 100644
--- a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp
index 190c39b870b3cb33e9080c8b591884f4ec387347..3ac250f3e6a9d3fe8b3139aef5a79dc520269381 100644
--- a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instance.cpp
index e14cd558628772991da9b9d4ebb3dd5d852e3285..fcb858728e80c2cce14579f5a35409316deceeb4 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instance.cpp
index f001b83c1711a7185d71f5b2c7f795005b35b696..2baa6ae061473a44e962c597d2fabb9d62cabee4 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instance.cpp
index 83ba6a1c6bb8a3098da1b31a59a1d4a12b8b062e..28867fe19dd62f63d2e3df798a210a1045415fb8 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp
index 1da9a81d904b3e43aaf5ace4f612c43c05e66ee0..40656e382b8074580de72fed78123c07b5e1e303 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp
index 7c33df5e7687ee3e279b350bec9ca17902e84231..422e37e926fd7a839afc2cf884b141ca32737078 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp
index a5f8629f2daef05ca1dbcbec2e8c4c668bf96a84..5993f6bd7a28c2992bf2e069880e84bf75612dfd 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp
index 8076d6d35669114d84b922a59948b888dcdab506..bb9b696864c54265a8d3d119bcd55d82452a825e 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp
index 33503b9f8ae00a92c25b178629736c7e8a5474c3..da96c79a6e9a44edd05928b54a865483f6a1e1a2 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp
index c5e4bd199ea8322312af832b63a0ea559109131a..78e9c893c9e82a723d1794685c7d33d78129047d 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp
index f43d13e309333bc86abf22465b7f035c1445321b..4663a9ac32ac3b6424d638c2d6bc7248651c240a 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp
index 0ce6b04c4218190ccd3cad238f1bc90ec3002df9..0b1df52f8c1ce4503c154c4f7ce40b713455e839 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp
index 76ab3189d7f210446b305a3daec80a2902de63ff..9969a8bbc90df83a31da50f8f064cbe0a83923f7 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp
index f8c255088877e95e2cbbbacbfba6f9e1f4fc0291..e34ea06ff46ba090a212575e66dcfd454a44de66 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp
index fe7152471ed54aa5334e820f993232912013c6de..3254fcfc26467edc3ec6b14d1838e784d7438529 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp
index 04ce7c07639c968d90a8053f4b67b1335acc9b69..94b2a47e50b635e5b862c109f479660183d2bede 100644
--- a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp
index 0251d9157f3b894208b17091ab6c79eec8bf605d..4244ab7b8770c9cbd60957c4fd66f004b7f5975c 100644
--- a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp
index c2975727e48385207678abdcbc1fda776e761664..5c7db4ca3bc4dce89cd1d139722d1e91758f8d7a 100644
--- a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp
index fc86d7302450a01263dc07d8f895c669795f3644..ebc56487a159afe8b32e5e321c6996c38250c32f 100644
--- a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp b/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp
index 182037f15c66c2d196e86b84ca0f0571be2922e9..a62c9e2354281aba993918f1b8351876bce46532 100644
--- a/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/elementwise_normalization/device_elementwise_normalization_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/elementwise_normalization/device_elementwise_normalization_f16_instance.cpp
index b160d4fe1a3a271113899b2a8bc57e5535706280..3e2386ee03558ec6d346abc73a037e65e2d9cde3 100644
--- a/library/src/tensor_operation_instance/gpu/elementwise_normalization/device_elementwise_normalization_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/elementwise_normalization/device_elementwise_normalization_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
index e20d592c84ed8da549f29160c87f65579307096a..d66010af734581ae204015e7f3d663040d0b111e 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
@@ -33,11 +33,19 @@ add_instance_library(device_gemm_instance
    device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp
    device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp
    device_gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp
+   device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instance.cpp
    device_gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp
+   device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instance.cpp
    device_gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp
+   device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instance.cpp
    device_gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp
+   device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instance.cpp
    device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp
+   device_gemm_dl_i8_i8_i8_mk_kn_mn_irregular_instance.cpp
    device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp
+   device_gemm_dl_i8_i8_i8_mk_nk_mn_irregular_instance.cpp
    device_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp
+   device_gemm_dl_i8_i8_i8_km_kn_mn_irregular_instance.cpp
    device_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp
+   device_gemm_dl_i8_i8_i8_km_nk_mn_irregular_instance.cpp
 )
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp
index 5d2f18e14e37be6b1c5528e8f43f23d1b6c88996..ea99a5a30ef80bdb954df78535a6cd63b8663bbb 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
@@ -34,7 +34,39 @@ using device_gemm_dl_f16_f16_f16_km_kn_mn_instances = std::tuple<
         // #########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
         // #########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|               Order|                 |                   |
         // #########|      |      |      |        |        |        |        |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
-        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<2, 1, 4, 2>,      S<8, 1,  32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,      S<8, 1,  32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>
+        // MPerBlock=128, NPerBlock=128
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<4, 4>,       S<4, 4>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=128, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,    64,    16,  2,          4,          4,      1,       S<8, 2>,       S<4, 2>,      S<2, 1, 8, 2>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 8, 2>,        S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,    64,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 4>,      S<2, 1, 8, 2>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 8, 2>,        S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=128
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    64,   128,    16,  2,          4,          4,      1,       S<4, 2>,       S<8, 2>,      S<2, 1, 8, 2>,        S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 8, 2>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    64,   128,    16,  2,          4,          4,      1,       S<2, 4>,       S<2, 8>,      S<2, 1, 8, 2>,        S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 8, 2>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=16, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  2,          1,          4,      1,       S<4, 2>,       S<4, 2>,      S<1, 1, 4, 2>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<4, 1, 4, 2>,       S<4, 1, 16, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=16
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  2,          4,          1,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,       S<16, 1, 4, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<2, 2>,       S<2, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<4, 1, 4, 2>,        S<4, 1, 4, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<1, 4>,       S<1, 4>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<4, 1, 4, 2>,        S<4, 1, 4, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<4, 1>,       S<8, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<8, 1, 4, 2>,       S<4, 1, 16, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<8, 1, 4, 2>,       S<4, 1, 16, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<4, 1>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,       S<32, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,       S<32, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
     // clang-format on
     >;
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instance.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d444e29aa33538874b306caeab0fd78e177ce807
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instance.cpp
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+
+// Compilation parameters for a[k, m] * b[k, n] = c[m, n]
+using device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instances = std::tuple<
+    // clang-format off
+        // #########| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|       BBlockTransfer|     CThreadTransfer|  CThreadTransfer|    CThreadTransfer|
+        // #########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
+        // #########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|               Order|                 |                   |
+        // #########|      |      |      |        |        |        |        |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
+        // MPerBlock=128, NPerBlock=128
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,   128,    16,  2,          4,          4,      1,       S<4, 4>,       S<4, 4>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,   128,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  2,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    16,    64,    16,  2,          1,          4,      1,       S<2, 4>,       S<2, 4>,      S<1, 1, 4, 2>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<4, 1, 4, 2>,       S<4, 1, 16, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=16
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    16,    16,  2,          4,          1,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,       S<16, 1, 4, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,     8,    64,    32,  2,          1,          2,      1,       S<4, 1>,       S<8, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<8, 1, 4, 2>,       S<4, 1, 16, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<8, 1, 4, 2>,       S<4, 1, 16, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<4, 1>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,       S<32, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,       S<32, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
+    // clang-format on
+    >;
+
+void add_device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(instances,
+                                   device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp
index 01e3b3793a487fd5eb6b6b5a6ca6a10e86bd0588..b83acfa8cb4db572bb33ccd20e6d38e7594f0234 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
@@ -34,7 +34,39 @@ using device_gemm_dl_f16_f16_f16_km_nk_mn_instances = std::tuple<
         // #########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
         // #########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder|  Lengths_K0_N0_N1_K1|               Order|                 |                   |
         // #########|      |      |      |        |        |        |        |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
-        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<2, 1, 4, 2>,      S<8, 1,  32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>
+        // MPerBlock=128, NPerBlock=128       
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<4, 4>,       S<4, 4>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=128, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,    64,    16,  2,          4,          4,      1,       S<8, 2>,       S<4, 2>,      S<2, 1, 8, 2>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,    64,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 4>,      S<2, 1, 8, 2>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=128
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    64,   128,    16,  2,          4,          4,      1,       S<4, 2>,       S<8, 2>,      S<2, 1, 8, 2>,        S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 2, 2>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    64,   128,    16,  2,          4,          4,      1,       S<2, 4>,       S<2, 8>,      S<2, 1, 8, 2>,        S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 2, 2>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=16, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  2,          1,          4,      1,       S<4, 2>,       S<4, 2>,      S<1, 1, 4, 2>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=16
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  2,          4,          1,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<2, 2>,       S<2, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<1, 4>,       S<1, 4>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<4, 1>,       S<8, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<4, 1>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
     // clang-format on
     >;
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instance.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f8f8a0bd333748899bc52eed1f65ff6c56467a87
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instance.cpp
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+
+// Compilation parameters for a[k, m] * b[n, k] = c[m, n]
+using device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instances = std::tuple<
+    // clang-format off
+        // #########| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|       BBlockTransfer|     CThreadTransfer|  CThreadTransfer|    CThreadTransfer|
+        // #########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
+        // #########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder|  Lengths_K0_N0_N1_K1|               Order|                 |                   |
+        // #########|      |      |      |        |        |        |        |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
+        // MPerBlock=128, NPerBlock=128       
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,   128,    16,  2,          4,          4,      1,       S<4, 4>,       S<4, 4>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,   128,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=64       
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  2,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    16,    64,    16,  2,          1,          4,      1,       S<2, 4>,       S<2, 4>,      S<1, 1, 4, 2>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=16
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    16,    16,  2,          4,          1,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,     8,    64,    32,  2,          1,          2,      1,       S<4, 1>,       S<8, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<4, 1>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
+    // clang-format on
+    >;
+
+void add_device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Col, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(instances,
+                                   device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp
index 804e86a066ffa47c6d96de81cd4bef5644a5b131..d5800e03336e9161e01ef240a46aefbf9b13c2c0 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
@@ -34,7 +34,39 @@ using device_gemm_dl_f16_f16_f16_mk_kn_mn_instances = std::tuple<
         // #########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
         // #########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder|  Lengths_K0_N0_N1_K1|               Order|                 |                   |
         // #########|      |      |      |        |        |        |        |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
-        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,      S<8, 1,  32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>
+        // MPerBlock=128, NPerBlock=128
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<4, 4>,       S<4, 4>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=128, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,    64,    16,  2,          4,          4,      1,       S<8, 2>,       S<4, 2>,      S<8, 1, 2, 2>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 8, 2>,        S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,    64,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 4>,      S<8, 1, 2, 2>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 8, 2>,        S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=128
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    64,   128,    16,  2,          4,          4,      1,       S<4, 2>,       S<8, 2>,      S<8, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 8, 2>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    64,   128,    16,  2,          4,          4,      1,       S<2, 4>,       S<2, 8>,      S<8, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 8, 2>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=16, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  2,          1,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 1, 2>,      S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,        S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=16
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  2,          4,          1,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<2, 2>,       S<2, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<1, 4>,       S<1, 4>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<4, 1>,       S<8, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,        S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,        S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<4, 1>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
     // clang-format on
     >;
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instance.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..840a4fabe1c211689796debc86e8ce63e0c19b4e
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instance.cpp
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+
+// Compilation parameters for a[m, k] * b[k, n] = c[m, n]
+using device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instances = std::tuple<
+    // clang-format off
+        // #########| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|       BBlockTransfer|     CThreadTransfer|  CThreadTransfer|    CThreadTransfer|
+        // #########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
+        // #########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder|  Lengths_K0_N0_N1_K1|               Order|                 |                   |
+        // #########|      |      |      |        |        |        |        |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
+        // MPerBlock=128, NPerBlock=128
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,   128,    16,  2,          4,          4,      1,       S<4, 4>,       S<4, 4>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,   128,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  2,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    16,    64,    16,  2,          1,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 1, 2>,      S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,       S<4, 1, 4, 2>,       S<4, 1, 16, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=16
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    16,    16,  2,          4,          1,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,       S<16, 1, 4, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,     8,    64,    32,  2,          1,          2,      1,       S<4, 1>,       S<8, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<4, 1>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
+    // clang-format on
+    >;
+
+void add_device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(instances,
+                                   device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp
index 159fa90f7432248721f3bfef606f6df3b04584e9..abe52ce1da0edb191755d205a4e744c274ae56b1 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
@@ -35,7 +35,39 @@ using device_gemm_dl_f16_f16_f16_mk_nk_mn_instances =
         //  ########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
         //  ########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder|  Lengths_K0_N0_N1_K1|               Order|                 |                   |
         //  ########|      |      |      |        |        |        |        |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
-        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>
+        // MPerBlock=128, NPerBlock=128
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<4, 4>,       S<4, 4>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // // MPerBlock=128, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,    64,    16,  2,          4,          4,      1,       S<8, 2>,       S<4, 2>,      S<8, 1, 2, 2>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,    64,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 4>,      S<8, 1, 2, 2>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // // MPerBlock=64, NPerBlock=128
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    64,   128,    16,  2,          4,          4,      1,       S<4, 2>,       S<8, 2>,      S<8, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 2, 2>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    64,   128,    16,  2,          4,          4,      1,       S<2, 4>,       S<2, 8>,      S<8, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 2, 2>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=16, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  2,          1,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 1, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=16
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  2,          4,          1,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<2, 2>,       S<2, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<1, 4>,       S<1, 4>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<4, 1>,       S<8, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<4, 1>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
         // clang-format on
         >;
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instance.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..82b1b5dc29fc4e5612236f2e0271597e405dfe80
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instance.cpp
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+
+// Compilation parameters for a[m, k] * b[n, k] = c[m, n]
+using device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instances =
+    std::tuple<
+        // clang-format off
+        //  ########| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|       BBlockTransfer|     CThreadTransfer|  CThreadTransfer|    CThreadTransfer|
+        //  ########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
+        //  ########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder|  Lengths_K0_N0_N1_K1|               Order|                 |                   |
+        //  ########|      |      |      |        |        |        |        |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
+        // MPerBlock=128, NPerBlock=128
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,   128,    16,  2,          4,          4,      1,       S<4, 4>,       S<4, 4>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,   128,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  2,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    16,    64,    16,  2,          1,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 1, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=16
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    16,    16,  2,          4,          1,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,     8,    64,    32,  2,          1,          2,      1,       S<4, 1>,       S<8, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<4, 1>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
+        // clang-format on
+        >;
+
+void add_device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(instances,
+                                   device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp
index d8e7798438dcaaa9d7c2e48e8074a2f58a635438..e696bfdcdc07f6799a00b9463837031559b715a5 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp
index 0034ac59c3888d8dcc1b2bf8e136ac4f02c2e6ad..d3ad7c60ec6f8cf2321aaa03e18e62ac28e65773 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp
index 0b540b8b3492c355d75adf98cacc1826417db0b9..a56a36b0ab9cf0260a0366560d8816f18f1f5d7e 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp
index 4f6ff5111b1fa5dbf39e4c60d1770a319aa9dbba..63d55e81d6a5cb1157ea72a97e01599e6fc750ff 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp
index a4208245e5756ccf3a2e6be29c3a5c5f73e97acc..3d9a265c29b35f09699fb12b578942a5c5f9ed89 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
@@ -31,7 +31,40 @@ using device_gemm_dl_i8_i8_i8_km_kn_mn_instances = std::tuple<
         // #########|   Type|    Type|    Type|       Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
         // #########|       |        |        |           |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder|  Lengths_K0_N0_N1_K1|               Order|                 |                   |
         // #########|       |        |        |           |        |        |        |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
-        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  4,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<2, 1, 4, 4>,      S<8, 1,  32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 4, 4>,      S<8, 1,  32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>
+        // MPerBlock=128, NPerBlock=128
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  4,          4,          4,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,   128,    16,  4,          4,          8,      1,       S<8, 2>,       S<4, 2>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,   128,    16,  4,          4,          8,      1,       S<2, 8>,       S<2, 4>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=128, NPerBlock=64
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,    64,    16,  4,          4,          2,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<2, 1, 4, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=64, NPerBlock=128
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,    64,   128,    16,  4,          2,          4,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=32, NPerBlock=32
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<2, 4>,      S<2, 1, 4, 4>,        S<4, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<2, 1, 4, 4>,        S<4, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=16, NPerBlock=64
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  4,          1,          4,      1,       S<4, 2>,       S<4, 2>,      S<1, 1, 4, 4>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<4, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  4,          1,          4,      1,       S<2, 4>,       S<2, 4>,      S<1, 1, 4, 4>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<4, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=16
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  4,          4,          1,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<1, 1, 4, 4>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  4,          4,          1,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<1, 1, 4, 4>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  4,          2,          2,      1,       S<4, 1>,       S<4, 1>,      S<4, 1, 4, 4>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<4, 1, 4, 4>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  4,          1,          2,      1,       S<4, 1>,       S<8, 2>,      S<1, 1, 4, 4>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<8, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  4,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<1, 1, 4, 4>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<8, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  4,          2,          1,      1,       S<8, 2>,       S<4, 1>,      S<8, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<1, 1, 4, 4>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  4,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<1, 1, 4, 4>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  4,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<1, 1, 4, 4>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<1, 1, 4, 4>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  4,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<1, 1, 4, 4>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<1, 1, 4, 4>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  4,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<1, 1, 4, 4>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<1, 1, 4, 4>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  4,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<1, 1, 4, 4>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,     S<1, 1, 4, 4>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
     // clang-format on
     >;
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_kn_mn_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_kn_mn_irregular_instance.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0439201511f285a0d7eba405df72d1942a4fb49b
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_kn_mn_irregular_instance.cpp
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+
+// Compilation parameters for a[k, m] * b[k, n] = c[m, n]
+using device_gemm_dl_i8_i8_i8_km_kn_mn_irregular_instances = std::tuple<
+    // clang-format off
+        // #########|  AData|   BData|   CData|    AccData| ALayout| BLayout| CLayout|           A|           B|           C|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|       BBlockTransfer|     CThreadTransfer|  CThreadTransfer|    CThreadTransfer|
+        // #########|   Type|    Type|    Type|       Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
+        // #########|       |        |        |           |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder|  Lengths_K0_N0_N1_K1|               Order|                 |                   |
+        // #########|       |        |        |           |        |        |        |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
+        // MPerBlock=128, NPerBlock=128
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<8, 2>,       S<4, 2>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<4, 4>,       S<4, 2>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<2, 8>,       S<2, 4>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=128, NPerBlock=64
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,   128,    64,    16,  4,          4,          2,      1,       S<4, 4>,       S<4, 4>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 4, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,   128,    64,    16,  4,          4,          2,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 4, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=128
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,    64,   128,    16,  4,          2,          4,      1,       S<4, 4>,       S<4, 4>,      S<2, 1, 4, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,    64,   128,    16,  4,          2,          4,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=32, NPerBlock=32
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,    32,    32,    32,     8,  4,          2,          4,      1,       S<4, 2>,       S<2, 2>,      S<2, 1, 4, 4>,        S<4, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 4, 4>,       S<4, 1, 8, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<4, 2>,      S<2, 1, 4, 4>,        S<4, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 4, 4>,       S<4, 1, 8, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<2, 4>,      S<2, 1, 4, 4>,        S<4, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<2, 1, 4, 4>,       S<4, 1, 8, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,    16,    16,    16,    16,  4,          2,          2,      1,       S<2, 2>,       S<2, 2>,      S<4, 1, 4, 4>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 4, 4>,       S<4, 1, 4, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,    16,    16,    16,    16,  4,          2,          2,      1,       S<4, 1>,       S<4, 1>,      S<4, 1, 4, 4>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 4, 4>,       S<4, 1, 4, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,    64,     8,    64,    32,  4,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<1, 1, 4, 4>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 4, 4>,      S<4, 1, 16, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,    64,    64,     8,    32,  4,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<1, 1, 4, 4>,      S<32, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,     8,     8,     8,     4,  4,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<1, 1, 4, 4>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<1, 1, 4, 4>,       S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,     8,     8,     8,     4,  4,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<1, 1, 4, 4>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<1, 1, 4, 4>,       S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,     8,     8,     8,     4,  4,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<1, 1, 4, 4>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<1, 1, 4, 4>,       S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,     8,     8,     8,     4,  4,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<1, 1, 4, 4>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<1, 1, 4, 4>,       S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
+    // clang-format on
+    >;
+
+void add_device_gemm_dl_i8_i8_i8_km_kn_mn_irregular_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Col, Row, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(instances,
+                                   device_gemm_dl_i8_i8_i8_km_kn_mn_irregular_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp
index 06fab7f68a2efef857107e3ef6038b4505e21712..240384d19c5eeebb86ac92be53d22605b1862801 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
@@ -31,7 +31,40 @@ using device_gemm_dl_i8_i8_i8_km_nk_mn_instances = std::tuple<
         // #########|   Type|    Type|    Type|       Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
         // #########|       |        |        |           |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
         // #########|       |        |        |           |        |        |        |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                    |                    |                |                   |
-        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  4,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<2, 1, 4, 4>,      S<8, 1,  32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,       S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,               5,                  4>
+        // MPerBlock=128, NPerBlock=128       
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  4,          4,          4,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,   128,    16,  4,          4,          8,      1,       S<8, 2>,       S<4, 2>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,   128,    16,  4,          4,          8,      1,       S<2, 8>,       S<2, 4>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=128, NPerBlock=64
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,    64,    16,  4,          4,          2,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=64, NPerBlock=128
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,    64,   128,    16,  4,          2,          4,      1,       S<4, 4>,       S<4, 4>,      S<2, 1, 4, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=32, NPerBlock=32
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<2, 4>,      S<2, 1, 4, 4>,        S<4, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,      S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=16, NPerBlock=64
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  2,          1,          4,      1,       S<4, 2>,       S<4, 2>,      S<1, 1, 4, 2>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  2,          1,          4,      1,       S<2, 4>,       S<2, 4>,      S<1, 1, 4, 2>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=16
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  2,          4,          1,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  2,          4,          1,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<4, 1>,       S<4, 1>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<4, 1>,       S<8, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<4, 1>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
     // clang-format on
     >;
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_nk_mn_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_nk_mn_irregular_instance.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..350834f7e515b93645e2f22285c19f9b9285304c
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_nk_mn_irregular_instance.cpp
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+
+// Compilation parameters for a[k, m] * b[n, k] = c[m, n]
+using device_gemm_dl_i8_i8_i8_km_nk_mn_irregular_instances = std::tuple<
+    // clang-format off
+        // #########|  AData|   BData|   CData|    AccData| ALayout| BLayout| CLayout|           A|           B|           C|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|      BBlockTransfer|     CThreadTransfer| CThreadTransfer|    CThreadTransfer|
+        // #########|   Type|    Type|    Type|       Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
+        // #########|       |        |        |           |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
+        // #########|       |        |        |           |        |        |        |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                    |                    |                |                   |
+        // MPerBlock=128, NPerBlock=128       
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<8, 2>,       S<4, 2>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<4, 4>,       S<4, 2>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<2, 8>,       S<2, 4>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=128, NPerBlock=64
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,    64,    16,  4,          4,          2,      1,       S<4, 4>,       S<4, 4>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,    64,    16,  4,          4,          2,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=128
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,    64,   128,    16,  4,          2,          4,      1,       S<4, 4>,       S<4, 4>,      S<2, 1, 4, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,    64,   128,    16,  4,          2,          4,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=32, NPerBlock=32
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    32,    32,    32,     8,  4,          2,          4,      1,       S<4, 2>,       S<2, 2>,      S<2, 1, 4, 4>,        S<4, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,      S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<4, 2>,      S<2, 1, 4, 4>,        S<4, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,      S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<2, 4>,      S<2, 1, 4, 4>,        S<4, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,      S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    16,    16,    16,    16,  2,          2,          2,      1,       S<2, 2>,       S<2, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    16,    16,    16,    16,  2,          2,          2,      1,       S<4, 1>,       S<4, 1>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
+    // clang-format on
+    >;
+
+void add_device_gemm_dl_i8_i8_i8_km_nk_mn_irregular_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Col, Col, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(instances,
+                                   device_gemm_dl_i8_i8_i8_km_nk_mn_irregular_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp
index b6d72fa221f4b946f3a25363a99b737a8a830428..e96905247d6a72e057bf13d821fdc17f78a380cf 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
@@ -31,7 +31,40 @@ using device_gemm_dl_i8_i8_i8_mk_kn_mn_instances = std::tuple<
         // #########|   Type|    Type|    Type|       Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
         // #########|       |        |        |           |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder|  Lengths_K0_N0_N1_K1|               Order|                |                   |
         // #########|       |        |        |           |        |        |        |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                |                   |
-        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  4,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,      S<8, 1,  32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,               5,                  4>
+        // MPerBlock=128, NPerBlock=128
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  4,          4,          4,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,   128,    16,  4,          4,          8,      1,       S<8, 2>,       S<4, 2>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,   128,    16,  4,          4,          8,      1,       S<2, 8>,       S<2, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=128, NPerBlock=64
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,    64,    16,  4,          4,          2,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=64, NPerBlock=128
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,    64,   128,    16,  4,          2,          4,      1,       S<4, 4>,       S<4, 4>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=32, NPerBlock=32
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<2, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 8, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=16, NPerBlock=64
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  2,          1,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 1, 2>,      S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,        S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  2,          1,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 1, 2>,      S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,        S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=16
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  2,          4,          1,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  2,          4,          1,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<4, 1>,       S<4, 1>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<4, 1>,       S<8, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,        S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,        S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<4, 1>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
     // clang-format on
     >;
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_kn_mn_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_kn_mn_irregular_instance.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..27397527bc973c56bf15a8c35e86dec89184959c
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_kn_mn_irregular_instance.cpp
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+
+// Compilation parameters for a[m, k] * b[k, n] = c[m, n]
+using device_gemm_dl_i8_i8_i8_mk_kn_mn_irregular_instances = std::tuple<
+    // clang-format off
+        // #########|  AData|   BData|   CData|    AccData| ALayout| BLayout| CLayout|           A|           B|           C|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|       BBlockTransfer|     CThreadTransfer| CThreadTransfer|    CThreadTransfer|
+        // #########|   Type|    Type|    Type|       Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
+        // #########|       |        |        |           |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder|  Lengths_K0_N0_N1_K1|               Order|                |                   |
+        // #########|       |        |        |           |        |        |        |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                |                   |
+        // MPerBlock=128, NPerBlock=128
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<8, 2>,       S<4, 2>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<4, 4>,       S<4, 2>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<2, 8>,       S<2, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=128, NPerBlock=64
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,    64,    16,  4,          4,          2,      1,       S<4, 4>,       S<4, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,    64,    16,  4,          4,          2,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=128
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,    64,   128,    16,  4,          2,          4,      1,       S<4, 4>,       S<4, 4>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,    64,   128,    16,  4,          2,          4,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=32, NPerBlock=32
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    32,    32,    32,     8,  4,          2,          4,      1,       S<4, 2>,       S<2, 2>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 8, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<4, 2>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 8, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<2, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 8, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    16,    16,    16,    16,  2,          2,          2,      1,       S<2, 2>,       S<2, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    16,    16,    16,    16,  2,          2,          2,      1,       S<4, 1>,       S<4, 1>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl< int8_t, int8_t, int8_t,      int32_t,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
+    // clang-format on
+    >;
+
+void add_device_gemm_dl_i8_i8_i8_mk_kn_mn_irregular_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Row, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(instances,
+                                   device_gemm_dl_i8_i8_i8_mk_kn_mn_irregular_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp
index 67d2e3ce4b41c097fffb3099b97a8d398b44b743..124b818b28cfe1145a5936a9ea77500b980df0ad 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
@@ -31,7 +31,40 @@ using device_gemm_dl_i8_i8_i8_mk_nk_mn_instances = std::tuple<
         // #########|   Type|    Type|    Type|       Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
         // #########|       |        |        |           |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
         // #########|       |        |        |           |        |        |        |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                    |                    |                |                   |
-        DeviceGemmDl< int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  4,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,       S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,               5,                  4>
+        // MPerBlock=128, NPerBlock=128
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  4,          4,          4,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,   128,    16,  4,          4,          8,      1,       S<8, 2>,       S<4, 2>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,   128,    16,  4,          4,          8,      1,       S<2, 8>,       S<2, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // // MPerBlock=128, NPerBlock=64
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,    64,    16,  4,          4,          2,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // // MPerBlock=64, NPerBlock=128
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,    64,   128,    16,  4,          2,          4,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=32, NPerBlock=32
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<2, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=16, NPerBlock=64
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  2,          1,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 1, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  2,          1,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 1, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=16
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  2,          4,          1,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  2,          4,          1,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<4, 1>,       S<4, 1>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<4, 1>,       S<8, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<4, 1>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
     // clang-format on
     >;
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_nk_mn_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_nk_mn_irregular_instance.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b99f3f2b63902c4b832c86dcdc3f74376d972dbe
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_nk_mn_irregular_instance.cpp
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+
+// Compilation parameters for a[m, k] * b[n, k] = c[m, n]
+using device_gemm_dl_i8_i8_i8_mk_nk_mn_irregular_instances = std::tuple<
+    // clang-format off
+        // #########|  AData|   BData|   CData|    AccData| ALayout| BLayout| CLayout|           A|           B|           C|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|      BBlockTransfer|     CThreadTransfer| CThreadTransfer|    CThreadTransfer|
+        // #########|   Type|    Type|    Type|       Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
+        // #########|       |        |        |           |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
+        // #########|       |        |        |           |        |        |        |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                    |                    |                |                   |
+        // MPerBlock=128, NPerBlock=128
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<8, 2>,       S<4, 2>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<4, 4>,       S<4, 2>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<2, 8>,       S<2, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // // MPerBlock=128, NPerBlock=64
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,    64,    16,  4,          4,          2,      1,       S<4, 4>,       S<4, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,    64,    16,  4,          4,          2,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // // MPerBlock=64, NPerBlock=128
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,    64,   128,    16,  4,          2,          4,      1,       S<4, 4>,       S<4, 4>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,    64,   128,    16,  4,          2,          4,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=32, NPerBlock=32
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    32,    32,    32,     8,  4,          2,          4,      1,       S<4, 2>,       S<2, 2>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<4, 2>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<2, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    16,    16,    16,    16,  2,          2,          2,      1,       S<2, 2>,       S<2, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    16,    16,    16,    16,  2,          2,          2,      1,       S<4, 1>,       S<4, 1>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<  int8_t,  int8_t,  int8_t,    int32_t,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
+    // clang-format on
+    >;
+
+void add_device_gemm_dl_i8_i8_i8_mk_nk_mn_irregular_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Col, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(instances,
+                                   device_gemm_dl_i8_i8_i8_mk_nk_mn_irregular_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp
index 03eebf4ec3062cf5f85486288673fb96fb2761b1..2e884dfc8afba8d6c6ea88a8e3159c7a62660364 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instance.cpp
index 5d8de04cd9fa66931dde4ed0494fc429659506c3..2ca29b1e6f0791154d16ec1b5dd134d8d9c77c37 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instance.cpp
index 7b12b7cf1f42da6d8c321c1ac121166753b7ff59..706076098da912ea9e5a844ccb2797cbc850a01b 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instance.cpp
index 730ffd4633de3c72cb6ab932126a18d8798418dd..5ac458a7b985d16faf90a53866ec4f9d2bc9c6b0 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp
index 619473ff0ce40a5150c59efdfd62bc905d7ce83f..a64412544721237542c729d0f9581acb39917555 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp
index 8e06f9d26b4d3cf8a22259a54e103c7b8ce76190..44b684823391fd83caaf4b55a1ef41c19fe5b3c1 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp
index f9458b7483c5c29011b4aa063061436da23f9e36..23176269c2c84808ad05a874259694f22c6064d1 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp
index 77a03b746cc12e5f8cea8772c4432c9fb4e24110..31a9abe53a2312c25ebeb5157e81998054d2345b 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp
index ef8d7d4e40ef4de46857db4932e109780e52f044..201fd93110b604b1c6df849fcc5c88ff98a93bea 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instance.cpp
index cb65cc7b68b359ec015af3adcd3a7866983d492b..5d489b207efdf62de64d99469acfdbe49a13859f 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instance.cpp
index 5b1014ed827d08f275b5da4c20abe14f153b9950..e09480d57bc20e54697fcd3c5ae761a66fda678b 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instance.cpp
index e6f6add8bf45e98f6e9607b48a924e00ef642323..34065c334d1bee6064718a8dbac6f0b6f51c5826 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instance.cpp
index 80b3d03da26c49136c8220f8a433d8798ad751e1..95d7777a79371c4d9dfa1ec6ff09b16dc55a3807 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
index 93b3df1e572604e70d059c6cb622f8143eb2a92a..bf24bc76b1b8e3c64473df1d6b985712c53f4a20 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
index f10365d892455e3e36f5e05ec94b7a4f577c1fbc..023f987127cc61983fb11f53838cff9383f0f257 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
index a7a9eb62cef20c05cee9ff76e5d1c0b21fb95ab9..ffb199e58fe3a0200197ef1d119feec348a01502 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
index 9fb45b00365896be88e6aff214922c51325a4d65..90e979d891a5c4d1d9ca18653311afeda9029e5b 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp
index 18a78674e7ac4d79e6da3b035c8e9ea966618f16..8a81b77891fb2bb92175ef0d2e0e06f3fc38fc82 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp
index cef6070af8a72a6cdb0616574736e722094f3a74..e1983add0459de966453bd86fb3d25e2fc68af2d 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp
index 1be70d6ca414f785ec2582fd90bc5d3d2617e4e3..47a180e1276b9290cd4a7506ab1a3df2917b2e6e 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp
index 6b8455ffa93fcceaf9d0e5d5595197f8bafddcbe..b8e994e91acc5f73469c257c4dd8d99bec193d97 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp
index b9e28e3d7ef70b2e3aa785b048280fb4959ce312..a590413accba4e9e0dddd8df2952cd9d3fdb8b35 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_nk_mn_instance.cpp
index 2b1a5a57bb8b22b9235fdd02fe3f9da659b13c0d..1d010d1b0792dfc62f0a1de235558ab8e4e71fd0 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp
index 301d3b55b59c0a6818b27570fe1c9a7b9ff44946..f108b753421598c376eefb8cfa52b0a4f1522920 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp
index cd16f35ff233fb04c79d50888f54b806e79e611a..b0b4bc012df347cf3e487d96ecef2e03826f5e77 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp
index 39166698473dbd05010f960e82366012ffaa1863..df3bd94fcafa5062b597584c3aadf2772c376f39 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp
index 0a623034ef65a1cfd0d220b9ad2253ab0b298620..73b4e776668c969fb6ddcd38f74bd1a74fea48c8 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp
index 5ef8d08de90a65ae04f552fc87447e3b7887bc92..76137a1c3e1e2cb82c9e15ec9bc71e4abe76391e 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp
index c9557bae893e1affc0f2138aaa9cfca206e267d1..f0158d8f3d51fbb773988d02c14c329e9243ebf9 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp
index 463e0865c0ae6980c534005d1ace62750645a74c..7b65f8737aa311306a0d2afeb9043b28c08685a2 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp
index b71ff1b9972a97276a0fedf0106814a54ba19a0c..a9fef5c60d9c09be99e44d87a15bd4be90b6ae0f 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp
index 9060c9b1b084512f038ccd5c5b0c3573ac32f0a9..c3b1dfccab6b23d46044e8cf4259e8059e062e52 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp
index 81cf01d6a9d70346e8ea2387220fb03258e6e580..8338b34a4d4f2b76eed2eb77f6f2d47724134185 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp
index 4da85cc46eb4ded15bcd63dd7b0d89864637d2b0..357dc91aa177d4f714245afc21e827a9705ecce4 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp
index ab83e4baabf80242bfbf569fb13b5d39a3686ad7..65b94d0879f9ad326ff402a491c1b515ddf0f0ca 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
index a4cd3fadbe93f4231388e9efccbe28a2c69a6f1b..00cd07bab868b21fd72480fa2848ae9e84a71164 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
index 207e76ffe5f18f9a4d64a0083c27829933e7a5fa..9955a206b6f4eba7e743b786b2923b080d06fdcb 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp
index 3f30937ff5b3b7235efc93dc6287a494ed783dee..d9ae7817914dbcb7dcae38254afef6f02e54a4da 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp
index d91e6c63bea06d70513106cf7eac171b892498b9..27c6cbe8ab71398f9c3bc03bb977c71af397836d 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp
index 143321542353e4d84d7275f375d9fd49103bc8cb..ff5f0e94a1732e3b36bc9295c4f1ce9a24fa467f 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp
index 09acc7c0f75bcd76412755a590605833c2bc3b5e..dba625e0ada1d4f09cb26a5860849a707a35cb97 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_kn_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_kn_mn_mn_mn_instance.cpp
index 47b8d23424d1dd59099441d3da8d80c1b889daf2..28a452c1a1124a326337cf4c21183035294fda1d 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_kn_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_kn_mn_mn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_nk_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_nk_mn_mn_mn_instance.cpp
index efa030ec4952acabef2a5fbd879adc07eeb2ecbf..13366238d6d9008d7ed425485670df5229f71cee 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_nk_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_nk_mn_mn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_kn_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_kn_mn_mn_mn_instance.cpp
index f2735020e65bdbeda2c31df3c451d16903a0e320..8a4889ee83eabcae6466bdd5dd67d268c9d54287 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_kn_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_kn_mn_mn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_nk_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_nk_mn_mn_mn_instance.cpp
index 7d4aae928b31d5c086ea717ad4be492a577b8cb1..fc3cbcf9055892bfcf63dcf6d3d3c4515026ca71 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_nk_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_nk_mn_mn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp
index e8747af4828a2492f6db8e23b575de4efb7ce90e..bfb95bce85cb8696e645ec6b5626abf84456916f 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp
index ed54c3a9bf58346a8829f3d7f275c6668e00ed1d..d0352339cf38fecf9c3b3d4fd8a91729ed53495f 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp
index da7eae637bfb16580b4ec98e479768c2649adad7..d5b298ab217b4269beca858f0d54e319bf6dbf4f 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp
index 34345095e0c2489f44437ea58f58b2891cc91ff1..80c8f018f416154d0b2ccd26cb9f8822233a43e4 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp
index 55461dfba783de45a2991836535b3b652cc49a44..74ec9e1f8d28a77b0f78181cd6c05ea93b140e16 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp
index 405e69975ca9ffec97999bce38fd73d953e139ed..eb98b3e7e871a1c5f4129032031175443ddc4de0 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
index 9af31b3a129d2d1bf1ff6f32d0ee8e252cbdfa29..5f4a90125ac827ccfa406b5bf81f11313ce54258 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
index 841b7a1d47ce48b8486f0af04ff80571e4601d1a..38e3897d6a552d8a2f797b9f3b4baaa98636a53d 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp
index 9f7f643beb6832594bdc393777e5e08663a1f94e..c5aa59f911d10896c35e13855bd941b0f6d0564b 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp
index c8e9f35d240da45fb902a2d02cd73d2eab5167e1..e71b269b8dcceef3e41fc67228c8d482a2d00bd7 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp
index 5f804d45a560ac3b7b519cb3ab1840214e392d39..fdf63f811210a63b4fd4dcb669cce0fc0a0980d7 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp
index 60cb138f5654bb6f2b10ea5ba0a03191a49906ab..4c98a9d5e929e77ee74050645154db9bfd8e1811 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp
index 59e2b2da8641b9fb094750077d760e9f611d9c98..f0e7b6ab43c51abeb4e79e9e30b104bf05f82a07 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp
index bb09bf8b8e82f6c2056881f37b9e65c73657f0e5..56815b9ac5fa4d89b649089c73fbfeddc985cc8e 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp
index 0a3b566de64505adb19f75ffe5e126cfed5da438..e66d46a26995ec521df8fe15a7fd01277ebaff0d 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp
index 2b17e47b1cce153654d74ca7497b5c04117071de..fb1dfac69cd525ee7b0caa3d3d7b732ba5042bd8 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instance.cpp
index e178d3b0adeb563c36d6a8f1f3936d048df23f75..fed2cbbfb980ab27e8810b643874fc9b308b21b7 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp
index 52be9fe709bd710f3e05f381e342deef5c7affae..44ac4c08cd97881f98f0c36a9bd5d42cd5078327 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp
index c4680db831790d3e86f676bd14a07fc266deb9f6..30a2bf36d1cef771764e2832fc8d8a91b7120588 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp
index 7fc35c4198eb49bf2625f6cfa8eb80a001fdc086..4b2a2dbdc73aee32a427f91470610325a5bb3080 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instance.cpp
index f27b2199e0c9dcb1e91566cb0aa0a05edaef5b0e..9d15ccd362774c77a9613046995d44761b8ccb49 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instance.cpp
index b9a1095570aaf33aa8f3f93710cbb6d3ae276430..4e9ad58742c552f437dd4b8174cb675984dfc681 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instance.cpp
index 44e5f597d0d714438c97a9ce7d591736f4fe7f48..330e5aff907af00920dcc4f3cb1f5f2a3b50e6fa 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instance.cpp
index f3a9063f7fb847b9c303ea836f97ed4cf3173ab9..0db3a15d2a4816cca200820346ccf7c9a711fa31 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp
index 05ba449246e8c6f68182eb707a3e9da0974e2078..ccbfaeaf4f3d0c1d6b3f25adf8765466195af971 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_instance.cpp
index 7a610a747cc9cc7555bd18b17fd499d19cd7c439..e10de67f94bf4747915552c54ff47874b53eb721 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_instance.cpp
index 90e074f052c6f935bcbf862250670289e0634617..8b47e82f6d3574fd04c5ed69d82dd4b6f2584779 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp
index 74aebf1031d5b02db54fece32fa693e0c532287d..5aa50adb31ecf5c8c636d0bb34946b166c5919e6 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f16_instance.cpp
index 361ea8f4ee9ff8144fd2392754fdaa53f380e40b..333b40c71ce1c755f7d7862c9ddae8013392ea8a 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f32_instance.cpp
index 3145b716402e4e5cf8e542dc991bc7db0a655e74..506a93ae9ba07d7a53e770809f2c47399432e476 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_int8_instance.cpp
index cde93f902c95d1c28f85b7a7627089fe3e97f841..30084f16d1d4c86ee2778b0d3526c2e593d39259 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_int8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
index 3d604d42cc3a9fcbf8e9750ab5e3b2064f0bc161..11babea28e23a671548af1ab469c65041a845502 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp
index ede21f1f4f7124156540518bf4252d3526929606..c2c0fc553cb38d299a1fbbc9990a53817c7105cd 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
index 99e556618c3bfc535807ae8595a33e0f6b6d5390..5be7443eca6ec8c7acd76beeb2c790b711810b89 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
index 15871a28c3a2a85c053fc3259f9de67b5694ede6..2828b432ff6772d802d850bbcc17ee58a3af909c 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_common.hpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_common.hpp
index b4de825fb626067c49be31f5f6e2f01a459b4cad..85a7a5be257365f37234c3b723d47a86a253b24d 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_common.hpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
index f7e575df2b4cea0a6cc5774f5471ba9120777000..5eb881549cf37c7aba2dae0e5ac1aa3402a9c3a9 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "device_grouped_conv2d_fwd_dl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
index 85300b4e44e54df0317fd305060d6a0ac6adb33b..4157853c41767f8ef705c864482fa56724d16e19 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "device_grouped_conv2d_fwd_dl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_instance.hpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_instance.hpp
index bcda22006b3ee788f8db1dd2cf427e6d510d425c..3d3f9b17930e051638f7e017bf7b4250fee19eac 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_instance.hpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
 #include "device_grouped_conv2d_fwd_common.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp
index 40593a0efba9f68e96d3760950ab9cf874b6b5cb..bc7d577b69b623901b71e4127ee67d55676947f2 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "device_grouped_conv2d_fwd_xdl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
index 7088028bf54ab66f3873526e098d0e16b92a30d1..55bf6da9b9e1b876bce918ee8d611c69a053d820 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "device_grouped_conv2d_fwd_xdl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
index 919274c5033aa1e993e4bd0c094dd9a48cc2438c..202cdd6b446b20277eccedcd89bbf2346b906241 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "device_grouped_conv2d_fwd_xdl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_instance.hpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_instance.hpp
index 2858671ee90005cd807c7f8dbb42e3823f04cb54..07bea1c03cafca9e7cd3069c1fb04c0e1d6cf9f4 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_instance.hpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
 #include "device_grouped_conv2d_fwd_common.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
index 25caf61df1349638f8f089acb69d2221a82be7fa..82edf896db1c932c2d3b19d9361e1294801b4035 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "device_grouped_conv2d_fwd_xdl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
index b997cfb6723c2699136b2eea0d90f1afce22ee6f..4bd3236caf31392c6626b0df299d02082f55c6e1 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "device_grouped_conv2d_fwd_xdl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
index 3256a2a8268cf02bd4f8f2ec2d71bfa77c74ff21..4f5bdb202349ad0a5c10bbde97caefda0eca61f2 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "device_grouped_conv2d_fwd_xdl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp
index e48db4a5314528bb45f4444f93da4ba8872e86d8..7ae87eed5d2504efa3c24b8c669c36ba19b7bc79 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
index 1655850ec148568734cbdbace8eefe8cb6487e0c..ab07341b59b3c9a4bacfcc842cf8ab2e8b09372b 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
index aba46b7ebeb90afb8881fc7ad4d8a2e1281eb146..15045bedd87a1d76eb2804e7e95dfac4db3e12ff 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp
index b4ae8b6ce5f2ba3d8f3569449934d84fc631252c..c3b100ea69c8312dd72a724d8b9c4e242efa86df 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
index 061674bd829840def99f4dcf317886844d398fea..ca488e9dc429e7cbb943979ce79f9f2dda157c48 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
index ed7e5476760983e8ca5a17475585ec5eb9e834f2..6087b1b187cd4af0d45402379dc391479ea936c5 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instance.cpp
index bf5fa306013e485e6dd57faa011bee8217001148..fd8c47deb38ab31641622317c28c9d7303c85392 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_bf16_instance.cpp
index 8c384937352a012a701a21a3bb572d6b6fb7e57c..5e5dbc53cd1738005efb26d8d3852a1c99a26057 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f16_instance.cpp
index 487cd22721a086bbeca32dd56639614903596905..a88fe4af6335f70b60a0f8ef044adca7adbc5356 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f32_instance.cpp
index d497cd57edfd85eda86d9641f49fa219b45e72d5..f6e7e5b28ea54f0749185bf3ef04314bf8c97120 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_int8_instance.cpp
index 2e53fbbda5c5e9d4f2712b17fce2f57f7d140842..3d303a3fa7bd5e2d7cf2fd2e547c76e42631eeaf 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_int8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp
index b550bb28716e14d6618dd06ed06802211c92c866..aa161e51c989b2128a9a1e1c5ad055d2678ae737 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp
index a3f9c7a9e73f6f6db24a6526bf5e8195842a33ac..c454deac1a3a045f51a853d1546a70500360cc15 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp
index a93cb7fc84e9908613cc9e180198f60800524ec9..c829e8863d2c1156c1754ee3ae73c47a5f84a07a 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
@@ -64,6 +64,7 @@ using device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_irregular_tile_instances = st
         //###################| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|        Type|  Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
         //###################|       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
         //###################|       |       |            |       |      |      |        |         |            |      |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceGroupedGemm_Xdl<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
         DeviceGroupedGemm_Xdl<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
         DeviceGroupedGemm_Xdl<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   2,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,              8>,
         DeviceGroupedGemm_Xdl<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp
index 2ace1b24320b215fc5d45f21aec69ddcbc346cff..fb30e7a97317aa74d589d0987011b54dc67dc62b 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp
index 764ec061981ecc814a5c2a88fa01c98be17f0720..8642562fa38f25a7e487d7b4194ee8f8f16beb91 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp
index 0385b0fc0cd699e6ded2f506f5a31959fe597fa1..83b31b07cf45c068be8711838ff43ab2ad0101a6 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
@@ -44,14 +44,14 @@ using device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_tile_instanc
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   192,    64,    32,   8,   8,   32,   32,    3,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
-//      DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,    64,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 48, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,    64,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 48, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
-//      DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8>,
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8>,
-//      DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 24, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 24, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8>,
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   192,    32,    32,   8,   8,   32,   32,    3,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,    64,    32,   8,   8,   32,   32,    1,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    64,    32,    32,   8,   8,   32,   32,    1,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp
index cffb0fce17728ae09075047bc5f1e571b7381fb2..aa6365cd98c039c36007b875d425e3cef37ba782 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp
index 5933ff61eca76a4b8b5bddf1961b837695c6cd5a..f4460b360be563123fdf7e02c43b49008dfc43ea 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
@@ -37,7 +37,7 @@ using device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_tile_instanc
         //################################|       |       |            |       |      |      |        |         |            |      |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   192,    64,    32,   8,   8,   32,   32,    3,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
-//      DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,    64,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 48, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              4>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,    64,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 48, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              4>,
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
@@ -45,7 +45,7 @@ using device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_tile_instanc
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,    
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8>,
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   192,    32,    32,   8,   8,   32,   32,    3,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,
-//      DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8>,
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,   256,    32,   8,   8,   32,   32,    1,    4,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8>,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_km_kn_mn_instance.cpp
index c2f5f00c7acb62da4a901bc965d237aeb46bcf3a..f4086b6eac4dd74903fbddb955d5bd7f41ee748a 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_km_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_km_nk_mn_instance.cpp
index 476d4ce1f87a0d2d5c109ba847273edbb31cd8e1..d68eb76144579b073f8119d431910696e6b4af9e 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_km_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_mk_kn_mn_instance.cpp
index 1023fa48102b7078b320be22beb19881cb678cba..2dfb8caace5144fa29934af425b005c28e9d77c1 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_mk_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_mk_nk_mn_instance.cpp
index 6b065c0f82ac87adb5acbbf21cd470392890bd6d..598a0b0e29c22effb655b09247bc84ef6c1dd268 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp
index e9c2112e16e79513d246afe635011bd8620db0b4..be860f58e068c83d45d59f0ecdf87faa2827088a 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "normalization_instance_common.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp
index 79dde38fc90d370c8a483a0187a1e95f3abd9d01..9a64e555d650efdf9a81bfa7ca9f47ef0c1dfd90 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "normalization_instance_common.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp
index 9f6bf128faebd85190b93c2d8e6a08ab1c9f1baa..fe72a27331d94eb2db662241687ba136e3b49ac5 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "normalization_instance_common.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp
index 6241e0338561fca8b6e286cec098c8edefcf6424..cac8641e1351b6efd291cc634eb849f321859921 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "normalization_instance_common.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp
index b64328d5d07518e9ff4f259d35edb11811239e63..0a9ac846235f7d0c7b76ed15b5c5481f57de40b8 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "normalization_instance_common.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp
index d6a2f6f2c1c9c86d37179bf6c7a00cdcf6e359bc..ad92818ec2f0e32ba73c2e9b64a06eb61fd2786d 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "normalization_instance_common.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp
index 73097828e3b19017453f197d8eb4ae152cb10fa9..70e3bbc1c1d62aef89f0aafe12384ebeaafa9029 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "normalization_instance_common.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp
index 507a683ee7a8a435604dba0a5c56969ba84d9a1b..7c5d2c4a9c1eac226adb399cbf1b4d035cb3dfde 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "normalization_instance_common.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp
index ca1aa0c25cecbd4ff67df53b5e53961877faa55b..f5626d4a9a36ffb182efb666092845c8fb9eb97a 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "normalization_instance_common.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp b/library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp
index 9dea41e89d3ca119450905d222e95c77f95f0afe..d9029ac25e8487bdab16cc75d5f7dcec8db10b90 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f16_instance.cpp
index 38338ff9976cc28730bc36eee225529f31da949a..508ad3873b65e0cf7109adb2ffc1f7d7666149eb 100644
--- a/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "pool_fwd_instance_common.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f32_instance.cpp
index 0f4a35deefcb40a1ef4cdfd90bb5e3f95cc412f6..ada96a93a2dfe1c71e83d3f14fdf2bb4b4647da0 100644
--- a/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "pool_fwd_instance_common.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool3d_fwd_ndhwc_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool3d_fwd_ndhwc_f16_instance.cpp
index 6fcb519a9b822fd0cdcc65e14ec4ecbb9d3f269a..62bcad992ae1b8d77bfea25f673191979e1aeab8 100644
--- a/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool3d_fwd_ndhwc_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool3d_fwd_ndhwc_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "pool_fwd_instance_common.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool3d_fwd_ndhwc_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool3d_fwd_ndhwc_f32_instance.cpp
index 67ffd470858175089475e459e869bb7cc6de9b30..47896be911571933136b04e726d7273eb635254c 100644
--- a/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool3d_fwd_ndhwc_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool3d_fwd_ndhwc_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "pool_fwd_instance_common.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool2d_fwd_nhwc_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool2d_fwd_nhwc_f16_instance.cpp
index a41cd0094bd90b5fe3ac9982b1cdd34481eec740..35c8522d9f9aedb64c15338a3e8389102534262b 100644
--- a/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool2d_fwd_nhwc_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool2d_fwd_nhwc_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "pool_fwd_instance_common.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool2d_fwd_nhwc_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool2d_fwd_nhwc_f32_instance.cpp
index fa70569bab495090dd6290657727a6bd30162b81..75b7629f246c70c99bc8f021bc299142609bbd3d 100644
--- a/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool2d_fwd_nhwc_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool2d_fwd_nhwc_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "pool_fwd_instance_common.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool3d_fwd_ndhwc_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool3d_fwd_ndhwc_f16_instance.cpp
index f3367b946098874d8533785c56c77bcfdcfd900b..dbfc4acfdfbed1eb1c22a410e4d54edc915c39fe 100644
--- a/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool3d_fwd_ndhwc_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool3d_fwd_ndhwc_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "pool_fwd_instance_common.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool3d_fwd_ndhwc_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool3d_fwd_ndhwc_f32_instance.cpp
index 8477a884da7fda20301ab88f3bd3b4414f8dffeb..63b3e8df8ea3254ee7f5fc0cb4be2270a144a72f 100644
--- a/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool3d_fwd_ndhwc_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool3d_fwd_ndhwc_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "pool_fwd_instance_common.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/pool_fwd/pool_fwd_instance_common.hpp b/library/src/tensor_operation_instance/gpu/pool_fwd/pool_fwd_instance_common.hpp
index cd508b55b0cfdc639ca7db204579ce2c3e7321df..8aa707885b2b785974ecef5491cae9ac7ec391d3 100644
--- a/library/src/tensor_operation_instance/gpu/pool_fwd/pool_fwd_instance_common.hpp
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/pool_fwd_instance_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/conv2d_quantization_common.hpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/conv2d_quantization_common.hpp
index 672cdba65d31799eee77622c12da7e405446d6ad..711314985a7cd2ba708138df597e37d1e874709f 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/conv2d_quantization_common.hpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/conv2d_quantization_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perchannel_quantization_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perchannel_quantization_int8_instance.cpp
index d4b5484d8b05a9a9c7546f9eb0eb1eeea9414442..39c4f82fefd4beb2a0dc3629f0757f7f01037495 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perchannel_quantization_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perchannel_quantization_int8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_conv2d_dl_int8_instance.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perlayer_quantization_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perlayer_quantization_int8_instance.cpp
index 7db4b8d8695d655dddad18fe7b63d89c5d6b064b..92e73eb2ee45895c4277eb46f2a5ec94116fc1e2 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perlayer_quantization_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perlayer_quantization_int8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_conv2d_dl_int8_instance.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_int8_instance.hpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_int8_instance.hpp
index 7eefbe0381c19cba139399cff386a4841768e64c..bb7a570cda26fbe65e4228a1fa1d9b9fbd4d896c 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_int8_instance.hpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_int8_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_perchannel_quantization_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_perchannel_quantization_int8_instance.cpp
index c8f5f7042c92c5aaea7ab91cb27c7d2b243a2698..1d8b58fd18a49b777158c266459925a8ef67079b 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_perchannel_quantization_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_perchannel_quantization_int8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_conv2d_dl_int8_instance.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_perlayer_quantization_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_perlayer_quantization_int8_instance.cpp
index d7f7384ff78262cf21680ddc1c03238194540499..62826d0d2b3bb541cb70ed1fc7f70f6c55a431c1 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_perlayer_quantization_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_perlayer_quantization_int8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_conv2d_dl_int8_instance.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp
index 658aa8370a43aa27d9ca60bcdf303ede664fbb51..99877c32dba05ea6386bb2f94d888e7fe6d95279 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_conv2d_xdl_int8_instance.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp
index 7102e9b259cabb4b45783d915ac43a4f5a34f92b..50ccc69f458fc9a46bbe5e08510e3da1174a90cc 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_conv2d_xdl_int8_instance.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_int8_instance.hpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_int8_instance.hpp
index 90f8791aa94ca22a0be19f0792cc7c6879a6ce5f..caced6c950a7872714b76aadee100e3a524a03d8 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_int8_instance.hpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_int8_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_perchannel_quantization_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_perchannel_quantization_int8_instance.cpp
index 9d6937708550a2d7b5cf27a1a4c822366b01d6a8..526fe73463cba7ac977b597c2279488aaa9afe6b 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_perchannel_quantization_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_perchannel_quantization_int8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_conv2d_xdl_int8_instance.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_perlayer_quantization_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_perlayer_quantization_int8_instance.cpp
index d6f87335b2d3c21996be54adbffe39b36e0a3e97..d1d5a66bb949b391909426aa13ac4bc4d9e91be6 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_perlayer_quantization_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_perlayer_quantization_int8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_conv2d_xdl_int8_instance.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_instance.hpp b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_instance.hpp
index 9cad8d4c8e8c14b2fd15a802a102c4de53f5afef..0ec498a441d26c37cb5e449213f9b1048c36c9c5 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_instance.hpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gemm_quantization_common.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
index ffe1efb80b5721230bcb059412a18d1c4464f3af..4c02f8179caf2a7bf5ee6a46111f0b9d610e7667 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_quantization_dl_c_shuffle_i8_i8_i8_instance.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
index 7f24e5677c482ac881ddebd8d0aeab9d1aca0b39..2c635e96d228fb4283fe518dbf010c66fc9a4272 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_quantization_dl_c_shuffle_i8_i8_i8_instance.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
index 06e66cfe037696ff9f3198bf5308579a3e18fea8..c5a52f2cf25137fbbfac20d6451db1c67c1ea660 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_quantization_dl_c_shuffle_i8_i8_i8_instance.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
index 16635d1e93de1b80933dddbc38017b2716b56132..fe4a0b4d68b4ed659aef0e6ab23d4a443cdfde72 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_quantization_dl_c_shuffle_i8_i8_i8_instance.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_instance.hpp b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_instance.hpp
index dfb8dc29b49c8501671e3b17c00e7829c9941deb..d1b18ac4929bb114ab870a735b6b182e7a4e1343 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_instance.hpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gemm_quantization_common.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
index c153cdf9ea9e2f02bcc28b1c50f707cc683e0211..e3163c32ed1cb9704e00f5d46e82cfacb187aa60 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_instance.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
index f6cd32026f92ead80af192fb90c2ce794c26c398..f6a0163aba346bc731f3241f0426306e5eeb43f3 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_instance.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
index 45fbacc334d0974644641e5d43edde90f7aa9c26..0cfffbed70fdc95b6161c06a634bf379cc7c2651 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_instance.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
index 257633fe19a1360d44b8fa9d768918e106210909..289898257b76ab48c44e06259dd067096f8489cd 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_instance.hpp"
 
diff --git a/library/src/tensor_operation_instance/gpu/quantization/gemm/gemm_quantization_common.hpp b/library/src/tensor_operation_instance/gpu/quantization/gemm/gemm_quantization_common.hpp
index 213f42b91b6365bdc5e346c060ca33a8c2ebb71a..e7c2500fef2fac9e90639301c60592dcb8978ccd 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/gemm/gemm_quantization_common.hpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/gemm/gemm_quantization_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.cpp
index cf46059a0d2cb9f397209124f15f808bc51fd165..4771b4aa5de2ccfb5b7f9a6205f64594d37bdabd 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.cpp
index 0043b198440c2a1da45b10cd81197b493d51b077..f684dabc1fcd60bd6522a9865f18ac39dcdc3d0e 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.cpp
index 6f702ddf1ffee3a684dd66c124a3a2d7ef1f8654..86e2ca7c47cb8f2a318c745b19e0fcec058c7318 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.cpp
index d1f70dc99e87606c0a388b57a181aa12c6ff26a5..97d09ff834fc46a65b8f25182238349f70bce818 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.cpp
index a957981a55233f7f2c9e5039bfb4705cf2322850..87187f493acf8b8247e8877b369edfc106a6aaa1 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.cpp
index 550a9cd76c6db474de2b78f323a5b8feed936bd7..d3f5cd8656940f1adad49d0fd38725a5e04167ea 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.cpp
index 58cb6ee34853d2670ee89b2fde99e42cb3160819..fb4a50d5e3539c3c2db9e799ea2c824a59c90c7c 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.cpp
index 1ac5e79bc16cfa587ae05072a72c8b43de08ba2a..f74d22f7acadb2ab97a665adfb6eb079ed34cb64 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.cpp
index b1e1a06800ec06e1fa0ab388ae63ca20d29bbe0a..0982455add502da7eb5f9365f4b128673846db82 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.cpp
index 1a15b32d23fbe533322249132a8b759745d6362c..c6f1c3a6338115a42efc02311228af8b488c4a8d 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.cpp
index 119f384b4ebcc8b446d0ace7c8bd65a86528e13f..db9f52a13fe436252d57644b9d76db41170b042b 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.cpp
index 3f1bd86b8b8f688b3e341c24535d45d99a0bc941..1410f51bea100f3aa56b369d8442d3c81fc79ebe 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.cpp
index b507f0d1f90621800e73a5d23fbbfa88f1443684..707a655703503001118da5d2e28e53f24ecc135c 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.cpp
index 04d0ea2e8b86c78d0348e0e0878a6a21ccbf1004..17f45c3327c63296291aef9132461f5bb56074b2 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.cpp
index 3de561f2b615cc1ac18962eabacf583afb8c11aa..d3753d01c4e65dc2344ff4d8199bd81d674f5c96 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.cpp
index 3f45b03134a85322874019f7e7dcdd102a4d5013..83fad7826a070ef91abcc33c68b1230b410fb8f5 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.cpp
index 76851d9b726c4f75ec68e2206d7fc0d92c91e38e..c627f686363a9a12e251797b45060d56d2947d33 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.cpp
index 9cef019320b8211a9e1228e322517eb0f28e65fd..09686d98ffcacb07f5f9244c196daeec993780dc 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.cpp
index ce73ec47e3c2078e9c7629107339e1db17c4934f..44519032f31cba47f9606aee66dce5b989685931 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.cpp
index ed6091f92488263f3e3bbaa1804102f674cba288..9e14d80e21ae1956595eca87ab6ab859c42c52f6 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.cpp
index 4c8375de169b3f94eb30696162248fda78b375bb..ab217f1160481e0155cc3f0e6e62dc21f1f70fe2 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.cpp
index 0fa93ab68879f310bac2698a241a23017e4b4064..3f611da7e0da0d7f61b3af38f18ac29b8dc5b2a0 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.cpp
index 821eec1751a8b7c3e11884e53749da844d2a3fec..8695db4004081ee0197eebd623cf539286236d3e 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.cpp
index 0305b4945f14f91398492aa53cf0795043a19c6c..8d15fda7869d85daa457ff5cfb69bfcbb719cf9b 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.cpp
index 1bda0bcc71ad37daab5a1e36a188fffd5c10fb7a..c161858b1f161694485d5531485995490949112d 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.cpp
index 7f8018a04eb1b7cb9fed18b18b61c6dc1dd586f3..e6b87fdead361de49fbb28e5a709b2d5fea812b2 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.cpp
index 887a89cc2babe127e32a8f6c7b6b42b2645ed614..c2e4acc8d9338824df7811d2434474ceddf5ec6d 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.cpp
index 0cc810363d4ca4191add9898f645327d27520203..e5a5e07e176b68ed8eca614a52a432efff1ca9d7 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.cpp
index 4c825a9f1baa5c84ab746777c0f4b7839b9b6c86..22c62033454a9525f87fc7b19f8ab0f6186a3cda 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.cpp
index bf26913fd31c54d2d3d5e4459ed30179be3f17e0..1890fb78cd7c7fd387f6f9d3290009b54ca3ba96 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.cpp
index 629299c7b112268ab166bacc0350aa53646393ad..dc5caf297bc971950c2424496ec1c2a2fe2c2a91 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.cpp
index 9a086344985249eec8ca08a7db6a3d6a6859a033..02bb0c3b1d9d5e8db84a0338718c66a4cc7e66c9 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.cpp
index 6dc925bd6f3f790de62b56598f6d813ee4920f7b..0b0a40303805ccfb4d9ea93e4b1295cc7b2edb6e 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.cpp
index 470d68d37246c5f5b549068df36c1b29f5b42e74..fcf95aab7a37aa9c79ef6e22717afebb24052406 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.cpp
index 39303ab58002eb1a538a51a621822a1f715c5a9a..5151d32a58e9c321985faf6f8cb7508e5cc4ad5f 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.cpp
index a5481784ed675cc36cf6ab8d0ec64d0961934030..5463cc2b4c2e5762f08db5593bfc51e204a41713 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.cpp
index aa6e6d3cc976981103d8cb226fe100c5161bee59..a15e99fe0f57a2aab28811660dd910645cef482a 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.cpp
index b1ea551eaa775b867114776e171687ff1ee8ea27..ee7a4dd4cef85adf56094750c09558b3dea1ec5a 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.cpp
index 2ba83132d8f42cfb4695c93a602112cad34acecc..58f7295a835ac90d6bc13125f2dc290544497bc0 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.cpp
index b9018e7c6af6e93f00825b0e72b0d3338a788960..43f2370a2c37cbe181ac99b81c51313f1b89d3b3 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.cpp
index 1abc6de5526d5bd3bb89d95c7e58fb54bc76ef20..eaf84a6de1b7ba55446a2994ff5cdf9eb939e625 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.cpp
index f3a017aeb422e08573c9d1fbcb58f763baa14b09..76d63c9a96372e9c3cd97de281d4e841d92ee71a 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.cpp
index 329617bb432d742eab9b3021ebdfc3f5c0812d3d..eb1d389c398fe609b60d16c9202ef2f117cd43ec 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.cpp
index 1e4d43debd2992fee6add61f9ea5f339f7cc4db9..61225df05dd1d0e444c76fdf3c0a318b80d17380 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.cpp
index f9f79675f59711b6dee3aac4f8a3cc1d0a4eaad8..aef231e17ac12f2b6380213ce810831da52c7ad1 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.cpp
index d3e7268c5a53106c85ec3dc19400191874def55d..6d2939e3e0e3ab7e196c390b4758bd0ab0345ba2 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.cpp
index a41a12386d021c952d1c014229b1de233290539c..ff0ddd773b75603ca245bd5ca3821d5443db00d9 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.cpp
index 6da1acc4e54de8fc86fd60ce449bc759f8cfce98..1367022065226226457941e729b71c0d1b66b387 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.cpp
index f14b8a40312d265e231ded1d4dd4b5bc47fcc8ea..f0f93019189aa751fbc39dcafed73cc42fd0f85c 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.cpp
index 5a9f08167c9a027058edc7a513781e26804513c7..db9f1566008d81c0c5d5e09f2b73c13d7c77fdad 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.cpp
index d3aff06753a1c689fd6362fafdb4b4ca9a3d940f..e5644faf291740876ee4daeaf66fc0e43e20b0ce 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.cpp
index 55f7537d8334586c4aa6ff52dc8e20cb63650efd..350259f5f0c1ebc258bd202f182e54d600b619cd 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.cpp
index 70f91168d8bc3025758f64186a1a379b89ce3b8e..89cae5527342997a0db882b1c438a8dc117e972f 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.cpp
index 47f5e67fe080d8c6ef3bb29b25ac4ed74a80d4ab..b0f82ab45350b5bb71e15f609dc0e01264df8728 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.cpp
index eae489ff04d08ccb453ce1088f7140dafaecbd89..fedd0f890bd8987a1bf1b9b7f29d99c5293676f2 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.cpp
index 9fb267a201724bb925884efe625160817765257f..64ec1ce1574a3608be4db2a88b02d26c1f1c0835 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.cpp
index fecb2691f8679cf97f489ffb09619c8bd092c16f..156f182f55a1e515e337b7a3cd20f124f618fcec 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.cpp
index 232d2b858bcf651001fb4195b981d1e0c0f6c9ec..22b278790d970dbb4cb68741f653606cbd922abb 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.cpp
index 07d45c4ca994aae5384732901783cb69c16c2c4c..f3070d297587f8510ee21b553d7299a9cd30d745 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.cpp
index 596a062f3a227b71dcb1ed15122f31b0017e584b..e17476ba75111134f67457df836415323c93a2ce 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.cpp
index 7270cefe8b4a8ae83c374893a5d3bf57070afd86..2387725b9f74b543e1db6ca3de372838a9badaea 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.cpp
index d0f4ef3dff503c92a02713b4832776f6a182ea2a..8be94118c4864ec38087204d87f1fbb83b85ec43 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.cpp
index 9c6bce92f45de792a722a32630d023ed6ea177a1..9ff01615c9c83a51ac995cd5a7a6eb840c6fafa5 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.cpp
index 5faf8d8283147fc9e56f74f375761a8f6b5022df..8a99c7c95879554615a61e46a998ef213c9a5cdb 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.cpp
index 8f3c72451a3e05bf75398c6586fa019b2572f698..78909c8dbf5f35e0241a1802c4e589085b4e7ead 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.cpp
index 8d7794f42e3b4d52d510bd14bef0186e3c357376..7f094c28b756e020a47b1a78454f839ac9fb540e 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.cpp
index 4a32543a136d435d1a1863aebb238904d0e34c1c..4dfb477c1e74f4a3863df56fddbe815ea5d8a321 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.cpp
index 26d571c84e5295fdcd6e5dbfb7827889c75efc31..e4928336069cc19a6d161d035407ed8a63deefaf 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.cpp
index ae56a2a91999c686cb56d358311bcd8bc320d376..de8dce357bddc3470d8d17142b11f0cac7c6a507 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.cpp
index aae3233c9f0be15daa3c9004881f1a7afc85c89c..0225e75cb09a770c94597dd0caffee13f05b1b27 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.cpp
index 94d5d3fa2f45f31647779ea451ba1ac4b833c9c8..6d04d2128f651c45b9abd4c2265c89c60495d45d 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.cpp
index dad190a63484a50121c71096547264659115219c..046643abe1a6cb951204d3e624f89f678be71176 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.cpp
index b7ca6998f5e47d732b5998c52e85d5b9dcef9fd4..a61302c001f7ecbcad117f5f8df38cc9ec9c3c14 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.cpp
index 22c40187ea9be8f1163693908c60a39a50f13d45..7619e12a8e6bfe37e09abccb2f2672b9458cc934 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp
index 14d2764529c826587b13661e7a7cab11e8d6ea99..a86da7cc795ac3ead069dbcc0f57862eafb89913 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp
index fa334b997c27af20f05c42335bde28f5f32f819a..938fb033a11d4a5d1eb34829ffee265366128069 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp
index 1c9d37d8483b7f249a0b1c05551c70acdb46db03..3d56593811f99335d0c017b632d935b7a7eab829 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp
index 5fbdab5055edb0f124bbc6541fcc705a12b71d63..d701b4174e05818bf0cacd11cde6b5377ff4a4dc 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp
index 7dd8640b187a2b793d5ad6931ab329ae024ca432..2085aafc56f8f518f9cd7c8b96c6bc81e2f86b20 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp
index b32fe6838f83440e98f524c59af9a383760dfe7d..ebe4329f9aa13c165e9e7b1ee90fd0755a375f76 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp
index c05048ec567bd6471fca14c41d062eaf8c0c86a5..b8fd5a1e5fc6e1368e6c647562e6d3a0d98e6b41 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp
index 6a235708bd426c7270f16246fa058eae45cdd056..112f1940d38c3fa77f414c16f98866183bbff63b 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp
index e5bec5e2639d234b3944f679e99e988f20b2c383..ab8a69eec21a81e4d2f3dec6a92df73e96cdfbe9 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp
index 57d3f184a6635f4febe632054c9e39798ef0cc82..5382fec90d2604bb7e091fd4fcc4db9bce4dace9 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp
index fae3a4dd6662d8920b1b88c8489e9a6f00f28ed0..a1a143afa18f818a63e2dc50449b6fc4a6a2b576 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp
index b6fb70e8e2a6aa097f2a76865f690812dffaffdd..992e0c1ec185f858c3a15db0b7746741ea6734ec 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp
index 33c7b6f35f351f755a50f48f92d391fd47f40163..2be1f45bb172f3b9781f51a1097beb69092f7230 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp
index c22aa574b1f984d5a4f8c3276c9fb11b74002680..a1da73aa8b93f58302fd1a7965d0e89f40669b46 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp
index 55f3d2bd207a0a160f00415592f0106f41be5cee..b5c3b576a66ad781305a703f4b5dde3d4a275d62 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp
index fb0bcf5ee8a2dd6fd5ed4f77c0d8f4be5c5fcacd..22a0404c055c71dd750aa27fda5d1e616232f93d 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp
index 608cfcf8380be29e24d5e4b3aadd7573285a5224..81a2ff80ca63be8a6cdc2093c349461d0202237b 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp
index 15552dbae5d501c506f4fd14b439a80664a95b66..3e2cf8d06275d43d1cfe58f3cb4c11804783cbec 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp
index 67674028860b471888bc87a59e1b7ae751c64eec..c8b038d5023df6c6f596055735232e7655b2bff4 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
index 4b33da93c2e1330485a55f8ede6654ab2a03fbb1..08995d99ec01ebb4728526a2f354bdc613f00f53 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
index fe3b823e889267f69001e15066bbef201afa9813..652601ee7c643ca145917f1af7ca3b203df60445 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
index 8ecdf87d9fec061094f83c3beba01497dcb8e5b8..86caac1b62a467049fce078560eafdf31fe676fd 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
index 3563135204085ff8e44d1378f9c4d9ffd99e7b68..c46ae1a4ef29fbc329bbaddf7260712d257e25db 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp
index aa21a0bf8a863f433665e01e8229bd179948e1db..394814ff53d85a38049b7dcd4ef4ed65ce31090f 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 
diff --git a/library/src/utility/convolution_parameter.cpp b/library/src/utility/convolution_parameter.cpp
index c8712d20939d5f9f1b7b981645e77420c7a5c607..57cedd60199e1c3aa291d44c404d56e89f444e9a 100644
--- a/library/src/utility/convolution_parameter.cpp
+++ b/library/src/utility/convolution_parameter.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/host_utility/io.hpp"
 
diff --git a/library/src/utility/device_memory.cpp b/library/src/utility/device_memory.cpp
index 90f943313b0961bb96d6394e855809c62d050559..11166783e8e542f565bfc1171ae40d879a005268 100644
--- a/library/src/utility/device_memory.cpp
+++ b/library/src/utility/device_memory.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/host_utility/hip_check_error.hpp"
 
diff --git a/library/src/utility/host_tensor.cpp b/library/src/utility/host_tensor.cpp
index e34fbc8f345b8ca6dd5b34988a1839c6c59e61bd..7211552641195d68e42ff606ddc1b2c7aab7ba84 100644
--- a/library/src/utility/host_tensor.cpp
+++ b/library/src/utility/host_tensor.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cassert>
 
diff --git a/profiler/include/profiler/data_type_enum.hpp b/profiler/include/profiler/data_type_enum.hpp
index afcd6fea224f3e06f29309349bcc6ad55caa7d4f..c046c7fabb30e831fe41b31f936febef0f5bf148 100644
--- a/profiler/include/profiler/data_type_enum.hpp
+++ b/profiler/include/profiler/data_type_enum.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_batched_gemm_add_relu_gemm_add_impl.hpp b/profiler/include/profiler/profile_batched_gemm_add_relu_gemm_add_impl.hpp
index b16254279ce433d06f71ca350b53841a294a3de6..22dab31100d469938d39de0db89bf5b2257383c8 100644
--- a/profiler/include/profiler/profile_batched_gemm_add_relu_gemm_add_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_add_relu_gemm_add_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp b/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp
index 799dccc0ff3a8ef253a6153dbdc64d8f7a5c26cb..5bee67c1ce9dc619f2b5208bf350a8fc01dceb06 100644
--- a/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp b/profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp
index 1583c6db21e0959bd155946b2e81363091b4f43f..f3d2c5561756d43a4af44bbde12cf93c26f5602c 100644
--- a/profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_batched_gemm_impl.hpp b/profiler/include/profiler/profile_batched_gemm_impl.hpp
index c07d7c0555490559bbf0357a5d2657bcbe5261fd..cdc94aa9a6fb710f66bf12792d7e1a6198a947b8 100644
--- a/profiler/include/profiler/profile_batched_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_batched_gemm_reduce_impl.hpp b/profiler/include/profiler/profile_batched_gemm_reduce_impl.hpp
index 45b7b77388b63c8abba43aabc50a9c0dd86eeede..901fa338d4d7aab26a70da548e2b000c98840d26 100644
--- a/profiler/include/profiler/profile_batched_gemm_reduce_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_reduce_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp
index f5ec235141a7238157c0d2fd34f1f9c849decfc4..15a21206c5945d700069cd7a8f8a9fdff7c4e9bf 100644
--- a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp
index 91c28f25fc5953460a00d87961b97d1587aad1ce..f2fcb0b133861b87a92e3434c8c3e22b0ef1a51c 100644
--- a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_batchnorm_backward_impl.hpp b/profiler/include/profiler/profile_batchnorm_backward_impl.hpp
index 79d8862081fd78b0706110219dc5af5016cd9122..3343b5e66e4b8666edf778abf4393dd75cbd0937 100644
--- a/profiler/include/profiler/profile_batchnorm_backward_impl.hpp
+++ b/profiler/include/profiler/profile_batchnorm_backward_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_batchnorm_forward_impl.hpp b/profiler/include/profiler/profile_batchnorm_forward_impl.hpp
index 82fe75bf015beba4d56898f67aa2100b2d3c6ff4..2f9538b16c326a740b1e3bd0c3c150eaaadb9ebf 100644
--- a/profiler/include/profiler/profile_batchnorm_forward_impl.hpp
+++ b/profiler/include/profiler/profile_batchnorm_forward_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_batchnorm_infer_impl.hpp b/profiler/include/profiler/profile_batchnorm_infer_impl.hpp
index ca653393452c8b4fb880fee5041a72a497c9566b..1b31a2aabf5f42a115d83270f91d87f0f74d8a4d 100644
--- a/profiler/include/profiler/profile_batchnorm_infer_impl.hpp
+++ b/profiler/include/profiler/profile_batchnorm_infer_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_conv_bwd_data_impl.hpp b/profiler/include/profiler/profile_conv_bwd_data_impl.hpp
index 86d394daf90c79d6f9faf34254901dc8e2bd43e2..52152a90fe5eab2fd80ab156d9080c75b65d9eb0 100644
--- a/profiler/include/profiler/profile_conv_bwd_data_impl.hpp
+++ b/profiler/include/profiler/profile_conv_bwd_data_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_conv_fwd_bias_relu_add_impl.hpp b/profiler/include/profiler/profile_conv_fwd_bias_relu_add_impl.hpp
index 1aebef8bb2b2b34a640797ab9e5985d5cb09fbea..436fbdbd759195add8fba952235392d31b9613a0 100644
--- a/profiler/include/profiler/profile_conv_fwd_bias_relu_add_impl.hpp
+++ b/profiler/include/profiler/profile_conv_fwd_bias_relu_add_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_conv_fwd_bias_relu_impl.hpp b/profiler/include/profiler/profile_conv_fwd_bias_relu_impl.hpp
index 2bac144334eefc2d39ff63e17637b50d977921ec..808c1a1c901293816bf3fb153c8f8a7d2faaaf15 100644
--- a/profiler/include/profiler/profile_conv_fwd_bias_relu_impl.hpp
+++ b/profiler/include/profiler/profile_conv_fwd_bias_relu_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_conv_fwd_impl.hpp b/profiler/include/profiler/profile_conv_fwd_impl.hpp
index 1f3ba8f00714449e51f180879f3b38d01221cefe..bc2eb257970d1b91ab7fbf1a592d18df0ef3d159 100644
--- a/profiler/include/profiler/profile_conv_fwd_impl.hpp
+++ b/profiler/include/profiler/profile_conv_fwd_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_elementwise_layernorm_impl.hpp b/profiler/include/profiler/profile_elementwise_layernorm_impl.hpp
index 7707e16b089ef3058dea988188f80e453fcd4e7d..1fd9c811095a823ce2746ddba63431ccdd77f20b 100644
--- a/profiler/include/profiler/profile_elementwise_layernorm_impl.hpp
+++ b/profiler/include/profiler/profile_elementwise_layernorm_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_gemm_add_add_fastgelu_impl.hpp b/profiler/include/profiler/profile_gemm_add_add_fastgelu_impl.hpp
index 3cc2ea3b92624da3db43696c181248e55a5f0256..81b8d8ddbf551f23de68beabd8900b4251c97c2a 100644
--- a/profiler/include/profiler/profile_gemm_add_add_fastgelu_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_add_add_fastgelu_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_gemm_add_fastgelu_impl.hpp b/profiler/include/profiler/profile_gemm_add_fastgelu_impl.hpp
index d53a6589e0f505ed3beb6a7792c9045c49ee0c68..6f6d881c1e4ef5214fb13d3b7523c2b4c979fd49 100644
--- a/profiler/include/profiler/profile_gemm_add_fastgelu_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_add_fastgelu_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_gemm_add_multiply_impl.hpp b/profiler/include/profiler/profile_gemm_add_multiply_impl.hpp
index 40093e774f0bcd770e3bc1434566d5d36c069f89..25871dfb2ec814fb5d0cda406a31ad8602b9dc16 100644
--- a/profiler/include/profiler/profile_gemm_add_multiply_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_add_multiply_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_gemm_add_relu_add_layernorm_impl.hpp b/profiler/include/profiler/profile_gemm_add_relu_add_layernorm_impl.hpp
index e1c90f0f5255de2386b252acba5684910edeebf6..4c3d0a045054841b3f88a9c85ba9eaa8fd2538b7 100644
--- a/profiler/include/profiler/profile_gemm_add_relu_add_layernorm_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_add_relu_add_layernorm_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_gemm_bias_add_reduce_impl.hpp b/profiler/include/profiler/profile_gemm_bias_add_reduce_impl.hpp
index b4ec78cdf37659b6e838b10a8c235fdd6a0c9c64..c0ffea8a326c8bbc95a9b2d2e46da651089d807a 100644
--- a/profiler/include/profiler/profile_gemm_bias_add_reduce_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_bias_add_reduce_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_gemm_bilinear_impl.hpp b/profiler/include/profiler/profile_gemm_bilinear_impl.hpp
index 31bae281c45b2e4e9bbdd8068fd3b084270c45ad..b540e938b5b5ab5798a6a18f86e452caf819c75e 100644
--- a/profiler/include/profiler/profile_gemm_bilinear_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_bilinear_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_gemm_fastgelu_impl.hpp b/profiler/include/profiler/profile_gemm_fastgelu_impl.hpp
index f9a544c044f4165fbee8cab9ba10a10616f27e22..3893f8cdc7cfbf957fe28cd2b019092b59c88029 100644
--- a/profiler/include/profiler/profile_gemm_fastgelu_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_fastgelu_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_gemm_impl.hpp b/profiler/include/profiler/profile_gemm_impl.hpp
index 9b164104b505d3d3912206b5fa1a6ee6aeb2ef48..eaab5dbcc2c8d3a6d5e1ea5d59c1c292f696d16d 100644
--- a/profiler/include/profiler/profile_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_gemm_reduce_impl.hpp b/profiler/include/profiler/profile_gemm_reduce_impl.hpp
index 370121a3ccff382539ee34994a05b4b035db5512..ff801e8afd3731e62bd6edb545cce87bbddeae74 100644
--- a/profiler/include/profiler/profile_gemm_reduce_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_reduce_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_gemm_splitk_impl.hpp b/profiler/include/profiler/profile_gemm_splitk_impl.hpp
index 4cc62509d7400fc30b4b2853a1371e9b52adb503..6ffa31678f837a670f750dff8c6288d39cafb20a 100644
--- a/profiler/include/profiler/profile_gemm_splitk_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_splitk_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -246,9 +246,9 @@ bool profile_gemm_splitk_impl(int do_verification,
     }
 
     std::cout << " M = " << M << " N = " << N << " K = " << K << " StrideA = " << StrideA
-              << " StrideB = " << StrideB << " StrideC = " << StrideC << " : " << best_ave_time
-              << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec << " GB/s, "
-              << best_op_name << std::endl;
+              << " StrideB = " << StrideB << " StrideC = " << StrideC << " KBatch = " << KBatch
+              << " : " << best_ave_time << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec
+              << " GB/s, " << best_op_name << std::endl;
 
     return pass;
 }
diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp
index 4f9aa98376c9e56058f04938f8171ac93c230e3e..dc6739773ff5a282f2eeaca06b7979f0b07e8482 100644
--- a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp
index b201a2ed331cbf4a0e3951d8d69027f018af09f6..9fadfe96992a8d0374e784d1a95c4ac7989562bd 100644
--- a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_grouped_gemm_fastgelu_impl.hpp b/profiler/include/profiler/profile_grouped_gemm_fastgelu_impl.hpp
index 87e6ae44c76e56919fb17ad232586ae6294f1b68..f05b13b7495dfefee263f51d5404dda3dbc598dc 100644
--- a/profiler/include/profiler/profile_grouped_gemm_fastgelu_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_gemm_fastgelu_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_grouped_gemm_impl.hpp b/profiler/include/profiler/profile_grouped_gemm_impl.hpp
index 23dca244dcdd80866b189c9d2474c596e9b15591..09a651d77c994e69635cb4891fa0cd55ae52791b 100644
--- a/profiler/include/profiler/profile_grouped_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_gemm_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -19,6 +19,7 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"
+#include "ck/library/utility/fill.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 
 namespace ck {
@@ -43,7 +44,6 @@ bool profile_grouped_gemm_impl(int do_verification,
                                const std::vector<int>& StrideCs,
                                int kbatch = 1)
 {
-
     bool pass = true;
 
     auto f_host_tensor_descriptor =
@@ -81,11 +81,11 @@ bool profile_grouped_gemm_impl(int do_verification,
 
         c_m_n_device_results.push_back(
             Tensor<CDataType>(f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{})));
-
+#if DEBUG_LOG
         std::cout << "group: " << i << " a_m_k[" << i << "]:" << a_m_k[i].mDesc << ", b_k_n[" << i
                   << "]:" << b_k_n[i].mDesc << ", c_m_n_device_results[" << i
                   << "]:" << c_m_n_device_results[i].mDesc << std::endl;
-
+#endif // DEBUG_LOG
         std::size_t num_thread = 1;
         switch(init_method)
         {
@@ -191,65 +191,71 @@ bool profile_grouped_gemm_impl(int do_verification,
         DeviceMem gemm_desc_workspace(gemm_ptr->GetWorkSpaceSize(argument_ptr.get()));
 
         gemm_ptr->SetWorkSpacePointer(argument_ptr.get(), gemm_desc_workspace.GetDeviceBuffer());
+        std::string gemm_name = gemm_ptr->GetTypeString();
 
-        if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
+        if(kbatch > 1)
         {
-            std::string gemm_name = gemm_ptr->GetTypeString();
-
-            if(kbatch > 1)
+            using DeviceOpSplitK =
+                ck::tensor_operation::device::DeviceGroupedGemmSplitK<ALayout,
+                                                                      BLayout,
+                                                                      ck::Tuple<>,
+                                                                      CLayout,
+                                                                      ADataType,
+                                                                      BDataType,
+                                                                      ck::Tuple<>,
+                                                                      CDataType,
+                                                                      AElementOp,
+                                                                      BElementOp,
+                                                                      CElementOp>;
+
+            if(dynamic_cast<DeviceOpSplitK*>(gemm_ptr.get()) != nullptr)
             {
-                using DeviceOpSplitK =
-                    ck::tensor_operation::device::DeviceGroupedGemmSplitK<ALayout,
-                                                                          BLayout,
-                                                                          ck::Tuple<>,
-                                                                          CLayout,
-                                                                          ADataType,
-                                                                          BDataType,
-                                                                          ck::Tuple<>,
-                                                                          CDataType,
-                                                                          AElementOp,
-                                                                          BElementOp,
-                                                                          CElementOp>;
-
-                if(dynamic_cast<DeviceOpSplitK*>(gemm_ptr.get()) != nullptr)
-                {
-                    dynamic_cast<DeviceOpSplitK*>(gemm_ptr.get())
-                        ->SetKBatchSize(argument_ptr.get(), kbatch);
-                }
+                dynamic_cast<DeviceOpSplitK*>(gemm_ptr.get())
+                    ->SetKBatchSize(argument_ptr.get(), kbatch);
             }
+        }
+
+        if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
 
             float ave_time =
                 invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
 
-            std::size_t flop = 0, num_btype = 0;
-            for(std::size_t i = 0; i < gemm_descs.size(); i++)
+            if(time_kernel)
             {
-                flop += std::size_t(2) * Ms[i] * Ns[i] * Ks[i];
+                std::size_t flop = 0, num_btype = 0;
+                for(std::size_t i = 0; i < gemm_descs.size(); i++)
+                {
+                    flop += std::size_t(2) * Ms[i] * Ns[i] * Ks[i];
 
-                num_btype += sizeof(ADataType) * Ms[i] * Ks[i] + sizeof(BDataType) * Ks[i] * Ns[i] +
-                             sizeof(CDataType) * Ms[i] * Ns[i];
-            }
+                    num_btype += sizeof(ADataType) * Ms[i] * Ks[i] +
+                                 sizeof(BDataType) * Ks[i] * Ns[i] +
+                                 sizeof(CDataType) * Ms[i] * Ns[i];
+                }
 
-            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+                float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
 
-            float gb_per_sec = num_btype / 1.E6 / ave_time;
-            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
-                      << gb_per_sec << " GB/s, " << gemm_name << std::endl;
+                float gb_per_sec = num_btype / 1.E6 / ave_time;
+                std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops
+                          << " TFlops, " << gb_per_sec << " GB/s, " << gemm_name << std::endl;
 
-            if(tflops > best_tflops)
-            {
-                best_gemm_name  = gemm_name;
-                best_tflops     = tflops;
-                best_ave_time   = ave_time;
-                best_gb_per_sec = gb_per_sec;
+                if(tflops > best_tflops)
+                {
+                    best_gemm_name  = gemm_name;
+                    best_tflops     = tflops;
+                    best_ave_time   = ave_time;
+                    best_gb_per_sec = gb_per_sec;
+                }
             }
 
             if(do_verification)
             {
+                bool instance_pass = true;
                 for(std::size_t i = 0; i < gemm_descs.size(); i++)
                 {
 
                     c_device_buf[i]->FromDevice(c_m_n_device_results[i].mData.data());
+                    c_device_buf[i]->SetZero();
 
                     Tensor<CDataType> c_m_n_host_result(
                         f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{}));
@@ -274,7 +280,20 @@ bool profile_grouped_gemm_impl(int do_verification,
                                                               c_element_op);
 
                     ref_invoker.Run(ref_argument);
-                    pass = pass && ck::utils::check_err(c_m_n_device_results[i], c_m_n_host_result);
+                    if(std::is_same_v<CDataType, ck::half_t> && kbatch > 1)
+                    {
+                        instance_pass =
+                            instance_pass && ck::utils::check_err(c_m_n_device_results[i],
+                                                                  c_m_n_host_result,
+                                                                  "Error: Incorrect results!",
+                                                                  0.06);
+                    }
+                    else
+                    {
+                        instance_pass =
+                            instance_pass &&
+                            ck::utils::check_err(c_m_n_device_results[i], c_m_n_host_result);
+                    }
 
                     if(do_log)
                     {
@@ -289,16 +308,25 @@ bool profile_grouped_gemm_impl(int do_verification,
                             << std::endl;
                     }
                 }
+
+                std::cout << "Instance: " << gemm_name << " verification "
+                          << (instance_pass ? "SUCCEED" : "FAILED") << std::endl;
+
+                pass = pass && instance_pass;
             }
         }
         else
         {
-            std::cout << "does not support this GEMM problem" << std::endl;
+            std::cout << "Instance: " << gemm_name << ", does not support this GEMM problem"
+                      << std::endl;
         }
     }
 
-    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
-              << best_gb_per_sec << " GB/s, " << best_gemm_name << std::endl;
+    if(time_kernel)
+    {
+        std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
+                  << best_gb_per_sec << " GB/s, " << best_gemm_name << std::endl;
+    }
 
     return pass;
 }
diff --git a/profiler/include/profiler/profile_groupnorm_impl.hpp b/profiler/include/profiler/profile_groupnorm_impl.hpp
index 73343f6bec22db95ccaba37cb5c568018f630944..ebefe3dad42a91549b70ea26e171c40b78561eeb 100644
--- a/profiler/include/profiler/profile_groupnorm_impl.hpp
+++ b/profiler/include/profiler/profile_groupnorm_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_layernorm_impl.hpp b/profiler/include/profiler/profile_layernorm_impl.hpp
index 7dd90d079775ffe2da37d4e63e8391e1ee3709d0..2d87c8c8fe98bb32c84d3ffb851e5bd9a4f51525 100644
--- a/profiler/include/profiler/profile_layernorm_impl.hpp
+++ b/profiler/include/profiler/profile_layernorm_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_pool2d_fwd_impl.hpp b/profiler/include/profiler/profile_pool2d_fwd_impl.hpp
index c313a00bebf0a35d82262f5c7b1e24f5f2c7c6ce..0c888db1f47a6511de4822c1b27a23b2a2d6e814 100644
--- a/profiler/include/profiler/profile_pool2d_fwd_impl.hpp
+++ b/profiler/include/profiler/profile_pool2d_fwd_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_pool3d_fwd_impl.hpp b/profiler/include/profiler/profile_pool3d_fwd_impl.hpp
index c9e4c193f89221d99b1119bcf5431a60d899dd23..41b57fd85332b6007e3c4d44ef11574ddf044498 100644
--- a/profiler/include/profiler/profile_pool3d_fwd_impl.hpp
+++ b/profiler/include/profiler/profile_pool3d_fwd_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_reduce_impl.hpp b/profiler/include/profiler/profile_reduce_impl.hpp
index e6182002999b8c9b0838dab70d387d92d59e6cdc..b54aa65aef727b048458e0b63f638bad143e4e28 100644
--- a/profiler/include/profiler/profile_reduce_impl.hpp
+++ b/profiler/include/profiler/profile_reduce_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/include/profiler/profile_softmax_impl.hpp b/profiler/include/profiler/profile_softmax_impl.hpp
index 96816f53bbb7da01435569cb7aa6c8e2ea099586..65b4be2a609b7fef2920756734abfa9bc79f08a1 100644
--- a/profiler/include/profiler/profile_softmax_impl.hpp
+++ b/profiler/include/profiler/profile_softmax_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/profiler/src/profile_avg_pool2d_fwd.cpp b/profiler/src/profile_avg_pool2d_fwd.cpp
index b92288096d83eda37c75efe1cdaceb1603ef6cb7..c67897c04421085b69cb03e3ce97faf259d4f80b 100644
--- a/profiler/src/profile_avg_pool2d_fwd.cpp
+++ b/profiler/src/profile_avg_pool2d_fwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <vector>
diff --git a/profiler/src/profile_batched_gemm.cpp b/profiler/src/profile_batched_gemm.cpp
index 907a373794f2d8a59a066b94a591c4890c5c4038..dc83e25b48a13fe27a72b2b61ee59e18ed857ee6 100644
--- a/profiler/src/profile_batched_gemm.cpp
+++ b/profiler/src/profile_batched_gemm.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdint>
 #include <iostream>
diff --git a/profiler/src/profile_batched_gemm_add_relu_gemm_add.cpp b/profiler/src/profile_batched_gemm_add_relu_gemm_add.cpp
index f440a3094eb16b4eef1c5dee46cb08f6ab25d933..3d29c4b84aa5abfafbb220c46205413adb534770 100644
--- a/profiler/src/profile_batched_gemm_add_relu_gemm_add.cpp
+++ b/profiler/src/profile_batched_gemm_add_relu_gemm_add.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/profiler/src/profile_batched_gemm_gemm.cpp b/profiler/src/profile_batched_gemm_gemm.cpp
index 6015c93be35268c1c87c129d0cc7d2c9bdf0584f..9a99874d1c7041f692133a6ef2e792bdf0dd8105 100644
--- a/profiler/src/profile_batched_gemm_gemm.cpp
+++ b/profiler/src/profile_batched_gemm_gemm.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/profiler/src/profile_batched_gemm_reduce.cpp b/profiler/src/profile_batched_gemm_reduce.cpp
index 6b1dfc01427a1cd34eefc89584bb6f3ce8bf206f..9620d63cafbeedfb5398b553b5172418923bb8e0 100644
--- a/profiler/src/profile_batched_gemm_reduce.cpp
+++ b/profiler/src/profile_batched_gemm_reduce.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/profiler/src/profile_batchnorm_bwd.cpp b/profiler/src/profile_batchnorm_bwd.cpp
index 44ce7350ff06eaad3eff7d93f60c022d1a4c2b60..1738d53dbe7c6d2581e15df8bae58f7c48733999 100644
--- a/profiler/src/profile_batchnorm_bwd.cpp
+++ b/profiler/src/profile_batchnorm_bwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <vector>
diff --git a/profiler/src/profile_batchnorm_fwd.cpp b/profiler/src/profile_batchnorm_fwd.cpp
index 902a1fc423f98880b43e1551f3305854d1a0b9b8..2b3e4eea41b5775d961a44e7b16a4ea79c67ffc9 100644
--- a/profiler/src/profile_batchnorm_fwd.cpp
+++ b/profiler/src/profile_batchnorm_fwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <vector>
diff --git a/profiler/src/profile_batchnorm_infer.cpp b/profiler/src/profile_batchnorm_infer.cpp
index 92c16859c1aa59ef814fa87166d3d27c9b43a3a2..f1c19bc36e8aded070b3ba843d9fa7007f37a55e 100644
--- a/profiler/src/profile_batchnorm_infer.cpp
+++ b/profiler/src/profile_batchnorm_infer.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <vector>
diff --git a/profiler/src/profile_conv_bwd_data.cpp b/profiler/src/profile_conv_bwd_data.cpp
index 9241ead738ef8a8a67d8682d743f6c9bd0faf640..465abacc4a024f7a8ed8fc0ac5feb0c0e6b13fc0 100644
--- a/profiler/src/profile_conv_bwd_data.cpp
+++ b/profiler/src/profile_conv_bwd_data.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/profiler/src/profile_conv_fwd.cpp b/profiler/src/profile_conv_fwd.cpp
index b57ee7fd94261f4437d1b62f0b6450ea74522db9..701999d8a9fcd3718fc8d62e9b77f4f49159d474 100644
--- a/profiler/src/profile_conv_fwd.cpp
+++ b/profiler/src/profile_conv_fwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/profiler/src/profile_conv_fwd_bias_relu.cpp b/profiler/src/profile_conv_fwd_bias_relu.cpp
index b44007cde4742efcec188f0b620e2cf83295f5b6..31055ec1d1530ff5fe9c89e89957fdd6e83f7efc 100644
--- a/profiler/src/profile_conv_fwd_bias_relu.cpp
+++ b/profiler/src/profile_conv_fwd_bias_relu.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/profiler/src/profile_conv_fwd_bias_relu_add.cpp b/profiler/src/profile_conv_fwd_bias_relu_add.cpp
index 408dd02f78dd6241b5f0a67ae7aac4420c891943..8c2439a0c74be78b2d6ffcfdcce033c9ec0c7df5 100644
--- a/profiler/src/profile_conv_fwd_bias_relu_add.cpp
+++ b/profiler/src/profile_conv_fwd_bias_relu_add.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/profiler/src/profile_gemm.cpp b/profiler/src/profile_gemm.cpp
index 61bae6ae70ebcf965618a100c3f553de73182118..b3587ea98d8555f86344621f3c8d48967b98ef46 100644
--- a/profiler/src/profile_gemm.cpp
+++ b/profiler/src/profile_gemm.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/profiler/src/profile_gemm_add_add_fastgelu.cpp b/profiler/src/profile_gemm_add_add_fastgelu.cpp
index c3c0fb7b67daf131ab53194bbf28a12aeedf62aa..8af3768a48aa9905700b55ab73e83777f20f8223 100644
--- a/profiler/src/profile_gemm_add_add_fastgelu.cpp
+++ b/profiler/src/profile_gemm_add_add_fastgelu.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/profiler/src/profile_gemm_add_fastgelu.cpp b/profiler/src/profile_gemm_add_fastgelu.cpp
index 380b25a614c58d42209135218cca331de3bebe33..a09bb8340d3268ed642b1cd15f318f9458867c99 100644
--- a/profiler/src/profile_gemm_add_fastgelu.cpp
+++ b/profiler/src/profile_gemm_add_fastgelu.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/profiler/src/profile_gemm_add_multiply.cpp b/profiler/src/profile_gemm_add_multiply.cpp
index 7d6fead402f22a92d3b2c12b8d612c22582504ee..560467c264f2bcc8e406ed02de742c06942e27e0 100644
--- a/profiler/src/profile_gemm_add_multiply.cpp
+++ b/profiler/src/profile_gemm_add_multiply.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/profiler/src/profile_gemm_add_relu_add_layernorm.cpp b/profiler/src/profile_gemm_add_relu_add_layernorm.cpp
index 5cbc3d21f8a335aaaf0353308a2fc996e210b520..558d255ce110565e90f6eff5fae86ce6bf6447bb 100644
--- a/profiler/src/profile_gemm_add_relu_add_layernorm.cpp
+++ b/profiler/src/profile_gemm_add_relu_add_layernorm.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/profiler/src/profile_gemm_bias_add_reduce.cpp b/profiler/src/profile_gemm_bias_add_reduce.cpp
index 6d86db08223a771ddc4373c014a6a2a42f92285b..76daffbc67cb95ac19d52a6791d90816a5e6656b 100644
--- a/profiler/src/profile_gemm_bias_add_reduce.cpp
+++ b/profiler/src/profile_gemm_bias_add_reduce.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/profiler/src/profile_gemm_bilinear.cpp b/profiler/src/profile_gemm_bilinear.cpp
index 3480014ba6e6a2088c1ee1a221eb99eee6191241..a1a48616b42cac2f0babf59955a68cc49c323ac3 100644
--- a/profiler/src/profile_gemm_bilinear.cpp
+++ b/profiler/src/profile_gemm_bilinear.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/profiler/src/profile_gemm_fastgelu.cpp b/profiler/src/profile_gemm_fastgelu.cpp
index 2a137224cb096f4bf2968f06be13d71d44614794..93573002ef1c367a00d42dc2ea0fe99fca98086a 100644
--- a/profiler/src/profile_gemm_fastgelu.cpp
+++ b/profiler/src/profile_gemm_fastgelu.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/profiler/src/profile_gemm_reduce.cpp b/profiler/src/profile_gemm_reduce.cpp
index 395bf0627e43617d04d30a903a1f3cbdecf4a557..48f6f5eb49da151136600c2d03b99d959d51565f 100644
--- a/profiler/src/profile_gemm_reduce.cpp
+++ b/profiler/src/profile_gemm_reduce.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/profiler/src/profile_gemm_splitk.cpp b/profiler/src/profile_gemm_splitk.cpp
index f636ce718c669feb23a60fb1853a1219fea3da55..cc2da73cb72704ffb2c1283976f6307b2e89908c 100644
--- a/profiler/src/profile_gemm_splitk.cpp
+++ b/profiler/src/profile_gemm_splitk.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/profiler/src/profile_grouped_conv_bwd_weight.cpp b/profiler/src/profile_grouped_conv_bwd_weight.cpp
index dfd8a099f5408f76fd51a9c69a988591edf5348f..7a062ed5197109474e6bdad07a33f8b88c5b12f8 100644
--- a/profiler/src/profile_grouped_conv_bwd_weight.cpp
+++ b/profiler/src/profile_grouped_conv_bwd_weight.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <initializer_list>
diff --git a/profiler/src/profile_grouped_conv_fwd.cpp b/profiler/src/profile_grouped_conv_fwd.cpp
index 9ff3c15af05a5f67988ef4842e00e2671289bb1b..d0b424cde64249aff7da0646a6bb11e83ccecf39 100644
--- a/profiler/src/profile_grouped_conv_fwd.cpp
+++ b/profiler/src/profile_grouped_conv_fwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/profiler/src/profile_grouped_gemm.cpp b/profiler/src/profile_grouped_gemm.cpp
index 34647adabfea27f96d76147e58799847727357ab..d023db54de47cdb4464148985547a80ad441190b 100644
--- a/profiler/src/profile_grouped_gemm.cpp
+++ b/profiler/src/profile_grouped_gemm.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/profiler/src/profile_grouped_gemm_fastgelu.cpp b/profiler/src/profile_grouped_gemm_fastgelu.cpp
index 9b6142f01559542455b002f5254ca848ec7867a5..50ecf25caebdedb385d9ab523abe5f73f1afb8d5 100644
--- a/profiler/src/profile_grouped_gemm_fastgelu.cpp
+++ b/profiler/src/profile_grouped_gemm_fastgelu.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/profiler/src/profile_groupnorm.cpp b/profiler/src/profile_groupnorm.cpp
index d55529a0f033c36de7951b383a62c5ac656317f5..d55784ff0ad7274fce9bbf66b26a508f5994a578 100644
--- a/profiler/src/profile_groupnorm.cpp
+++ b/profiler/src/profile_groupnorm.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <vector>
diff --git a/profiler/src/profile_layernorm.cpp b/profiler/src/profile_layernorm.cpp
index e93fc2dbd2bf004389eed968e0518a2cdb1ea4f6..7bf210e67837fa8fe0fe8982c40a1793344823a0 100644
--- a/profiler/src/profile_layernorm.cpp
+++ b/profiler/src/profile_layernorm.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <vector>
diff --git a/profiler/src/profile_max_pool3d_fwd.cpp b/profiler/src/profile_max_pool3d_fwd.cpp
index 90c6e4e2b904d6f51ed8692fc359719364c9a88c..cf6db2cfc91e1b7d52af6cc7e9f9d43f4051599e 100644
--- a/profiler/src/profile_max_pool3d_fwd.cpp
+++ b/profiler/src/profile_max_pool3d_fwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <vector>
diff --git a/profiler/src/profile_reduce.cpp b/profiler/src/profile_reduce.cpp
index 6925371858ee913dcb5d2b529c3e350a6eec5f4f..e4af5680a5d8a192edfc380d18fbe0a2c595cbbb 100644
--- a/profiler/src/profile_reduce.cpp
+++ b/profiler/src/profile_reduce.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <fstream>
diff --git a/profiler/src/profile_softmax.cpp b/profiler/src/profile_softmax.cpp
index 78b64dda7d70ec0a6cf8487562a9782c261f58c7..77007ad13b59376f6f26e72291793b1974c68957 100644
--- a/profiler/src/profile_softmax.cpp
+++ b/profiler/src/profile_softmax.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <vector>
diff --git a/profiler/src/profiler.cpp b/profiler/src/profiler.cpp
index 080117e390c4df1bc14195649382267bcccc18f7..0f528c008f1d17afd42139d572d4f1cf1963fadd 100644
--- a/profiler/src/profiler.cpp
+++ b/profiler/src/profiler.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
diff --git a/profiler/src/profiler_operation_registry.hpp b/profiler/src/profiler_operation_registry.hpp
index 91ff291233066527609133dc155e67388119385a..276b7b38dcd28ee935c108819cd0102b2bec4862 100644
--- a/profiler/src/profiler_operation_registry.hpp
+++ b/profiler/src/profiler_operation_registry.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <functional>
 #include <iostream>
diff --git a/test/batched_gemm/batched_gemm_bf16.cpp b/test/batched_gemm/batched_gemm_bf16.cpp
index 78be540627850426a075e2bde1516410ef070756..fa1652e99712f73de33e074958df65cf2d6b4f9e 100644
--- a/test/batched_gemm/batched_gemm_bf16.cpp
+++ b/test/batched_gemm/batched_gemm_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 
diff --git a/test/batched_gemm/batched_gemm_fp16.cpp b/test/batched_gemm/batched_gemm_fp16.cpp
index 6cbbedf6774d2f90f85e5128c5bae8fbe912dc8e..3df4912aa0d2a4426634b421c0b09f428f543ad9 100644
--- a/test/batched_gemm/batched_gemm_fp16.cpp
+++ b/test/batched_gemm/batched_gemm_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 
diff --git a/test/batched_gemm/batched_gemm_fp32.cpp b/test/batched_gemm/batched_gemm_fp32.cpp
index c9e565e264b493d6f7f1b248fe6d307150a4c989..2d808441b9019f61f80aaabe8dc9afb5379489cd 100644
--- a/test/batched_gemm/batched_gemm_fp32.cpp
+++ b/test/batched_gemm/batched_gemm_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 
diff --git a/test/batched_gemm/batched_gemm_int8.cpp b/test/batched_gemm/batched_gemm_int8.cpp
index 4da941a5766bc6decc5adc938b4ce39da5d8a6db..ed233a5ada1309466266dc17eefa1a54517500d6 100644
--- a/test/batched_gemm/batched_gemm_int8.cpp
+++ b/test/batched_gemm/batched_gemm_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 
diff --git a/test/batched_gemm_gemm/test_batched_gemm_gemm_fp16.cpp b/test/batched_gemm_gemm/test_batched_gemm_gemm_fp16.cpp
index aa113de219437e900b2de8739ddaffb2caf3cefa..1a8d5c2e55906fb77cfc3fef7d6c985c37eefefb 100644
--- a/test/batched_gemm_gemm/test_batched_gemm_gemm_fp16.cpp
+++ b/test/batched_gemm_gemm/test_batched_gemm_gemm_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
 #include "test_batched_gemm_gemm_util.hpp"
diff --git a/test/batched_gemm_gemm/test_batched_gemm_gemm_util.hpp b/test/batched_gemm_gemm/test_batched_gemm_gemm_util.hpp
index 53c4d37c44781be976f917362d0c433d1ec1711f..b0fffc466efd7de5f9a8055cb3f7e372bfad2ef7 100644
--- a/test/batched_gemm_gemm/test_batched_gemm_gemm_util.hpp
+++ b/test/batched_gemm_gemm/test_batched_gemm_gemm_util.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 
diff --git a/test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp b/test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp
index b150ce50d166d5a673c6b34c468b7988f3b4d91f..dd2638ce899bee16accfcfb85c402909b2c2a05c 100644
--- a/test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp
+++ b/test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 
diff --git a/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_fp16.cpp b/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_fp16.cpp
index 5df7769d5f675e906e350738751923d26b6af39d..cb46a995c61c4dc15c9fcbd9c2ca07b850f90069 100644
--- a/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_fp16.cpp
+++ b/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
 #include "test_batched_gemm_softmax_gemm_util.hpp"
diff --git a/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp b/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp
index 98debe19c3c0b419f46dd08321f43f846536772a..d8ee744c6098843e9e78e21ac59a5f3733f40aa8 100644
--- a/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp
+++ b/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 
diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_bf16.cpp b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_bf16.cpp
index fe65a6fb9681a2a336132cf082cd9424e988e1ed..ef88ce6d81845d9ddb61fc00159899f0d11daa5f 100644
--- a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_bf16.cpp
+++ b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
 #include "test_batched_gemm_bias_softmax_gemm_permute_util.hpp"
diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_fp16.cpp b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_fp16.cpp
index 7235cd1b0b669cb1cfad6753289a94ad6a569df3..b38b10d1953091bc7d0739a38fa184d3d51a94f1 100644
--- a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_fp16.cpp
+++ b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
 #include "test_batched_gemm_softmax_gemm_permute_util.hpp"
diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_util.hpp b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_util.hpp
index af5f0efec3853eae68dfbb91f97930a1e8e5727b..d7c39367c8a19290718b315251581f4f3f626add 100644
--- a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_util.hpp
+++ b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_util.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 
diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_bf16.cpp b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_bf16.cpp
index defe36124056d602f565a20333379fc86a61b6aa..8e0baede113943c0bcb7d6f929217b2be94f0073 100644
--- a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_bf16.cpp
+++ b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
 #include "test_batched_gemm_softmax_gemm_permute_util.hpp"
diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_fp16.cpp b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_fp16.cpp
index 293acd60155ace98eaea89a11eb03fa6710fd877..81d404109fddc4321d870d1c3b8c0a2fe309c9c4 100644
--- a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_fp16.cpp
+++ b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
 #include "test_batched_gemm_softmax_gemm_permute_util.hpp"
diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_util.hpp b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_util.hpp
index 912bbc91edd51d06c827fc31495d2955bea85a2b..9df03ffd2a11211600872957e9686e105bf8b2f2 100644
--- a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_util.hpp
+++ b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_util.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 
diff --git a/test/batchnorm/batchnorm_bwd_rank_4.cpp b/test/batchnorm/batchnorm_bwd_rank_4.cpp
index caa7331ea2c3e6dfe57a41272c5f4fba56662e39..a4696cf2a39a774c86d4a68f1a0edf0ce31a3f58 100644
--- a/test/batchnorm/batchnorm_bwd_rank_4.cpp
+++ b/test/batchnorm/batchnorm_bwd_rank_4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
diff --git a/test/batchnorm/batchnorm_fwd_rank_4.cpp b/test/batchnorm/batchnorm_fwd_rank_4.cpp
index 13aef7d6bfc9f222e663b951a2f9d44e81fc93bc..9b6fbd0f662648426aad3f9c316604510ff6e1b1 100644
--- a/test/batchnorm/batchnorm_fwd_rank_4.cpp
+++ b/test/batchnorm/batchnorm_fwd_rank_4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
diff --git a/test/batchnorm/batchnorm_infer_rank_4.cpp b/test/batchnorm/batchnorm_infer_rank_4.cpp
index 77fc1daae6130e415e88fc6f69966a82280acd3a..ecb4043b361b058ea82b91d98232eadb694ead33 100644
--- a/test/batchnorm/batchnorm_infer_rank_4.cpp
+++ b/test/batchnorm/batchnorm_infer_rank_4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
diff --git a/test/block_to_ctile_map/test_block_to_ctile_map.cpp b/test/block_to_ctile_map/test_block_to_ctile_map.cpp
index 55d9b59f489203cf68b5aebc1c5d9642801435c0..b8e349eda16f53d97b9655840f9803ef8e6a7ace 100644
--- a/test/block_to_ctile_map/test_block_to_ctile_map.cpp
+++ b/test/block_to_ctile_map/test_block_to_ctile_map.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <vector>
diff --git a/test/conv_util/conv_util.cpp b/test/conv_util/conv_util.cpp
index 73797a7169e78cfeabac101ac0c7da366187edbf..6922bbbcc73dca4d0bc9e98ae49e8b504adb860a 100644
--- a/test/conv_util/conv_util.cpp
+++ b/test/conv_util/conv_util.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <string>
diff --git a/test/convnd_bwd_data/convnd_bwd_data.cpp b/test/convnd_bwd_data/convnd_bwd_data.cpp
index 70231d42ae5f9eb9a368c643b2cf4041ae5fac4e..9d2b6cf5770144e25b21874e6f4da93c523e76bb 100644
--- a/test/convnd_bwd_data/convnd_bwd_data.cpp
+++ b/test/convnd_bwd_data/convnd_bwd_data.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
diff --git a/test/convnd_fwd/convnd_fwd.cpp b/test/convnd_fwd/convnd_fwd.cpp
index a1921a9bfbe0f1221ddada95c75835ab9c942292..fe8798ceb8b1086d7bea43008194845a47f6db43 100644
--- a/test/convnd_fwd/convnd_fwd.cpp
+++ b/test/convnd_fwd/convnd_fwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
diff --git a/test/data_type/int4.cpp b/test/data_type/int4.cpp
index 252a450bf96896e677cbc5f402ccfe722192d9bc..07549c1c48bb807c9e50520e173d528bf27b3bc9 100644
--- a/test/data_type/int4.cpp
+++ b/test/data_type/int4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <bitset>
 #include <cinttypes>
diff --git a/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp b/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp
index e80995c4f08622432f4e6c3d3c5cf3200fa8c460..d5ce77dc2b956b02439a42eaa341285217780ac8 100644
--- a/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp
+++ b/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
 #include "profiler/profile_elementwise_layernorm_impl.hpp"
diff --git a/test/gemm/gemm_bf16.cpp b/test/gemm/gemm_bf16.cpp
index 5290d466323277daa4e61af1a8273b816a790120..cde5c45aeab59453488ea89e3942550531b65b87 100644
--- a/test/gemm/gemm_bf16.cpp
+++ b/test/gemm/gemm_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <algorithm>
 #include <cstdlib>
diff --git a/test/gemm/gemm_fp16.cpp b/test/gemm/gemm_fp16.cpp
index 92e225def29623c7dcfc8590a5f805052dedb54d..cad250c6fb4d9eb935653d79999db3ff77a6932e 100644
--- a/test/gemm/gemm_fp16.cpp
+++ b/test/gemm/gemm_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <algorithm>
 #include <cstdlib>
diff --git a/test/gemm/gemm_fp32.cpp b/test/gemm/gemm_fp32.cpp
index 5d8c4881b621504fe621d056cbcf68bcb04622fc..c35aa77ea7dad4a7bf56e616d3456be932f44d8e 100644
--- a/test/gemm/gemm_fp32.cpp
+++ b/test/gemm/gemm_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <algorithm>
 #include <cstdlib>
diff --git a/test/gemm/gemm_fp64.cpp b/test/gemm/gemm_fp64.cpp
index 85d7f95bf4ad9ea8df19fdf2600c191f7eabcbf2..e67c8ba4f32331619fa5979c0e66226253993af1 100644
--- a/test/gemm/gemm_fp64.cpp
+++ b/test/gemm/gemm_fp64.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <algorithm>
 #include <cstdlib>
diff --git a/test/gemm/gemm_int8.cpp b/test/gemm/gemm_int8.cpp
index e73b22ce9c847f704fe55ad31cc83661b1ebab87..6ece05e306e66cb1013a846f3bdfda6ba4a195f4 100644
--- a/test/gemm/gemm_int8.cpp
+++ b/test/gemm/gemm_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <algorithm>
 #include <cstdlib>
diff --git a/test/gemm/gemm_standalone_xdl_fp16.cpp b/test/gemm/gemm_standalone_xdl_fp16.cpp
index 32a243e0f694c0fa1e57178642d08885dd69d6a3..201a49dcd390090c056ec911f70fee9bd61b61af 100644
--- a/test/gemm/gemm_standalone_xdl_fp16.cpp
+++ b/test/gemm/gemm_standalone_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gemm_util.hpp"
 
diff --git a/test/gemm/gemm_util.hpp b/test/gemm/gemm_util.hpp
index 9057c0af896891107683605575ee17aeb460c6cf..6c46f4ee89539b1e687b79f40803cfafa62528fc 100644
--- a/test/gemm/gemm_util.hpp
+++ b/test/gemm/gemm_util.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/test/gemm/instance/gemm_f16_nn_instance.cpp b/test/gemm/instance/gemm_f16_nn_instance.cpp
index 4d65c5876cd92a04cf04a99d3f3655e50c586a8d..9016257f131ffa4a130c009256e66b1d5abe709a 100644
--- a/test/gemm/instance/gemm_f16_nn_instance.cpp
+++ b/test/gemm/instance/gemm_f16_nn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/test/gemm/instance/gemm_f16_nn_instance.hpp b/test/gemm/instance/gemm_f16_nn_instance.hpp
index 5ae3928dc972f53eb1bb963cd2130e110ff19ae0..e174b99a1d8156540bdc3a0fcd16bb9870e96204 100644
--- a/test/gemm/instance/gemm_f16_nn_instance.hpp
+++ b/test/gemm/instance/gemm_f16_nn_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/test/gemm/instance/gemm_f16_nt_instance.cpp b/test/gemm/instance/gemm_f16_nt_instance.cpp
index 431ff1e62e79fc126deac89194996f2db3e2cb01..27103b88d4c25cd8658e8e58f2b0616377ccb99c 100644
--- a/test/gemm/instance/gemm_f16_nt_instance.cpp
+++ b/test/gemm/instance/gemm_f16_nt_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/test/gemm/instance/gemm_f16_nt_instance.hpp b/test/gemm/instance/gemm_f16_nt_instance.hpp
index 99f9ffba4562cd03baff678302e0900a1b842e32..c624425e69573e8293bb9b667f35cfee070c2806 100644
--- a/test/gemm/instance/gemm_f16_nt_instance.hpp
+++ b/test/gemm/instance/gemm_f16_nt_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/test/gemm/instance/gemm_f16_tn_instance.cpp b/test/gemm/instance/gemm_f16_tn_instance.cpp
index 6f5dbc311ebb81501eedc218b43003c777424be2..5b11f4dad9000672b1c2cac01cdb5f1eb4a3ad34 100644
--- a/test/gemm/instance/gemm_f16_tn_instance.cpp
+++ b/test/gemm/instance/gemm_f16_tn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/test/gemm/instance/gemm_f16_tn_instance.hpp b/test/gemm/instance/gemm_f16_tn_instance.hpp
index 62388aeb398dff03c1f997c5ece4d53560492abd..563e10600adde840ac967f5e980362d53e6460e9 100644
--- a/test/gemm/instance/gemm_f16_tn_instance.hpp
+++ b/test/gemm/instance/gemm_f16_tn_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/test/gemm/instance/gemm_f16_tt_instance.cpp b/test/gemm/instance/gemm_f16_tt_instance.cpp
index b6ef5b1cd21a5314c83bedfcdef2d9c3d00906b9..9032150f0c5b7449244c23beea13df71d1e91ea5 100644
--- a/test/gemm/instance/gemm_f16_tt_instance.cpp
+++ b/test/gemm/instance/gemm_f16_tt_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/test/gemm/instance/gemm_f16_tt_instance.hpp b/test/gemm/instance/gemm_f16_tt_instance.hpp
index 9d75b4e48cbba7905c3eebe2b30e8c663e31cbeb..62914d7ac22cb9a7ece1c321ef1a72368788ee2f 100644
--- a/test/gemm/instance/gemm_f16_tt_instance.hpp
+++ b/test/gemm/instance/gemm_f16_tt_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/test/gemm/instance/gemm_wavelet_f16_tn_instance.cpp b/test/gemm/instance/gemm_wavelet_f16_tn_instance.cpp
index 51c014a91a4e35ce16bd59feb16683899e6b4909..983af7ecddfef2c28c2d58d2f4e62bf36dd74309 100644
--- a/test/gemm/instance/gemm_wavelet_f16_tn_instance.cpp
+++ b/test/gemm/instance/gemm_wavelet_f16_tn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
diff --git a/test/gemm/instance/gemm_wavelet_f16_tn_instance.hpp b/test/gemm/instance/gemm_wavelet_f16_tn_instance.hpp
index 110fc5f7d9a8e78507085c7bf870d8d733417076..ef269d78ee5e11c8813b2f4787583c34530d7b2d 100644
--- a/test/gemm/instance/gemm_wavelet_f16_tn_instance.hpp
+++ b/test/gemm/instance/gemm_wavelet_f16_tn_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <memory>
 #include <vector>
diff --git a/test/gemm/run_gemm_test.inc b/test/gemm/run_gemm_test.inc
index ec27729b3c9f02148cdd5537fe7f12e7421a7a4d..d208bb5a7b95a1775815dfa95f55befa5c401ea9 100644
--- a/test/gemm/run_gemm_test.inc
+++ b/test/gemm/run_gemm_test.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 int run_gemm_test()
 {
diff --git a/test/gemm_layernorm/test_gemm_add_relu_add_layernorm_fp16.cpp b/test/gemm_layernorm/test_gemm_add_relu_add_layernorm_fp16.cpp
index 740c63aa7ee5f7b6188872b11c14ab1e3bea9280..3f059968784cd908a2969221d52ff89a2db1400f 100644
--- a/test/gemm_layernorm/test_gemm_add_relu_add_layernorm_fp16.cpp
+++ b/test/gemm_layernorm/test_gemm_add_relu_add_layernorm_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
 #include "profiler/profile_gemm_add_relu_add_layernorm_impl.hpp"
diff --git a/test/gemm_reduce/gemm_reduce_fp16.cpp b/test/gemm_reduce/gemm_reduce_fp16.cpp
index 029165ece125abaa99dc1605bfbc2467b02ecc26..35a149f52c9f82634b4f63f837cbb6bbc808870e 100644
--- a/test/gemm_reduce/gemm_reduce_fp16.cpp
+++ b/test/gemm_reduce/gemm_reduce_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 
diff --git a/test/gemm_split_k/CMakeLists.txt b/test/gemm_split_k/CMakeLists.txt
index 09bbf79389c334d5d9adc96b862014bd8c682d58..2274854f88f5143bf42ac00b6801588726c7e8d5 100644
--- a/test/gemm_split_k/CMakeLists.txt
+++ b/test/gemm_split_k/CMakeLists.txt
@@ -1,5 +1,4 @@
 if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
-   add_test_executable(test_gemm_split_k gemm_split_k.cpp)
-   target_link_libraries(test_gemm_split_k PRIVATE utility)
-   target_link_libraries(test_gemm_split_k PRIVATE device_gemm_splitk_instance)
+   add_gtest_executable(test_gemm_splitk test_gemm_splitk.cpp)
+   target_link_libraries(test_gemm_splitk PRIVATE utility device_gemm_splitk_instance)
 endif()
diff --git a/test/gemm_split_k/gemm_split_k.cpp b/test/gemm_split_k/gemm_split_k.cpp
deleted file mode 100644
index 1edb5769c6919817740e376d7c018781bb3930e2..0000000000000000000000000000000000000000
--- a/test/gemm_split_k/gemm_split_k.cpp
+++ /dev/null
@@ -1,261 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <iostream>
-#include <initializer_list>
-#include <cstdlib>
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-
-#include "ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp"
-
-#include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/device_memory.hpp"
-#include "ck/library/utility/host_tensor.hpp"
-#include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/utility/literals.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
-
-#include "ck/library/utility/host_gemm.hpp"
-
-enum struct GemmMatrixLayout
-{
-    MK_KN_MN, // 0
-    MK_NK_MN, // 1
-    KM_KN_MN, // 2
-    KM_NK_MN, // 3
-};
-
-template <typename T>
-static bool check_out(const Tensor<T>& ref, const Tensor<T>& result)
-{
-    float max_diff = 1e-6;
-
-    for(std::size_t i = 0; i < ref.mData.size(); ++i)
-    {
-        float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
-        if(max_diff < diff)
-        {
-            return false;
-        }
-    }
-
-    return true;
-}
-
-struct gemmArgs
-{
-    GemmMatrixLayout layout;
-    int M;
-    int N;
-    int K;
-    int StrideA;
-    int StrideB;
-    int StrideC;
-    int KBatch;
-};
-
-int test_gemm(const gemmArgs& args)
-{
-    using Row = ck::tensor_layout::gemm::RowMajor;
-    using Col = ck::tensor_layout::gemm::ColumnMajor;
-
-    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-    bool a_row_major, b_row_major, c_row_major;
-
-    switch(args.layout)
-    {
-    case GemmMatrixLayout::MK_KN_MN:
-        a_row_major = true;
-        b_row_major = true;
-        c_row_major = true;
-        break;
-    case GemmMatrixLayout::MK_NK_MN:
-        a_row_major = true;
-        b_row_major = false;
-        c_row_major = true;
-        break;
-    case GemmMatrixLayout::KM_KN_MN:
-        a_row_major = false;
-        b_row_major = true;
-        c_row_major = true;
-        break;
-    case GemmMatrixLayout::KM_NK_MN:
-        a_row_major = false;
-        b_row_major = false;
-        c_row_major = true;
-        break;
-    default: printf("not supported layout"); return 1;
-    }
-
-    auto f_host_tensor_descriptor =
-        [](std::size_t row, std::size_t col, std::size_t stride, bool row_major) {
-            using namespace ck::literals;
-
-            if(row_major)
-            {
-                return HostTensorDescriptor({row, col}, {stride, 1_uz});
-            }
-            else
-            {
-                return HostTensorDescriptor({row, col}, {1_uz, stride});
-            }
-        };
-
-    Tensor<float> a_m_k(f_host_tensor_descriptor(args.M, args.K, args.StrideA, a_row_major));
-    Tensor<float> b_k_n(f_host_tensor_descriptor(args.K, args.N, args.StrideB, b_row_major));
-    Tensor<float> c_m_n_host_result(
-        f_host_tensor_descriptor(args.M, args.N, args.StrideC, c_row_major));
-    Tensor<float> c_m_n_device_result(
-        f_host_tensor_descriptor(args.M, args.N, args.StrideC, c_row_major));
-
-    // init data
-    std::size_t num_thread = 1;
-    a_m_k.GenerateTensorValue(GeneratorTensor_2<float>{-5, 5}, num_thread);
-    b_k_n.GenerateTensorValue(GeneratorTensor_2<float>{-5, 5}, num_thread);
-    // set zero to c_device_buf
-    c_m_n_device_result.GenerateTensorValue(GeneratorTensor_0<float>{}, num_thread);
-
-    host_gemm_mk_kn_mn(a_m_k,
-                       b_k_n,
-                       c_m_n_host_result,
-                       ck::tensor_operation::element_wise::PassThrough{},
-                       ck::tensor_operation::element_wise::PassThrough{},
-                       ck::tensor_operation::element_wise::PassThrough{});
-
-    DeviceMem a_device_buf(sizeof(float) * a_m_k.mDesc.GetElementSpaceSize());
-    DeviceMem b_device_buf(sizeof(float) * b_k_n.mDesc.GetElementSpaceSize());
-    DeviceMem c_device_buf(sizeof(float) * c_m_n_device_result.mDesc.GetElementSpaceSize());
-
-    a_device_buf.ToDevice(a_m_k.mData.data());
-    b_device_buf.ToDevice(b_k_n.mData.data());
-    c_device_buf.ToDevice(c_m_n_device_result.mData.data());
-
-    auto test = [&](auto a_layout, auto b_layout, auto c_layout) {
-        bool success = false;
-
-        using DeviceOp = ck::tensor_operation::device::DeviceGemmSplitK<decltype(a_layout),
-                                                                        decltype(b_layout),
-                                                                        decltype(c_layout),
-                                                                        float,
-                                                                        float,
-                                                                        float,
-                                                                        PassThrough,
-                                                                        PassThrough,
-                                                                        PassThrough>;
-
-        const auto gemm_ptrs =
-            ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
-                DeviceOp>::GetInstances();
-
-        for(auto& gemm_ptr : gemm_ptrs)
-        {
-            auto argument_ptr =
-                gemm_ptr->MakeArgumentPointer(static_cast<float*>(a_device_buf.GetDeviceBuffer()),
-                                              static_cast<float*>(b_device_buf.GetDeviceBuffer()),
-                                              static_cast<float*>(c_device_buf.GetDeviceBuffer()),
-                                              args.M,
-                                              args.N,
-                                              args.K,
-                                              args.StrideA,
-                                              args.StrideB,
-                                              args.StrideC,
-                                              ck::tensor_operation::element_wise::PassThrough{},
-                                              ck::tensor_operation::element_wise::PassThrough{},
-                                              ck::tensor_operation::element_wise::PassThrough{},
-                                              args.KBatch);
-
-            auto invoker_ptr = gemm_ptr->MakeInvokerPointer();
-
-            if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
-            {
-                invoker_ptr->Run(argument_ptr.get());
-
-                c_device_buf.FromDevice(c_m_n_device_result.mData.data());
-
-                if(!check_out(c_m_n_host_result, c_m_n_device_result))
-                {
-                    success = false;
-                    break;
-                }
-                success = true;
-            }
-        }
-
-        return success;
-    };
-
-    bool success = false;
-
-    if(args.layout == GemmMatrixLayout::MK_KN_MN)
-    {
-        success = test(Row{}, Row{}, Row{});
-    }
-    else if(args.layout == GemmMatrixLayout::MK_NK_MN)
-    {
-        success = test(Row{}, Col{}, Row{});
-    }
-    else if(args.layout == GemmMatrixLayout::KM_KN_MN)
-    {
-        success = test(Col{}, Row{}, Row{});
-    }
-    else
-    {
-        success = test(Col{}, Col{}, Row{});
-    }
-
-    auto error_code = 0;
-    if(success)
-    {
-        std::cout << "test split k : Pass" << std::endl;
-    }
-    else
-    {
-        std::cout << "test split k: Fail " << std::endl;
-        error_code = -1; // test needs to report failure
-    }
-    return error_code;
-}
-
-int main(int argc, char* argv[])
-{
-    std::vector<gemmArgs> test_cases;
-    if(argc == 1)
-    {
-        test_cases = {{GemmMatrixLayout::MK_KN_MN, 1024, 1024, 1024, 1024, 1024, 1024, 2},
-                      {GemmMatrixLayout::MK_KN_MN, 1024, 1024, 1024, 1024, 1024, 1024, 8}};
-    }
-    else if(argc == 9)
-    {
-        const auto layout = static_cast<GemmMatrixLayout>(std::stoi(argv[1]));
-
-        const int M = std::stoi(argv[2]);
-        const int N = std::stoi(argv[3]);
-        const int K = std::stoi(argv[4]);
-
-        const int StrideA = std::stoi(argv[5]);
-        const int StrideB = std::stoi(argv[6]);
-        const int StrideC = std::stoi(argv[7]);
-        const int KBatch  = std::stoi(argv[8]);
-        test_cases        = {{layout, M, N, K, StrideA, StrideB, StrideC, KBatch}};
-    }
-    else
-    {
-        printf("arg1: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
-        printf("                     1: A[m, k] * B[n, k] = C[m, n];\n");
-        printf("                     2: A[k, m] * B[k, n] = C[m, n];\n");
-        printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
-        printf("arg2 to 7: M, N, K, StrideA, StrideB, StrideC KBatch\n");
-        return -1;
-    }
-    bool error = false;
-    for(const auto& kinder : test_cases)
-    {
-        error |= test_gemm(kinder);
-    }
-    return error ? 1 : 0;
-}
diff --git a/test/gemm_split_k/test_gemm_splitk.cpp b/test/gemm_split_k/test_gemm_splitk.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9eba5bba37551f39ffe921406ba871cff8dab4aa
--- /dev/null
+++ b/test/gemm_split_k/test_gemm_splitk.cpp
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <tuple>
+
+#include "gtest/gtest.h"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "test_gemm_splitk_util.hpp"
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+namespace {
+
+template <typename X, typename Y>
+struct tuple_concat;
+
+template <typename... Xs, typename... Ys>
+struct tuple_concat<std::tuple<Xs...>, std::tuple<Ys...>>
+{
+    using type = std::tuple<Xs..., Ys...>;
+};
+
+} // namespace
+
+template <typename Tuple>
+class TestGemmSplitK_MK_KN
+    : public ck::test::TestGemmSplitK<typename tuple_concat<std::tuple<Row, Row>, Tuple>::type>
+{
+};
+
+template <typename Tuple>
+class TestGemmSplitK_MK_NK
+    : public ck::test::TestGemmSplitK<typename tuple_concat<std::tuple<Row, Col>, Tuple>::type>
+{
+};
+
+template <typename Tuple>
+class TestGemmSplitK_KM_KN
+    : public ck::test::TestGemmSplitK<typename tuple_concat<std::tuple<Col, Row>, Tuple>::type>
+{
+};
+
+template <typename Tuple>
+class TestGemmSplitK_KM_NK
+    : public ck::test::TestGemmSplitK<typename tuple_concat<std::tuple<Col, Col>, Tuple>::type>
+{
+};
+
+// clang-format off
+using KernelTypes = ::testing::Types<
+    //         ADataType, BDataType, CDataType
+    std::tuple<      F16,       F16,       F16>,
+    std::tuple<      F32,       F32,       F32>
+    >;
+// clang-format on
+
+TYPED_TEST_SUITE(TestGemmSplitK_MK_KN, KernelTypes);
+TYPED_TEST_SUITE(TestGemmSplitK_MK_NK, KernelTypes);
+TYPED_TEST_SUITE(TestGemmSplitK_KM_KN, KernelTypes);
+TYPED_TEST_SUITE(TestGemmSplitK_KM_NK, KernelTypes);
+
+#include "test_gemm_splitk_ut_cases.inc"
diff --git a/test/gemm_split_k/test_gemm_splitk_ut_cases.inc b/test/gemm_split_k/test_gemm_splitk_ut_cases.inc
new file mode 100644
index 0000000000000000000000000000000000000000..54b9c6c9e314870446b75b9946cd2980ab008457
--- /dev/null
+++ b/test/gemm_split_k/test_gemm_splitk_ut_cases.inc
@@ -0,0 +1,217 @@
+#pragma once
+
+TYPED_TEST(TestGemmSplitK_MK_KN, SmallM)
+{
+    std::vector<int> Ms{0, 1, 2, 3, 4, 5, 6};
+    constexpr int N = 512;
+    constexpr int K = 320;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = N;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmSplitK_MK_NK, SmallM)
+{
+    std::vector<int> Ms{0, 1, 2, 3, 4, 5, 6};
+    constexpr int N = 512;
+    constexpr int K = 320;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = K;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmSplitK_KM_KN, SmallM)
+{
+    std::vector<int> Ms{0, 1, 2, 3, 4, 5, 6};
+    constexpr int N = 512;
+    constexpr int K = 320;
+
+    constexpr int StrideB = N;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, M, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmSplitK_KM_NK, SmallM)
+{
+    std::vector<int> Ms{0, 1, 2, 3, 4, 5, 6};
+    constexpr int N = 512;
+    constexpr int K = 320;
+
+    constexpr int StrideB = K;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, M, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmSplitK_MK_KN, MidLargeM)
+{
+    std::vector<int> Ms{127, 255, 312, 799, 1573};
+    constexpr int N = 512;
+    constexpr int K = 320;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = N;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmSplitK_MK_NK, MidLargeM)
+{
+    std::vector<int> Ms{127, 255, 312, 799, 1573};
+    constexpr int N = 512;
+    constexpr int K = 320;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = K;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmSplitK_KM_KN, MidLargeM)
+{
+    std::vector<int> Ms{127, 255, 312, 799, 1573};
+    constexpr int N = 512;
+    constexpr int K = 320;
+
+    constexpr int StrideB = N;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, M, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmSplitK_KM_NK, MidLargeM)
+{
+    std::vector<int> Ms{127, 255, 312, 799, 1573};
+    constexpr int N = 512;
+    constexpr int K = 320;
+
+    constexpr int StrideB = K;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, M, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmSplitK_MK_KN, PaddK)
+{
+    std::vector<int> Ms{127};
+    constexpr int N = 512;
+    constexpr int K = 437;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = N;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmSplitK_MK_NK, PaddK)
+{
+    std::vector<int> Ms{127};
+    constexpr int N = 512;
+    constexpr int K = 437;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = K;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmSplitK_KM_KN, PaddK)
+{
+    std::vector<int> Ms{127};
+    constexpr int N = 512;
+    constexpr int K = 437;
+
+    constexpr int StrideB = N;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, M, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmSplitK_KM_NK, PaddK)
+{
+    std::vector<int> Ms{127};
+    constexpr int N = 512;
+    constexpr int K = 437;
+
+    constexpr int StrideB = K;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, M, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmSplitK_MK_KN, Regular)
+{
+    std::vector<int> Ms{512};
+    constexpr int N = 512;
+    constexpr int K = 512;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = N;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmSplitK_MK_NK, Regular)
+{
+    std::vector<int> Ms{512};
+    constexpr int N = 512;
+    constexpr int K = 512;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = K;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmSplitK_KM_KN, Regular)
+{
+    std::vector<int> Ms{512};
+    constexpr int N = 512;
+    constexpr int K = 512;
+
+    constexpr int StrideB = N;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, M, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmSplitK_KM_NK, Regular)
+{
+    std::vector<int> Ms{512};
+    constexpr int N = 512;
+    constexpr int K = 512;
+
+    constexpr int StrideB = K;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, M, StrideB, StrideC);
+}
diff --git a/test/gemm_split_k/test_gemm_splitk_util.hpp b/test/gemm_split_k/test_gemm_splitk_util.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..8243747a694141b373cd6d6896dd17213fd3192c
--- /dev/null
+++ b/test/gemm_split_k/test_gemm_splitk_util.hpp
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <string>
+#include <sstream>
+#include <tuple>
+#include <vector>
+#include <gtest/gtest.h>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "include/ck/utility/data_type.hpp"
+#include "profiler/profile_gemm_splitk_impl.hpp"
+
+namespace ck {
+namespace test {
+
+template <typename Tuple>
+class TestGemmSplitK : public testing::Test
+{
+    using Row = ck::tensor_layout::gemm::RowMajor;
+    using F32 = float;
+
+    protected:
+    using ALayout   = std::tuple_element_t<0, Tuple>;
+    using BLayout   = std::tuple_element_t<1, Tuple>;
+    using CLayout   = Row;
+    using ADataType = std::tuple_element_t<2, Tuple>;
+    using BDataType = std::tuple_element_t<3, Tuple>;
+    using CDataType = std::tuple_element_t<4, Tuple>;
+
+    public:
+    static constexpr bool verify_     = true;
+    static constexpr int init_method_ = 1; // decimal value initialization
+    static constexpr bool log_        = false;
+    static constexpr bool bench_      = false; // measure kernel performance
+    std::vector<int> k_batches_;
+
+    void SetUp() override { k_batches_ = {1, 2, 3, 5, 8}; }
+
+    void Run(const int M,
+             const int N,
+             const int K,
+             const int StrideA,
+             const int StrideB,
+             const int StrideC)
+    {
+        for(auto kb : k_batches_)
+        {
+            RunSingle(M, N, K, StrideA, StrideB, StrideC, kb);
+        }
+    }
+
+    void RunSingle(const int M,
+                   const int N,
+                   const int K,
+                   const int StrideA,
+                   const int StrideB,
+                   const int StrideC,
+                   int kbatch = 1)
+    {
+        bool pass = ck::profiler::profile_gemm_splitk_impl<ADataType,
+                                                           BDataType,
+                                                           F32,
+                                                           CDataType,
+                                                           ALayout,
+                                                           BLayout,
+                                                           CLayout>(
+            verify_, init_method_, log_, bench_, M, N, K, StrideA, StrideB, StrideC, kbatch);
+        EXPECT_TRUE(pass);
+    }
+};
+
+} // namespace test
+} // namespace ck
diff --git a/test/grouped_convnd_bwd_weight/grouped_convnd_bwd_weight.cpp b/test/grouped_convnd_bwd_weight/grouped_convnd_bwd_weight.cpp
index 75f934cc06e23a3d9824e0df00c936333608a280..207cdab7c7eec32086898bd8f84e8c3d29fb4d79 100644
--- a/test/grouped_convnd_bwd_weight/grouped_convnd_bwd_weight.cpp
+++ b/test/grouped_convnd_bwd_weight/grouped_convnd_bwd_weight.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
diff --git a/test/grouped_convnd_fwd/grouped_convnd_fwd.cpp b/test/grouped_convnd_fwd/grouped_convnd_fwd.cpp
index 6df7f9969cb5d802ae28d0a7fdf03221c6c91c4f..4a804ef7f1af5abe4aa040867335043a726189d1 100644
--- a/test/grouped_convnd_fwd/grouped_convnd_fwd.cpp
+++ b/test/grouped_convnd_fwd/grouped_convnd_fwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
diff --git a/test/grouped_gemm/CMakeLists.txt b/test/grouped_gemm/CMakeLists.txt
index a7619eac6e731673e153f74de0aaf2e74e907855..40f634d8b3819a40898041242462be79b875b9b1 100644
--- a/test/grouped_gemm/CMakeLists.txt
+++ b/test/grouped_gemm/CMakeLists.txt
@@ -1,5 +1,9 @@
 if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
-   add_test_executable(test_grouped_gemm_fp16 grouped_gemm_fp16.cpp)
-   target_link_libraries(test_grouped_gemm_fp16 PRIVATE utility)
-   target_link_libraries(test_grouped_gemm_fp16 PRIVATE device_grouped_gemm_instance)
+   add_custom_target(test_grouped_gemm)
+   add_gtest_executable(test_grouped_gemm_splitk test_grouped_gemm_splitk.cpp)
+   add_gtest_executable(test_grouped_gemm_interface test_grouped_gemm_interface.cpp)
+   target_link_libraries(test_grouped_gemm_splitk PRIVATE utility device_grouped_gemm_instance)
+   target_link_libraries(test_grouped_gemm_interface PRIVATE utility device_grouped_gemm_instance)
+   
+   add_dependencies(test_grouped_gemm test_grouped_gemm_splitk test_grouped_gemm_interface)
 endif()
diff --git a/test/grouped_gemm/grouped_gemm_fp16.cpp b/test/grouped_gemm/grouped_gemm_fp16.cpp
deleted file mode 100644
index f20d750d361040de1a4e5006d8ed25c256945174..0000000000000000000000000000000000000000
--- a/test/grouped_gemm/grouped_gemm_fp16.cpp
+++ /dev/null
@@ -1,69 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <iostream>
-#include <random>
-
-#include "profiler/profile_grouped_gemm_impl.hpp"
-
-namespace {
-
-using ADataType   = ck::half_t;
-using BDataType   = ck::half_t;
-using CDataType   = ck::half_t;
-using AccDataType = float;
-
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
-
-template <typename ALayout, typename BLayout, typename CLayout>
-bool TestGroupedGemm()
-{
-
-    std::mt19937 gen(19391);
-    std::uniform_int_distribution<> distrib(1, 10);
-    int group_count = distrib(gen);
-
-    // GEMM shape
-    std::vector<ck::tensor_operation::device::GemmDesc> gemm_descs;
-    std::vector<const void*> p_a, p_b;
-    std::vector<void*> p_c;
-
-    std::vector<int> Ms, Ns, Ks, StrideAs, StrideBs, StrideCs;
-
-    for(int i = 0; i < group_count; i++)
-    {
-        Ms.push_back(256 + 256 * distrib(gen));
-        Ns.push_back(256 + 256 * distrib(gen));
-        Ks.push_back(128 + 128 * distrib(gen));
-
-        StrideAs.push_back(std::is_same<Row, ALayout>::value ? Ks[i] : Ms[i]);
-        StrideBs.push_back(std::is_same<Row, BLayout>::value ? Ns[i] : Ks[i]);
-        StrideCs.push_back(std::is_same<Row, CLayout>::value ? Ns[i] : Ms[i]);
-    }
-
-    return ck::profiler::profile_grouped_gemm_impl<ADataType,
-                                                   BDataType,
-                                                   CDataType,
-                                                   AccDataType,
-                                                   ALayout,
-                                                   BLayout,
-                                                   CLayout>(
-        true, 1, false, 1, Ms, Ns, Ks, StrideAs, StrideBs, StrideCs);
-}
-
-} // anonymous namespace
-
-int main()
-{
-    bool res = true;
-
-    res = res && TestGroupedGemm<Row, Row, Row>();
-    res = res && TestGroupedGemm<Row, Col, Row>();
-    res = res && TestGroupedGemm<Col, Row, Row>();
-    res = res && TestGroupedGemm<Col, Col, Row>();
-
-    std::cout << "TestGroupedGemm ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
-
-    return res ? 0 : 1;
-}
diff --git a/test/grouped_gemm/test_grouped_gemm_interface.cpp b/test/grouped_gemm/test_grouped_gemm_interface.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ffa8840fc7d8ba6b8a1c1a39c62edc6edae58064
--- /dev/null
+++ b/test/grouped_gemm/test_grouped_gemm_interface.cpp
@@ -0,0 +1,202 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <stdexcept>
+#include <vector>
+#include "gtest/gtest.h"
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "test_grouped_gemm_util.hpp"
+
+class TestGGemmSplitKInterface_MKNKMN : public ::testing::Test
+{
+    protected:
+    using Row = ck::tensor_layout::gemm::RowMajor;
+    using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+    using ALayout = Row;
+    using BLayout = Col;
+    using ELayout = Row;
+
+    static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+    template <ck::tensor_operation::device::GemmSpecialization GemmSpec,
+              ck::index_t KPerBlock,
+              ck::index_t K1,
+              ck::index_t ABlockTransferSrcScalarPerVector,
+              ck::index_t BBlockTransferSrcScalarPerVector,
+              ck::index_t CDEBlockTransferScalarPerVector_NPerBlock>
+    using GGemmInstance =
+        ck::test::DeviceGroupedGemmSplitkInstanceWrapper<ALayout,
+                                                         BLayout,
+                                                         ELayout,
+                                                         GemmSpec,
+                                                         KPerBlock,
+                                                         K1,
+                                                         ABlockTransferSrcScalarPerVector,
+                                                         BBlockTransferSrcScalarPerVector,
+                                                         CDEBlockTransferScalarPerVector_NPerBlock>;
+
+    using DefaultGGemmInstance = GGemmInstance<GemmDefault, 32, 8, 4, 8, 8>;
+};
+
+TEST_F(TestGGemmSplitKInterface_MKNKMN, TileSize)
+{
+    std::vector<int> Ms{128, 256, 188, 512};
+    constexpr int N = 256;
+    constexpr int K = 128;
+
+    std::vector<int> Ns(Ms.size(), N);
+    std::vector<int> Ks(Ms.size(), K);
+    std::vector<int> StrideAs(Ms.size(), K);
+    std::vector<int> StrideBs(Ms.size(), K);
+    std::vector<int> StrideCs(Ms.size(), N);
+
+    // M % MPerBlock
+    EXPECT_FALSE(DefaultGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs));
+
+    Ms = std::vector<int>{256, 128, 128, 512};
+    Ns = std::vector<int>{256, 177, 128, 512};
+    // N % NPerBlock
+    EXPECT_FALSE(DefaultGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs));
+}
+
+TEST_F(TestGGemmSplitKInterface_MKNKMN, VectorLoadWidth)
+{
+    static constexpr auto GemmMNKPadding =
+        ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+    using PaddedGGemmInstance = GGemmInstance<GemmMNKPadding, 32, 8, 4, 8, 8>;
+
+    std::vector<int> Ms{128, 256, 256, 512};
+    constexpr int N = 256;
+    constexpr int K = 512;
+
+    std::vector<int> Ns(Ms.size(), N);
+    std::vector<int> Ks(Ms.size(), K);
+    std::vector<int> StrideAs(Ms.size(), K);
+    std::vector<int> StrideBs(Ms.size(), K);
+    std::vector<int> StrideCs(Ms.size(), N);
+
+    // K % ABlockTransferSrcScalarPerVector
+    Ks = std::vector<int>{256, 177, 128, 512};
+    EXPECT_FALSE(PaddedGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs));
+
+    Ks = std::vector<int>{256, 164, 128, 512};
+    // K % BBlockTransferSrcScalarPerVector
+    EXPECT_FALSE(PaddedGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs));
+
+    Ks = std::vector<int>(4, 128);
+    Ns = std::vector<int>{256, 127, 128, 512};
+    // N % CBlockTransferScalarPerVector_NWaveNPerXDL
+    EXPECT_FALSE(PaddedGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs));
+}
+
+TEST_F(TestGGemmSplitKInterface_MKNKMN, KLoops)
+{
+    std::vector<int> Ms{128, 256, 256, 512};
+    constexpr int N      = 256;
+    constexpr int K      = 128;
+    constexpr int kbatch = 4;
+
+    std::vector<int> Ns(Ms.size(), N);
+    std::vector<int> Ks(Ms.size(), K);
+    std::vector<int> StrideAs(Ms.size(), K);
+    std::vector<int> StrideBs(Ms.size(), K);
+    std::vector<int> StrideCs(Ms.size(), N);
+
+    // kloops % 2
+    Ks = std::vector<int>{256, 512, 320, 768};
+    EXPECT_FALSE(
+        DefaultGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, kbatch));
+
+    // Not all gemms have same value for main_k0_block_loop!
+    Ks = std::vector<int>{256, 512, 512, 512};
+    EXPECT_THROW(DefaultGGemmInstance{}.Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, kbatch),
+                 std::runtime_error);
+}
+
+class TestGGemmSplitKInterface_KMKNNM : public ::testing::Test
+{
+    protected:
+    using Row = ck::tensor_layout::gemm::RowMajor;
+    using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+    using ALayout = Col;
+    using BLayout = Row;
+    using ELayout = Col;
+
+    static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+    template <ck::tensor_operation::device::GemmSpecialization GemmSpec,
+              ck::index_t KPerBlock,
+              ck::index_t K1,
+              ck::index_t ABlockTransferSrcScalarPerVector,
+              ck::index_t BBlockTransferSrcScalarPerVector,
+              ck::index_t CDEBlockTransferScalarPerVector_NPerBlock>
+    using GGemmInstance =
+        ck::test::DeviceGroupedGemmSplitkInstanceWrapper<ALayout,
+                                                         BLayout,
+                                                         ELayout,
+                                                         GemmSpec,
+                                                         KPerBlock,
+                                                         K1,
+                                                         ABlockTransferSrcScalarPerVector,
+                                                         BBlockTransferSrcScalarPerVector,
+                                                         CDEBlockTransferScalarPerVector_NPerBlock>;
+
+    using DefaultGGemmInstance = GGemmInstance<GemmDefault, 32, 8, 4, 8, 4>;
+};
+
+TEST_F(TestGGemmSplitKInterface_KMKNNM, TileSize)
+{
+    std::vector<int> Ms{128, 256, 188, 512};
+    constexpr int N = 256;
+    constexpr int K = 128;
+
+    std::vector<int> Ns(Ms.size(), N);
+    std::vector<int> Ks(Ms.size(), K);
+    std::vector<int> StrideAs(Ms.size(), K);
+    std::vector<int> StrideBs(Ms.size(), K);
+    std::vector<int> StrideCs(Ms.size(), N);
+
+    // M % MPerBlock
+    EXPECT_FALSE(DefaultGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs));
+
+    Ms = std::vector<int>{128, 256, 256, 512};
+    Ns = std::vector<int>{256, 177, 128, 512};
+    // N % NPerBlock
+    EXPECT_FALSE(DefaultGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs));
+}
+
+TEST_F(TestGGemmSplitKInterface_KMKNNM, VectorLoadWidth)
+{
+    static constexpr auto GemmMNKPadding =
+        ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+    using PaddedGGemmInstance = GGemmInstance<GemmMNKPadding, 32, 8, 2, 8, 4>;
+
+    std::vector<int> Ms{128, 256, 256, 512};
+    constexpr int N = 256;
+    constexpr int K = 512;
+
+    std::vector<int> Ns(Ms.size(), N);
+    std::vector<int> Ks(Ms.size(), K);
+    std::vector<int> StrideAs(Ms.size(), K);
+    std::vector<int> StrideBs(Ms.size(), K);
+    std::vector<int> StrideCs(Ms.size(), N);
+
+    // M % ABlockTransferSrcScalarPerVector
+    Ms = std::vector<int>{256, 177, 128, 512};
+    EXPECT_FALSE(PaddedGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs));
+
+    Ms = std::vector<int>{128, 256, 256, 512};
+    Ns = std::vector<int>{256, 164, 128, 512};
+    // N % BBlockTransferSrcScalarPerVector
+    EXPECT_FALSE(PaddedGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs));
+
+    Ns = std::vector<int>{128, 256, 256, 512};
+    Ms = std::vector<int>{256, 130, 128, 512};
+    // M % CBlockTransferScalarPerVector_NWaveNPerXDL
+    EXPECT_FALSE(PaddedGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs));
+}
diff --git a/test/grouped_gemm/test_grouped_gemm_splitk.cpp b/test/grouped_gemm/test_grouped_gemm_splitk.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d9282fa924943c52d317f1fe89b45c90e54c3afb
--- /dev/null
+++ b/test/grouped_gemm/test_grouped_gemm_splitk.cpp
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <tuple>
+#include <vector>
+
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/utility/data_type.hpp"
+
+#include "gtest/gtest.h"
+#include "test_grouped_gemm_util.hpp"
+
+using F16 = ck::half_t;
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using RRR_F16_F16_F16 = ck::test::TestGroupedGemm<std::tuple<Row, Row, Row, F16, F16, F16>>;
+using RCR_F16_F16_F16 = ck::test::TestGroupedGemm<std::tuple<Row, Col, Row, F16, F16, F16>>;
+
+using RRR_F16_F16_F16_LargeK = ck::test::TestGroupedGemm<std::tuple<Row, Row, Row, F16, F16, F16>>;
+using RCR_F16_F16_F16_LargeK = ck::test::TestGroupedGemm<std::tuple<Row, Col, Row, F16, F16, F16>>;
+
+const std::vector<int> KBATCH{1, 2, 3, 5, 8};
+
+INSTANTIATE_TEST_SUITE_P(TestGroupedGemm_splitk_MK_KN, RRR_F16_F16_F16, testing::ValuesIn(KBATCH));
+INSTANTIATE_TEST_SUITE_P(TestGroupedGemm_splitk_MK_NK, RCR_F16_F16_F16, testing::ValuesIn(KBATCH));
+INSTANTIATE_TEST_SUITE_P(TestGroupedGemm_splitk_LargeK_MK_KN,
+                         RRR_F16_F16_F16_LargeK,
+                         testing::Values(32, 64));
+INSTANTIATE_TEST_SUITE_P(TestGroupedGemm_splitk_LargeK_MK_NK,
+                         RCR_F16_F16_F16_LargeK,
+                         testing::Values(32, 64));
+
+#include "test_grouped_gemm_ut_cases.inc"
diff --git a/test/grouped_gemm/test_grouped_gemm_ut_cases.inc b/test/grouped_gemm/test_grouped_gemm_ut_cases.inc
new file mode 100644
index 0000000000000000000000000000000000000000..d94d140d97ecfb1ce0d879af747fc2badaba6ecc
--- /dev/null
+++ b/test/grouped_gemm/test_grouped_gemm_ut_cases.inc
@@ -0,0 +1,180 @@
+#pragma once
+
+TEST_P(RRR_F16_F16_F16, TinyCases)
+{
+    const std::vector<int> Ms{0, 1};
+    constexpr int N = 768;
+    constexpr int K = 544;
+
+    const std::vector<int> Ns(Ms.size(), N);
+    const std::vector<int> Ks(Ms.size(), K);
+    const std::vector<int> StrideAs(Ms.size(), K);
+    const std::vector<int> StrideBs(Ms.size(), N);
+    const std::vector<int> StrideCs(Ms.size(), N);
+
+    this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam());
+}
+
+TEST_P(RRR_F16_F16_F16, SmallCases)
+{
+    const std::vector<int> Ms{2, 1, 3, 4, 5, 0};
+    constexpr int N = 768;
+    constexpr int K = 544;
+
+    const std::vector<int> Ns(Ms.size(), N);
+    const std::vector<int> Ks(Ms.size(), K);
+    const std::vector<int> StrideAs(Ms.size(), K);
+    const std::vector<int> StrideBs(Ms.size(), N);
+    const std::vector<int> StrideCs(Ms.size(), N);
+
+    this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam());
+}
+
+TEST_P(RRR_F16_F16_F16, MidCases)
+{
+    const std::vector<int> Ms{167, 183, 177, 153, 139, 204};
+    constexpr int N = 768;
+    constexpr int K = 544;
+
+    const std::vector<int> Ns(Ms.size(), N);
+    const std::vector<int> Ks(Ms.size(), K);
+    const std::vector<int> StrideAs(Ms.size(), K);
+    const std::vector<int> StrideBs(Ms.size(), N);
+    const std::vector<int> StrideCs(Ms.size(), N);
+
+    this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam());
+}
+
+TEST_P(RRR_F16_F16_F16, Regular)
+{
+    const std::vector<int> Ms{64, 128, 256};
+    constexpr int N = 768;
+    constexpr int K = 320;
+
+    const std::vector<int> Ns(Ms.size(), N);
+    const std::vector<int> Ks(Ms.size(), K);
+    const std::vector<int> StrideAs(Ms.size(), K);
+    const std::vector<int> StrideBs(Ms.size(), N);
+    const std::vector<int> StrideCs(Ms.size(), N);
+
+    this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam());
+}
+
+TEST_P(RRR_F16_F16_F16, MNKPadded)
+{
+    const std::vector<int> Ms{127, 150, 188, 210};
+    constexpr int N = 136;
+    constexpr int K = 280;
+
+    const std::vector<int> Ns(Ms.size(), N);
+    const std::vector<int> Ks(Ms.size(), K);
+    const std::vector<int> StrideAs(Ms.size(), K);
+    const std::vector<int> StrideBs(Ms.size(), N);
+    const std::vector<int> StrideCs(Ms.size(), N);
+
+    this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam());
+}
+
+TEST_P(RCR_F16_F16_F16, TinyCases)
+{
+    const std::vector<int> Ms{0, 1};
+    constexpr int N = 768;
+    constexpr int K = 544;
+
+    const std::vector<int> Ns(Ms.size(), N);
+    const std::vector<int> Ks(Ms.size(), K);
+    const std::vector<int> StrideAs(Ms.size(), K);
+    const std::vector<int> StrideBs(Ms.size(), K);
+    const std::vector<int> StrideCs(Ms.size(), N);
+    this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam());
+}
+
+TEST_P(RCR_F16_F16_F16, SmallCases)
+{
+    const std::vector<int> Ms{2, 1, 3, 4, 5, 0};
+    constexpr int N = 768;
+    constexpr int K = 544;
+
+    const std::vector<int> Ns(Ms.size(), N);
+    const std::vector<int> Ks(Ms.size(), K);
+    const std::vector<int> StrideAs(Ms.size(), K);
+    const std::vector<int> StrideBs(Ms.size(), K);
+    const std::vector<int> StrideCs(Ms.size(), N);
+
+    this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam());
+}
+
+TEST_P(RCR_F16_F16_F16, MidCases)
+{
+    const std::vector<int> Ms{167, 183, 177, 153, 139, 204};
+    constexpr int N = 768;
+    constexpr int K = 544;
+
+    const std::vector<int> Ns(Ms.size(), N);
+    const std::vector<int> Ks(Ms.size(), K);
+    const std::vector<int> StrideAs(Ms.size(), K);
+    const std::vector<int> StrideBs(Ms.size(), K);
+    const std::vector<int> StrideCs(Ms.size(), N);
+
+    this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam());
+}
+
+TEST_P(RCR_F16_F16_F16, Regular)
+{
+    const std::vector<int> Ms{32, 64, 128, 256};
+    constexpr int N = 768;
+    constexpr int K = 320;
+
+    const std::vector<int> Ns(Ms.size(), N);
+    const std::vector<int> Ks(Ms.size(), K);
+    const std::vector<int> StrideAs(Ms.size(), K);
+    const std::vector<int> StrideBs(Ms.size(), K);
+    const std::vector<int> StrideCs(Ms.size(), N);
+
+    this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam());
+}
+
+TEST_P(RCR_F16_F16_F16, MNKPadded)
+{
+    const std::vector<int> Ms{127, 150, 188, 210};
+    constexpr int N = 136;
+    constexpr int K = 280;
+
+    const std::vector<int> Ns(Ms.size(), N);
+    const std::vector<int> Ks(Ms.size(), K);
+    const std::vector<int> StrideAs(Ms.size(), K);
+    const std::vector<int> StrideBs(Ms.size(), K);
+    const std::vector<int> StrideCs(Ms.size(), N);
+
+    this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam());
+}
+
+TEST_P(RRR_F16_F16_F16_LargeK, TestLargeKBatch)
+{
+    const std::vector<int> Ms{188, 210};
+    constexpr int N = 768;
+    constexpr int K = 4096;
+
+    const std::vector<int> Ns(Ms.size(), N);
+    const std::vector<int> Ks(Ms.size(), K);
+    const std::vector<int> StrideAs(Ms.size(), K);
+    const std::vector<int> StrideBs(Ms.size(), N);
+    const std::vector<int> StrideCs(Ms.size(), N);
+
+    this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam());
+}
+
+TEST_P(RCR_F16_F16_F16_LargeK, TestLargeKBatch)
+{
+    const std::vector<int> Ms{188, 210};
+    constexpr int N = 768;
+    constexpr int K = 4096;
+
+    const std::vector<int> Ns(Ms.size(), N);
+    const std::vector<int> Ks(Ms.size(), K);
+    const std::vector<int> StrideAs(Ms.size(), K);
+    const std::vector<int> StrideBs(Ms.size(), K);
+    const std::vector<int> StrideCs(Ms.size(), N);
+
+    this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam());
+}
diff --git a/test/grouped_gemm/test_grouped_gemm_util.hpp b/test/grouped_gemm/test_grouped_gemm_util.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b61118b5120e577defe9453c741875cafe33d168
--- /dev/null
+++ b/test/grouped_gemm/test_grouped_gemm_util.hpp
@@ -0,0 +1,249 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <array>
+#include <string>
+#include <sstream>
+#include <tuple>
+#include <vector>
+#include <gtest/gtest.h>
+
+#include "ck/ck.hpp"
+#include "ck/stream_config.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/utility/data_type.hpp"
+#include "ck/utility/sequence.hpp"
+#include "ck/utility/tuple.hpp"
+#include "ck/utility/number.hpp"
+#include "profiler/profile_grouped_gemm_impl.hpp"
+
+namespace ck {
+namespace test {
+
+template <typename Range>
+std::string serialize_range(const Range& range)
+{
+    std::stringstream ss;
+    for(auto& r : range)
+    {
+        ss << r << ", ";
+    }
+    std::string str = ss.str();
+    return std::string(str.begin(), str.end() - 2);
+}
+
+template <typename Tuple>
+class TestGroupedGemm : public testing::TestWithParam<int>
+{
+    protected:
+    using ALayout   = std::tuple_element_t<0, Tuple>;
+    using BLayout   = std::tuple_element_t<1, Tuple>;
+    using ELayout   = std::tuple_element_t<2, Tuple>;
+    using ADataType = std::tuple_element_t<3, Tuple>;
+    using BDataType = std::tuple_element_t<4, Tuple>;
+    using EDataType = std::tuple_element_t<5, Tuple>;
+
+    public:
+    static constexpr bool verify_     = true;
+    static constexpr int init_method_ = 1; // decimal value initialization
+    static constexpr bool log_        = false;
+    static constexpr bool bench_      = false; // measure kernel performance
+
+    void SetUp() override {}
+
+    void Run(const std::vector<int>& Ms,
+             const std::vector<int>& Ns,
+             const std::vector<int>& Ks,
+             const std::vector<int>& StrideAs,
+             const std::vector<int>& StrideBs,
+             const std::vector<int>& StrideCs,
+             int kbatch = 1)
+    {
+        bool pass = ck::profiler::profile_grouped_gemm_impl<ADataType,
+                                                            BDataType,
+                                                            EDataType,
+                                                            float,
+                                                            ALayout,
+                                                            BLayout,
+                                                            ELayout>(
+            verify_, init_method_, log_, bench_, Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, kbatch);
+        EXPECT_TRUE(pass);
+    }
+};
+
+template <typename ALayout,
+          typename BLayout,
+          typename ELayout,
+          tensor_operation::device::GemmSpecialization GemmSpec,
+          ck::index_t KPerBlock,
+          ck::index_t K1,
+          ck::index_t ABlockTransferSrcScalarPerVector,
+          ck::index_t BBlockTransferSrcScalarPerVector,
+          index_t CDEBlockTransferScalarPerVector_NPerBlock>
+struct DeviceGroupedGemmSplitkInstanceWrapper
+{
+    using F16         = half_t;
+    using F32         = float;
+    using Row         = ck::tensor_layout::gemm::RowMajor;
+    using Col         = ck::tensor_layout::gemm::ColumnMajor;
+    using PassThrough = tensor_operation::element_wise::PassThrough;
+
+    using EmptyTuple = ck::Tuple<>;
+
+    template <ck::index_t... Is>
+    using S = ck::Sequence<Is...>;
+
+    template <ck::index_t N>
+    using I = ck::Number<N>;
+
+    using ABlockTransferThreadClusterArrageOrder =
+        std::conditional_t<std::is_same_v<ALayout, Row>, S<0, 2, 1, 3>, S<0, 1, 3, 2>>;
+    using ABlockTransferSrcAccessOrder =
+        std::conditional_t<std::is_same_v<ALayout, Row>, S<0, 2, 1, 3>, S<0, 1, 3, 2>>;
+    using ABlockTransferSrcVectorDim = std::conditional_t<std::is_same_v<ALayout, Row>, I<3>, I<2>>;
+    using ABlockTransferDstScalarPerVector_K1 =
+        std::conditional_t<std::is_same_v<ALayout, Row>, I<8>, I<2>>;
+    using ABlockLdsAddExtraM = std::conditional_t<std::is_same_v<ALayout, Row>, I<1>, I<0>>;
+
+    using BBlockTransferThreadClusterArrageOrder =
+        std::conditional_t<std::is_same_v<BLayout, Row>, S<0, 1, 3, 2>, S<0, 2, 1, 3>>;
+    using BBlockTransferSrcAccessOrder =
+        std::conditional_t<std::is_same_v<BLayout, Row>, S<0, 1, 3, 2>, S<0, 2, 1, 3>>;
+    using BBlockTransferSrcVectorDim = std::conditional_t<std::is_same_v<BLayout, Row>, I<2>, I<3>>;
+    using BBlockTransferDstScalarPerVector_K1 =
+        std::conditional_t<std::is_same_v<ALayout, Row>, I<2>, I<8>>;
+    using BBlockLdsAddExtraM = std::conditional_t<std::is_same_v<ALayout, Row>, I<0>, I<1>>;
+
+    using DeviceGroupedGemmSplitKInstance =
+        tensor_operation::device::DeviceGroupedGemmXdlSplitKCShuffle<
+            ALayout,
+            BLayout,
+            EmptyTuple,
+            ELayout,
+            F16,
+            F16,
+            F32,
+            F16,
+            EmptyTuple,
+            F16,
+            PassThrough,
+            PassThrough,
+            PassThrough,
+            GemmSpec,
+            1,
+            128,
+            128,
+            128,
+            KPerBlock,
+            K1,
+            K1,
+            32,
+            32,
+            4,
+            2,
+            S<1, 4, 32, 1>,
+            ABlockTransferThreadClusterArrageOrder,
+            ABlockTransferSrcAccessOrder,
+            ABlockTransferSrcVectorDim::value,
+            ABlockTransferSrcScalarPerVector,
+            ABlockTransferDstScalarPerVector_K1::value,
+            ABlockLdsAddExtraM::value,
+            S<1, 4, 32, 1>,
+            BBlockTransferThreadClusterArrageOrder,
+            BBlockTransferSrcAccessOrder,
+            BBlockTransferSrcVectorDim::value,
+            BBlockTransferSrcScalarPerVector,
+            BBlockTransferDstScalarPerVector_K1::value,
+            BBlockLdsAddExtraM::value,
+            1,
+            1,
+            S<1, 16, 1, 8>,
+            CDEBlockTransferScalarPerVector_NPerBlock>;
+
+    bool IsSupported(const std::vector<int>& Ms,
+                     const std::vector<int>& Ns,
+                     const std::vector<int>& Ks,
+                     const std::vector<int>& StrideAs,
+                     const std::vector<int>& StrideBs,
+                     const std::vector<int>& StrideCs,
+                     int kbatch = 1) const
+    {
+        std::size_t n_groups = Ms.size();
+        EXPECT_TRUE(Ns.size() == n_groups && Ks.size() == n_groups && StrideAs.size() == n_groups &&
+                    StrideBs.size() == n_groups && StrideCs.size() == n_groups)
+            << "The number of groups is not consistent!";
+
+        std::vector<tensor_operation::device::GemmDesc> gemm_descs;
+
+        for(std::size_t i = 0; i < n_groups; ++i)
+        {
+            gemm_descs.push_back(tensor_operation::device::GemmDesc{
+                Ms[i], Ns[i], Ks[i], StrideAs[i], StrideBs[i], StrideCs[i], {}});
+        }
+
+        std::vector<const void*> p_As(n_groups, nullptr);
+        std::vector<const void*> p_Bs(n_groups, nullptr);
+        std::vector<void*> p_Cs(n_groups, nullptr);
+        auto p_Ds = std::vector<std::array<const void*, 0>>{};
+
+        auto ggemm_instance = DeviceGroupedGemmSplitKInstance{};
+        auto argument       = ggemm_instance.MakeArgument(
+            p_As, p_Bs, p_Ds, p_Cs, gemm_descs, PassThrough{}, PassThrough{}, PassThrough{});
+        if(kbatch > 1)
+        {
+            ggemm_instance.SetKBatchSize(argument, kbatch);
+        }
+
+        return ggemm_instance.IsSupportedArgument(argument);
+    }
+
+    float Run(const std::vector<int>& Ms,
+              const std::vector<int>& Ns,
+              const std::vector<int>& Ks,
+              const std::vector<int>& StrideAs,
+              const std::vector<int>& StrideBs,
+              const std::vector<int>& StrideCs,
+              int kbatch = 1) const
+    {
+        std::size_t n_groups = Ms.size();
+        EXPECT_TRUE(Ns.size() == n_groups && Ks.size() == n_groups && StrideAs.size() == n_groups &&
+                    StrideBs.size() == n_groups && StrideCs.size() == n_groups)
+            << "The number of groups is not consistent!";
+
+        std::vector<tensor_operation::device::GemmDesc> gemm_descs;
+
+        for(std::size_t i = 0; i < n_groups; ++i)
+        {
+            gemm_descs.push_back(tensor_operation::device::GemmDesc{
+                Ms[i], Ns[i], Ks[i], StrideAs[i], StrideBs[i], StrideCs[i], {}});
+        }
+
+        std::vector<const void*> p_As(n_groups, nullptr);
+        std::vector<const void*> p_Bs(n_groups, nullptr);
+        std::vector<void*> p_Cs(n_groups, nullptr);
+        auto p_Ds = std::vector<std::array<const void*, 0>>{};
+
+        auto ggemm_instance = DeviceGroupedGemmSplitKInstance{};
+        auto argument       = ggemm_instance.MakeArgument(
+            p_As, p_Bs, p_Ds, p_Cs, gemm_descs, PassThrough{}, PassThrough{}, PassThrough{});
+        if(kbatch > 1)
+        {
+            ggemm_instance.SetKBatchSize(argument, kbatch);
+        }
+
+        EXPECT_TRUE(ggemm_instance.IsSupportedArgument(argument));
+        auto invoker = ggemm_instance.MakeInvoker();
+        DeviceMem gemm_desc_workspace(ggemm_instance.GetWorkSpaceSize(&argument));
+        ggemm_instance.SetWorkSpacePointer(&argument, gemm_desc_workspace.GetDeviceBuffer());
+        return invoker.Run(argument, StreamConfig{nullptr, false});
+    }
+};
+
+} // namespace test
+} // namespace ck
diff --git a/test/magic_number_division/magic_number_division.cpp b/test/magic_number_division/magic_number_division.cpp
index 680fddf1933611dc3088cb673ece0dba9e1f3911..253f21e91f80c96c081108b47b045c3902b36926 100644
--- a/test/magic_number_division/magic_number_division.cpp
+++ b/test/magic_number_division/magic_number_division.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/test/normalization/test_groupnorm_fp16.cpp b/test/normalization/test_groupnorm_fp16.cpp
index 60d3b13959fc835e4d42244bfff5437561f3178a..325ea75fe5668897a5b489216db042180b14a8ef 100644
--- a/test/normalization/test_groupnorm_fp16.cpp
+++ b/test/normalization/test_groupnorm_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
 #include "profiler/profile_groupnorm_impl.hpp"
diff --git a/test/normalization/test_groupnorm_fp32.cpp b/test/normalization/test_groupnorm_fp32.cpp
index 3542f73a62f05f690f12270b44c92816816743c3..ec88442fc02623767f299de36f8b6c5810569cc6 100644
--- a/test/normalization/test_groupnorm_fp32.cpp
+++ b/test/normalization/test_groupnorm_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
 #include "profiler/profile_groupnorm_impl.hpp"
diff --git a/test/normalization/test_layernorm2d_fp16.cpp b/test/normalization/test_layernorm2d_fp16.cpp
index d627cbe7f1187e904ce8bfa452274af509708101..2222740fcceaa4b9dfe22a299dc0cd3e054e5d35 100644
--- a/test/normalization/test_layernorm2d_fp16.cpp
+++ b/test/normalization/test_layernorm2d_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
 #include "profiler/profile_layernorm_impl.hpp"
diff --git a/test/normalization/test_layernorm2d_fp32.cpp b/test/normalization/test_layernorm2d_fp32.cpp
index de4133aa8369b08be2fe3c3741081d121ceb59f6..30fbe06c60d8ebfcbdd87b3e27c11c601d81635b 100644
--- a/test/normalization/test_layernorm2d_fp32.cpp
+++ b/test/normalization/test_layernorm2d_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
 #include "profiler/profile_layernorm_impl.hpp"
diff --git a/test/pool_fwd/test_avg_pool2d_fwd.cpp b/test/pool_fwd/test_avg_pool2d_fwd.cpp
index 4e5f1e0e919dc41403cacd46e7619e8e6093f5fc..72749fd6e1563f9b432a61871982bc7f6a2e9661 100644
--- a/test/pool_fwd/test_avg_pool2d_fwd.cpp
+++ b/test/pool_fwd/test_avg_pool2d_fwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
 #include "profiler/profile_pool2d_fwd_impl.hpp"
diff --git a/test/pool_fwd/test_avg_pool3d_fwd.cpp b/test/pool_fwd/test_avg_pool3d_fwd.cpp
index 0d6b105b1b9aaaadb4ff98d4ad325f3850aa747e..00cc3740fa5cc775d43bf60300b693789c7e2d4d 100644
--- a/test/pool_fwd/test_avg_pool3d_fwd.cpp
+++ b/test/pool_fwd/test_avg_pool3d_fwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
 #include "profiler/profile_pool3d_fwd_impl.hpp"
diff --git a/test/pool_fwd/test_max_pool2d_fwd.cpp b/test/pool_fwd/test_max_pool2d_fwd.cpp
index d16ac7fabac44156e7cca553a62228f397f9093f..1cf1314f4682acb528a30013011b1cdb21ea965f 100644
--- a/test/pool_fwd/test_max_pool2d_fwd.cpp
+++ b/test/pool_fwd/test_max_pool2d_fwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
 #include "profiler/profile_pool2d_fwd_impl.hpp"
diff --git a/test/pool_fwd/test_max_pool3d_fwd.cpp b/test/pool_fwd/test_max_pool3d_fwd.cpp
index f084dd9cbf9f0cffce33947c63c2f721442264ef..0b0de4d90ec57aa57db1d8a173d419f879a1ea87 100644
--- a/test/pool_fwd/test_max_pool3d_fwd.cpp
+++ b/test/pool_fwd/test_max_pool3d_fwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
 #include "profiler/profile_pool3d_fwd_impl.hpp"
diff --git a/test/pool_fwd/test_pool_fwd_common.hpp b/test/pool_fwd/test_pool_fwd_common.hpp
index a78785171d9dc53da9f85782bb312176e1012bd6..f018635170d9b3eb2235563fd04faefec38fea11 100644
--- a/test/pool_fwd/test_pool_fwd_common.hpp
+++ b/test/pool_fwd/test_pool_fwd_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
 #include "ck/ck.hpp"
diff --git a/test/reduce/reduce_no_index.cpp b/test/reduce/reduce_no_index.cpp
index 3f4d0676b4da84751dd74cd4e76266059c056d67..1ab452442d0b219820e208420a4c4c04a019e9a1 100644
--- a/test/reduce/reduce_no_index.cpp
+++ b/test/reduce/reduce_no_index.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <getopt.h>
 
diff --git a/test/reduce/reduce_with_index.cpp b/test/reduce/reduce_with_index.cpp
index c616a68e741e14119d70e0d1d4ba508e371dea78..0301669c5a7578b7c6a0c6ba160685b13295b492 100644
--- a/test/reduce/reduce_with_index.cpp
+++ b/test/reduce/reduce_with_index.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <getopt.h>
 
diff --git a/test/reference_conv_fwd/reference_conv_fwd.cpp b/test/reference_conv_fwd/reference_conv_fwd.cpp
index 1f9ba0064cb3b22c394c75d5119f71a188d11a5a..b3328e4b365268af641a6ecb1a65ded872f94ddd 100644
--- a/test/reference_conv_fwd/reference_conv_fwd.cpp
+++ b/test/reference_conv_fwd/reference_conv_fwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cmath>
 #include <cstdlib>
diff --git a/test/softmax/test_softmax_interface.cpp b/test/softmax/test_softmax_interface.cpp
index 8cac0ba0f52b035dae2fb52f0b0346d3267e5b7b..25f666f0ea99eb12e0fa248fdd573aebed0de7a6 100644
--- a/test/softmax/test_softmax_interface.cpp
+++ b/test/softmax/test_softmax_interface.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <stdexcept>
 #include <vector>
diff --git a/test/softmax/test_softmax_rank3.cpp b/test/softmax/test_softmax_rank3.cpp
index 5691ee3f6cd5be1bd8ff1075cb57fcfd86aa8322..24ad912d8d740c70ae00de54b6516866cc8b752a 100644
--- a/test/softmax/test_softmax_rank3.cpp
+++ b/test/softmax/test_softmax_rank3.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <algorithm>
 #include <stdexcept>
diff --git a/test/softmax/test_softmax_rank4.cpp b/test/softmax/test_softmax_rank4.cpp
index f0b22df25ebdfaaecbf4548ec2b44fdcd477aae4..b58301fb1121d968e6d419a636b76b08735b912e 100644
--- a/test/softmax/test_softmax_rank4.cpp
+++ b/test/softmax/test_softmax_rank4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <algorithm>
 #include <stdexcept>
diff --git a/test/softmax/test_softmax_util.hpp b/test/softmax/test_softmax_util.hpp
index 40b300cf9927de3556c586805923de9ab97db466..e36231de8523279cbb2fd6cd0a98036a6d807b6e 100644
--- a/test/softmax/test_softmax_util.hpp
+++ b/test/softmax/test_softmax_util.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/test/space_filling_curve/space_filling_curve.cpp b/test/space_filling_curve/space_filling_curve.cpp
index c7f6759e819512f55c17463ffdb503e55bb8c59e..a192ecb28f1b6c67fa7ec27790349e041c6718fb 100644
--- a/test/space_filling_curve/space_filling_curve.cpp
+++ b/test/space_filling_curve/space_filling_curve.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <vector>
 #include <iostream>
diff --git a/test/wmma_op/wmma_op.cpp b/test/wmma_op/wmma_op.cpp
index 761c15f1dd8a9d22de14a2998d73a3877b1f3003..47d8c7ed6f35236192cc9c4cfacde2e754560241 100644
--- a/test/wmma_op/wmma_op.cpp
+++ b/test/wmma_op/wmma_op.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <algorithm>
 #include <cstdlib>
diff --git a/test/wmma_op/wmma_op_util.hpp b/test/wmma_op/wmma_op_util.hpp
index c70e6a407de6fce20df64dfa29f9aa19de1d9e30..49782bce6e21d4a250b0d07526abf4e39ac57b9d 100644
--- a/test/wmma_op/wmma_op_util.hpp
+++ b/test/wmma_op/wmma_op_util.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once