Composable kernel init integration v3 (#1097)

* Squashed 'src/composable_kernel/' content from commit f6edda61 git-subtree-dir: src/composable_kernel git-subtree-split: f6edda61 * add solver ConvIgemmFwdV6r1DlopsNchwKcyxNkhw; rename static ck source files * Squashed 'src/composable_kernel/' changes from f6edda61..5781adf5 5781adf5 Update develop (#5) (#6) 97e6d514 Merge pull request #4 from ROCmSoftwarePlatform/separate_online_compile 7b1ec41e refactor 49c33aae refactor 54b3e73d rename git-subtree-dir: src/composable_kernel git-subtree-split: 5781adf5 * fix * refactor * remove online compilation from CK * refactor * fix * add ctest * add c-style pointer cast * vector/scalar pointer cast use c-style pointer cast instead of reinterpret_cast * fix clang warning suppression * tidy * suppress cppcheck * fix enum issue * revert chagnes to hip build * fix kernel filename * update CK build script * rename * rename * make innner product compatiable on gfx900 * Update src/include/miopen/solver/ck_utility_common.hpp Co-authored-by: JD <Jehandad.Khan@amd.com> * compiler parameter use stream * use int instead of index_t in kernel wrapper * DynamicBuffer, StaticBuffer, amd_buffer_load support customized value for invalid element * refactor * refactor * change cmakelist * change ck common utility * fix Co-authored-by: JD <Jehandad.Khan@amd.com>

Composable kernel init integration v3 (#1097)
* Squashed 'src/composable_kernel/' content from commit f6edda61 git-subtree-dir: src/composable_kernel git-subtree-split: f6edda61 * add solver ConvIgemmFwdV6r1DlopsNchwKcyxNkhw; rename static ck source files * Squashed 'src/composable_kernel/' changes from f6edda61..5781adf5 5781adf5 Update develop (#5) (#6) 97e6d514 Merge pull request #4 from ROCmSoftwarePlatform/separate_online_compile 7b1ec41e refactor 49c33aae refactor 54b3e73d rename git-subtree-dir: src/composable_kernel git-subtree-split: 5781adf5 * fix * refactor * remove online compilation from CK * refactor * fix * add ctest * add c-style pointer cast * vector/scalar pointer cast use c-style pointer cast instead of reinterpret_cast * fix clang warning suppression * tidy * suppress cppcheck * fix enum issue * revert chagnes to hip build * fix kernel filename * update CK build script * rename * rename * make innner product compatiable on gfx900 * Update src/include/miopen/solver/ck_utility_common.hpp Co-authored-by: JD <Jehandad.Khan@amd.com> * compiler parameter use stream * use int instead of index_t in kernel wrapper * DynamicBuffer, StaticBuffer, amd_buffer_load support customized value for invalid element * refactor * refactor * change cmakelist * change ck common utility * fix Co-authored-by: JD <Jehandad.Khan@amd.com>
6fe3627a · Chao Liu · GitHub · 6fe3627a · 6fe3627a · 6fe3627a
Commit 6fe3627a authored Aug 19, 2021 by Chao Liu Committed by GitHub Aug 19, 2021
20 changed files
--- a/.clang-format
+++ b/.clang-format
+---
+Language:        Cpp
+AccessModifierOffset: 0
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: true
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlinesLeft: true
+AlignOperands:   true
+AlignTrailingComments: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: true
+AllowShortCaseLabelsOnASingleLine: true
+AllowShortFunctionsOnASingleLine: All
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: true
+BinPackArguments: false
+BinPackParameters: false
+BraceWrapping:   
+  AfterClass:      true
+  AfterControlStatement: true
+  AfterEnum:       true
+  AfterFunction:   true
+  AfterNamespace:  false
+  AfterObjCDeclaration: true
+  AfterStruct:     true
+  AfterUnion:      true
+  BeforeCatch:     true
+  BeforeElse:      true
+  IndentBraces:    false
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Custom
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+ColumnLimit:     100
+CommentPragmas:  '^ IWYU pragma:'
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DerivePointerAlignment: false
+DisableFormat:   false
+ExperimentalAutoDetectBinPacking: false
+ForEachMacros:   [ foreach, Q_FOREACH, BOOST_FOREACH ]
+IncludeCategories: 
+  - Regex:           '^"(llvm|llvm-c|clang|clang-c)/'
+    Priority:        2
+  - Regex:           '^(<|"(gtest|isl|json)/)'
+    Priority:        3
+  - Regex:           '.*'
+    Priority:        1
+IndentCaseLabels: false
+IndentWidth:     4
+IndentWrappedFunctionNames: false
+KeepEmptyLinesAtTheStartOfBlocks: true
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBlockIndentWidth: 2
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: true
+PenaltyBreakBeforeFirstCallParameter: 19
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 60
+PointerAlignment: Left
+ReflowComments:  true
+SortIncludes:    false
+SpaceAfterCStyleCast: false
+# SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeParens: Never
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles:  false
+SpacesInContainerLiterals: true
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard:        Cpp11
+TabWidth:        8
+UseTab:          Never
+...
--- a/.clang-tidy
+++ b/.clang-tidy
+CheckOptions:
+  - key: bugprone-reserved-identifier.AllowedIdentifiers
+    value: '__HIP_PLATFORM_HCC__;__HIP_ROCclr__'
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
+cmake_minimum_required(VERSION 3.5)
+project(composable_kernel)
+list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
+include(CheckCXXCompilerFlag)
+## C++
+enable_language(CXX)
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
+message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}")
+## OpenMP
+if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+	# workaround issue hipcc in rocm3.5 cannot find openmp
+	set(OpenMP_CXX "${CMAKE_CXX_COMPILER}")
+	set(OpenMP_CXX_FLAGS "-fopenmp=libomp -Wno-unused-command-line-argument")
+	set(OpenMP_CXX_LIB_NAMES "libomp" "libgomp" "libiomp5")
+	set(OpenMP_libomp_LIBRARY ${OpenMP_CXX_LIB_NAMES})
+	set(OpenMP_libgomp_LIBRARY ${OpenMP_CXX_LIB_NAMES})
+	set(OpenMP_libiomp5_LIBRARY ${OpenMP_CXX_LIB_NAMES})
+else()
+	find_package(OpenMP REQUIRED)
+endif()
+message("OpenMP_CXX_LIB_NAMES: ${OpenMP_CXX_LIB_NAMES}")
+message("OpenMP_gomp_LIBRARY: ${OpenMP_gomp_LIBRARY}")
+message("OpenMP_pthread_LIBRARY: ${OpenMP_pthread_LIBRARY}")
+message("OpenMP_CXX_FLAGS: ${OpenMP_CXX_FLAGS}")
+set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+link_libraries(${OpenMP_gomp_LIBRARY})
+link_libraries(${OpenMP_pthread_LIBRARY})
+## HIP
+find_package(HIP REQUIRED)
+message(STATUS "Build with HIP ${hip_VERSION}")
+## half
+#find_path(HALF_INCLUDE_DIR half.hpp)
+message("HALF_INCLUDE_DIR: ${HALF_INCLUDE_DIR}")
+# CMAKE_CXX_FLAGS
+SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV")
+if(BUILD_DEV)
+    string(APPEND CMAKE_CXX_FLAGS " -Werror -Weverything")
+endif()
+message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
+## tidy
+include(EnableCompilerWarnings)
+set(MIOPEN_TIDY_ERRORS ERRORS * -readability-inconsistent-declaration-parameter-name)
+if(CMAKE_CXX_COMPILER MATCHES ".*hcc" OR CMAKE_CXX_COMPILER MATCHES ".*clang\\+\\+")
+    set(MIOPEN_TIDY_CHECKS -modernize-use-override -readability-non-const-parameter)
+# Enable tidy on hip
+elseif(MIOPEN_BACKEND STREQUAL "HIP" OR MIOPEN_BACKEND STREQUAL "HIPNOGPU")
+    set(MIOPEN_TIDY_ERRORS ALL)
+endif()
+include(ClangTidy)
+enable_clang_tidy(
+    CHECKS
+        *
+        -abseil-*
+        -android-cloexec-fopen
+        # Yea we shouldn't be using rand()
+        -cert-msc30-c
+        -bugprone-exception-escape
+        -bugprone-macro-parentheses
+        -cert-env33-c
+        -cert-msc32-c
+        -cert-msc50-cpp
+        -cert-msc51-cpp
+        -cert-dcl37-c
+        -cert-dcl51-cpp
+        -clang-analyzer-alpha.core.CastToStruct
+        -clang-analyzer-optin.performance.Padding
+        -clang-diagnostic-deprecated-declarations
+        -clang-diagnostic-extern-c-compat
+        -clang-diagnostic-unused-command-line-argument
+        -cppcoreguidelines-avoid-c-arrays
+        -cppcoreguidelines-avoid-magic-numbers
+        -cppcoreguidelines-explicit-virtual-functions
+        -cppcoreguidelines-init-variables
+        -cppcoreguidelines-macro-usage
+        -cppcoreguidelines-non-private-member-variables-in-classes
+        -cppcoreguidelines-pro-bounds-array-to-pointer-decay
+        -cppcoreguidelines-pro-bounds-constant-array-index
+        -cppcoreguidelines-pro-bounds-pointer-arithmetic
+        -cppcoreguidelines-pro-type-member-init
+        -cppcoreguidelines-pro-type-reinterpret-cast
+        -cppcoreguidelines-pro-type-union-access
+        -cppcoreguidelines-pro-type-vararg
+        -cppcoreguidelines-special-member-functions
+        -fuchsia-*
+        -google-explicit-constructor
+        -google-readability-braces-around-statements
+        -google-readability-todo
+        -google-runtime-int
+        -google-runtime-references
+        -hicpp-vararg
+        -hicpp-braces-around-statements
+        -hicpp-explicit-conversions
+        -hicpp-named-parameter
+        -hicpp-no-array-decay
+        # We really shouldn't use bitwise operators with signed integers, but
+        # opencl leaves us no choice
+        -hicpp-avoid-c-arrays
+        -hicpp-signed-bitwise
+        -hicpp-special-member-functions
+        -hicpp-uppercase-literal-suffix
+        -hicpp-use-auto
+        -hicpp-use-equals-default
+        -hicpp-use-override
+        -llvm-header-guard
+        -llvm-include-order
+        #-llvmlibc-*
+        -llvmlibc-restrict-system-libc-headers
+        -llvmlibc-callee-namespace
+        -llvmlibc-implementation-in-namespace
+        -llvm-else-after-return
+        -llvm-qualified-auto
+        -misc-misplaced-const
+        -misc-non-private-member-variables-in-classes
+        -misc-no-recursion
+        -modernize-avoid-bind
+        -modernize-avoid-c-arrays
+        -modernize-pass-by-value
+        -modernize-use-auto
+        -modernize-use-default-member-init
+        -modernize-use-equals-default
+        -modernize-use-trailing-return-type
+        -modernize-use-transparent-functors
+        -performance-unnecessary-value-param
+        -readability-braces-around-statements
+        -readability-else-after-return
+        # we are not ready to use it, but very useful
+        -readability-function-cognitive-complexity
+        -readability-isolate-declaration
+        -readability-magic-numbers
+        -readability-named-parameter
+        -readability-uppercase-literal-suffix
+        -readability-convert-member-functions-to-static
+        -readability-qualified-auto
+        -readability-redundant-string-init
+        # too many narrowing conversions in our code
+        -bugprone-narrowing-conversions
+        -cppcoreguidelines-narrowing-conversions
+        -altera-struct-pack-align
+        -cppcoreguidelines-prefer-member-initializer
+        ${MIOPEN_TIDY_CHECKS}
+        ${MIOPEN_TIDY_ERRORS}
+    HEADER_FILTER
+        "\.hpp$"
+    EXTRA_ARGS
+        -DMIOPEN_USE_CLANG_TIDY
+)
+include(CppCheck)
+enable_cppcheck(
+    CHECKS
+        warning
+        style
+        performance
+        portability
+    SUPPRESS
+        ConfigurationNotChecked
+        constStatement
+        duplicateCondition
+        noExplicitConstructor
+        passedByValue
+        preprocessorErrorDirective
+        shadowVariable
+        unusedFunction
+        unusedPrivateFunction
+        unusedStructMember
+        unmatchedSuppression
+    FORCE
+    SOURCES
+        host/host_tensor/src
+        host/driver_offline/src
+        composable_kernel/src/kernel_wrapper
+    INCLUDE
+        host/host_tensor/include
+        host/solver/include
+        host/driver_offline/include
+        composable_kernel/include/*
+        ${CMAKE_CURRENT_SOURCE_DIR}/include
+        ${CMAKE_CURRENT_BINARY_DIR}/include
+    DEFINE
+        CPPCHECK=1
+        __linux__=1
+)
+add_subdirectory(host)
--- a/README.md
+++ b/README.md
+# How to build and run
+# Docker
+```
+docker run                                                                   \
+-it                                                                          \
+--rm                                                                         \
+--privileged                                                                 \
+--group-add sudo                                                             \
+-w /root/workspace                                                           \
+-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace                                \
+rocm/tensorflow:rocm4.2-tf2.4-dev                                            \
+/bin/bash
+```
+# Install Boost for online compilation
+https://www.boost.org/doc/libs/1_66_0/more/getting_started/unix-variants.html#easy-build-and-install
+# Build
+Add path of Boost
+```
+ export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
+```
+```
+mkdir build && cd build
+```
+cmake cmd. Need to Specify target ID, example below is gfx908
+```
+cmake                                                                                                                              \
+-D CMAKE_BUILD_TYPE=Release                                                                                                                    \
+-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 -O3 --amdgpu-target=gfx908 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only -save-temps=$PWD"   \
+-D HIP_ONLINE_COMPILER_FLAGS="-DCK_AMD_GPU_GFX908"                                                                                             \
+-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                                                                                                      \
+-D CMAKE_PREFIX_PATH=/opt/rocm                                                                                                                 \
+-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                                                                              \
+..
+```
+Build drivers:   \
+``conv_fwd_driver_offline`` is (offline compilation) driver for forward convolution,  \
+``conv_bwd_driver_offline`` is (offline compilation) driver for backward-data convolution  \
+``conv_fwd_driver_online`` is (online compilation) driver for forward convolution
+```
+ make -j conv_fwd_driver_offline
+ make -j conv_bwd_driver_offline
+ make -j conv_fwd_driver_online
+```
+# Run
+* layout: 0 = NCHW; 1 = NHWC
+* algo: algorithm
+* verify: 0 = no verification; 1 = do verification
+* init: 0 ~ 5. initialization method
+* log: 0 = no log; 1 = do log
+* repeat: number of time kernel being launched
+```
+######################################################## layout  algo  verify  init  log  repeat  N__ K___ C___ Y X Hi_ Wi__ Strides Dilations LeftPads RightPads
+ ./host/driver_offline/conv_fwd_driver_offline                0     4       0     0    0       1  128  256  192 3 3  71   71     2 2       1 1      1 1       1 1
+ ./host/driver_offline/conv_fwd_driver_offline                0     4       0     0    0       1  256 1024  256 3 3  14   14     1 1       1 1      1 1       1 1
+ ./host/driver_offline/conv_fwd_driver_offline                1     5       0     0    0       1  128  256  192 3 3  71   71     2 2       1 1      1 1       1 1
+ ./host/driver_offline/conv_fwd_driver_offline                1     5       0     0    0       1  256 1024  256 3 3  14   14     1 1       1 1      1 1       1 1
+ ./host/driver_offline/conv_bwd_driver_offline                1     5       0     0    0       1  256  256 1024 3 3  14   14     1 1       1 1      1 1       1 1
+```
+# Result
+Forward convoltuion, FP16, NCHW
+```
+./host/driver_offline/conv_fwd_driver_offline                0     4       0     0    0       1  128  256  192 3 3  71   71     2 2       1 1      1 1       1 1
+layout: 0
+in: dim 4, lengths {128, 192, 71, 71}, strides {967872, 5041, 71, 1}
+wei: dim 4, lengths {256, 192, 3, 3}, strides {1728, 9, 3, 1}
+out: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1296, 36, 1}
+InLeftPads size 2, {1, 1, }
+InRightPads size 2, {1, 1, }
+ConvStrides size 2, {2, 2, }
+ConvDilations size 2, {1, 1, }
+device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw
+a_k0_m_k1_grid_desc{216, 256, 8}
+b_k0_n_k1_grid_desc{216, 165888, 8}
+c_m_n_grid_desc{ 256, 165888}
+launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
+Warm up
+Start running 1 times...
+Average time : 1.4155 ms, 103.686 TFlop/s
+```
+Forward convoltuion, FP16, NCHW
+```
+ ./host/driver_offline/conv_fwd_driver_offline                0     4       0     0    0       1  256 1024  256 3 3  14   14     1 1       1 1      1 1       1 1
+ layout: 0
+in: dim 4, lengths {256, 256, 14, 14}, strides {50176, 196, 14, 1}
+wei: dim 4, lengths {1024, 256, 3, 3}, strides {2304, 9, 3, 1}
+out: dim 4, lengths {256, 1024, 14, 14}, strides {200704, 196, 14, 1}
+InLeftPads size 2, {1, 1, }
+InRightPads size 2, {1, 1, }
+ConvStrides size 2, {1, 1, }
+ConvDilations size 2, {1, 1, }
+device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw
+a_k0_m_k1_grid_desc{288, 1024, 8}
+b_k0_n_k1_grid_desc{288, 50176, 8}
+c_m_n_grid_desc{ 1024, 50176}
+launch_and_time_kernel: grid_dim {1568, 1, 1}, block_dim {256, 1, 1}
+Warm up
+Start running 1 times...
+Average time : 2.21357 ms, 106.959 TFlop/s
+ ```
+ Forward convolution, FP16, NHWC
+ ```
+ ./host/driver_offline/conv_fwd_driver_offline                1     5       0     0    0       1  128  256  192 3 3  71   71     2 2       1 1      1 1       1 1
+ layout: 1
+in: dim 4, lengths {128, 71, 71, 192}, strides {967872, 13632, 192, 1}
+wei: dim 4, lengths {256, 3, 3, 192}, strides {1728, 576, 192, 1}
+out: dim 4, lengths {128, 36, 36, 256}, strides {331776, 9216, 256, 1}
+InLeftPads size 2, {1, 1, }
+InRightPads size 2, {1, 1, }
+ConvStrides size 2, {2, 2, }
+ConvDilations size 2, {1, 1, }
+device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk
+a_k0_m_k1_grid_desc{216, 165888, 8}
+b_k0_n_k1_grid_desc{216, 256, 8}
+c_m_n_grid_desc{ 165888, 256}
+launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
+Warm up
+Start running 1 times...
+Average time : 1.12014 ms, 131.025 TFlop/s
+ ```
+ Forward convolution, FP16, NHWC
+ ```
+ ./host/driver_offline/conv_fwd_driver_offline                1     5       0     0    0       1  256 1024  256 3 3  14   14     1 1       1 1      1 1       1 1
+ layout: 1
+in: dim 4, lengths {256, 14, 14, 256}, strides {50176, 3584, 256, 1}
+wei: dim 4, lengths {1024, 3, 3, 256}, strides {2304, 768, 256, 1}
+out: dim 4, lengths {256, 14, 14, 1024}, strides {200704, 14336, 1024, 1}
+InLeftPads size 2, {1, 1, }
+InRightPads size 2, {1, 1, }
+ConvStrides size 2, {1, 1, }
+ConvDilations size 2, {1, 1, }
+device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk
+a_k0_m_k1_grid_desc{288, 50176, 8}
+b_k0_n_k1_grid_desc{288, 1024, 8}
+c_m_n_grid_desc{ 50176, 1024}
+launch_and_time_kernel: grid_dim {1568, 1, 1}, block_dim {256, 1, 1}
+Warm up
+Start running 1 times...
+Average time : 1.86877 ms, 126.693 TFlop/s
+ ```
+ Backward data convolution, FP16, NHWC
+ ```
+ ./host/driver_offline/conv_bwd_driver_offline       1     1       0     3    0       1  256  256 1024 3 3  14   14     1 1       1 1      1 1       1 1
+ layout: 1
+in: dim 4, lengths {256, 14, 14, 1024}, strides {200704, 14336, 1024, 1}
+wei: dim 4, lengths {256, 3, 3, 1024}, strides {9216, 3072, 1024, 1}
+out: dim 4, lengths {256, 14, 14, 256}, strides {50176, 3584, 256, 1}
+InLeftPads size 2, {1, 1, }
+InRightPads size 2, {1, 1, }
+ConvStrides size 2, {1, 1, }
+ConvDilations size 2, {1, 1, }
+device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk
+a_k0_m_k1_grid_desc{288, 50176, 8}
+b_k0_n_k1_grid_desc{288, 1024, 8}
+c_m_n_grid_desc{ 50176, 1024}
+launch_and_time_kernel: grid_dim {1568, 1, 1}, block_dim {256, 1, 1}
+Warm up
+Start running 1 times...
+Average time : 2.22461 ms, 106.428 TFlop/s
+```
--- a/cmake/Analyzers.cmake
+++ b/cmake/Analyzers.cmake
+################################################################################
+# 
+# MIT License
+# 
+# Copyright (c) 2017 Advanced Micro Devices, Inc.
+# 
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# 
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+# 
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+# 
+################################################################################
+if(NOT TARGET analyze)
+    add_custom_target(analyze)
+endif()
+function(mark_as_analyzer)
+    add_dependencies(analyze ${ARGN})
+endfunction()
--- a/cmake/ClangTidy.cmake
+++ b/cmake/ClangTidy.cmake
+################################################################################
+#
+# MIT License
+#
+# Copyright (c) 2017 Advanced Micro Devices, Inc.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+################################################################################
+include(CMakeParseArguments)
+include(Analyzers)
+get_filename_component(CLANG_TIDY_EXE_HINT "${CMAKE_CXX_COMPILER}" PATH)
+find_program(CLANG_TIDY_EXE
+    NAMES
+        clang-tidy
+        clang-tidy-5.0
+        clang-tidy-4.0
+        clang-tidy-3.9
+        clang-tidy-3.8
+        clang-tidy-3.7
+        clang-tidy-3.6
+        clang-tidy-3.5
+    HINTS
+        ${CLANG_TIDY_EXE_HINT}
+    PATH_SUFFIXES
+        compiler/bin
+    PATHS
+        /opt/rocm/llvm/bin
+        /opt/rocm/hcc
+        /usr/local/opt/llvm/bin
+)
+function(find_clang_tidy_version VAR)
+    execute_process(COMMAND ${CLANG_TIDY_EXE} -version OUTPUT_VARIABLE VERSION_OUTPUT)
+    separate_arguments(VERSION_OUTPUT_LIST UNIX_COMMAND "${VERSION_OUTPUT}")
+    list(FIND VERSION_OUTPUT_LIST "version" VERSION_INDEX)
+    if(VERSION_INDEX GREATER 0)
+        math(EXPR VERSION_INDEX "${VERSION_INDEX} + 1")
+        list(GET VERSION_OUTPUT_LIST ${VERSION_INDEX} VERSION)
+        set(${VAR} ${VERSION} PARENT_SCOPE)
+    else()
+        set(${VAR} "0.0" PARENT_SCOPE)
+    endif()
+endfunction()
+if( NOT CLANG_TIDY_EXE )
+    message( STATUS "Clang tidy not found" )
+    set(CLANG_TIDY_VERSION "0.0")
+else()
+    find_clang_tidy_version(CLANG_TIDY_VERSION)
+    message( STATUS "Clang tidy found: ${CLANG_TIDY_VERSION}")
+endif()
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+set(CLANG_TIDY_FIXIT_DIR ${CMAKE_BINARY_DIR}/fixits)
+file(MAKE_DIRECTORY ${CLANG_TIDY_FIXIT_DIR})
+set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${CLANG_TIDY_FIXIT_DIR})
+macro(enable_clang_tidy)
+    set(options ANALYZE_TEMPORARY_DTORS ALL)
+    set(oneValueArgs HEADER_FILTER)
+    set(multiValueArgs CHECKS ERRORS EXTRA_ARGS)
+    cmake_parse_arguments(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    string(REPLACE ";" "," CLANG_TIDY_CHECKS "${PARSE_CHECKS}")
+    string(REPLACE ";" "," CLANG_TIDY_ERRORS "${PARSE_ERRORS}")
+    set(CLANG_TIDY_EXTRA_ARGS)
+    foreach(ARG ${PARSE_EXTRA_ARGS})
+        list(APPEND CLANG_TIDY_EXTRA_ARGS "-extra-arg=${ARG}")
+    endforeach()
+    set(CLANG_TIDY_ALL)
+    if(PARSE_ALL)
+        set(CLANG_TIDY_ALL ALL)
+    endif()
+    message(STATUS "Clang tidy checks: ${CLANG_TIDY_CHECKS}")
+    if (${PARSE_ANALYZE_TEMPORARY_DTORS})
+        set(CLANG_TIDY_ANALYZE_TEMPORARY_DTORS "-analyze-temporary-dtors")
+    endif()
+    if (${CLANG_TIDY_VERSION} VERSION_LESS "3.9.0")
+        set(CLANG_TIDY_ERRORS_ARG "")
+    else()
+        set(CLANG_TIDY_ERRORS_ARG "-warnings-as-errors='${CLANG_TIDY_ERRORS}'")
+    endif()
+    if (${CLANG_TIDY_VERSION} VERSION_LESS "3.9.0")
+        set(CLANG_TIDY_QUIET_ARG "")
+    else()
+        set(CLANG_TIDY_QUIET_ARG "-quiet")
+    endif()
+    if(PARSE_HEADER_FILTER)
+        string(REPLACE "$" "$$" CLANG_TIDY_HEADER_FILTER "${PARSE_HEADER_FILTER}")
+    else()
+        set(CLANG_TIDY_HEADER_FILTER ".*")
+    endif()
+    set(CLANG_TIDY_COMMAND
+        ${CLANG_TIDY_EXE}
+        ${CLANG_TIDY_QUIET_ARG}
+        -p ${CMAKE_BINARY_DIR}
+        -checks='${CLANG_TIDY_CHECKS}'
+        ${CLANG_TIDY_ERRORS_ARG}
+        ${CLANG_TIDY_EXTRA_ARGS}
+        ${CLANG_TIDY_ANALYZE_TEMPORARY_DTORS}
+        -header-filter='${CLANG_TIDY_HEADER_FILTER}'
+    )
+    add_custom_target(tidy ${CLANG_TIDY_ALL})
+    mark_as_analyzer(tidy)
+    add_custom_target(tidy-base)
+    add_custom_target(tidy-make-fixit-dir COMMAND ${CMAKE_COMMAND} -E make_directory ${CLANG_TIDY_FIXIT_DIR})
+    add_custom_target(tidy-rm-fixit-dir COMMAND ${CMAKE_COMMAND} -E remove_directory ${CLANG_TIDY_FIXIT_DIR})
+    add_dependencies(tidy-make-fixit-dir tidy-rm-fixit-dir)
+    add_dependencies(tidy-base tidy-make-fixit-dir)
+endmacro()
+function(clang_tidy_check TARGET)
+    get_target_property(SOURCES ${TARGET} SOURCES)
+    # TODO: Use generator expressions instead
+    # COMMAND ${CLANG_TIDY_COMMAND} $<TARGET_PROPERTY:${TARGET},SOURCES>
+    # COMMAND ${CLANG_TIDY_COMMAND} $<JOIN:$<TARGET_PROPERTY:${TARGET},SOURCES>, >
+    foreach(SOURCE ${SOURCES})
+        if((NOT "${SOURCE}" MATCHES "(h|hpp|hxx)$") AND (NOT "${SOURCE}" MATCHES "TARGET_OBJECTS"))
+            string(MAKE_C_IDENTIFIER "${SOURCE}" tidy_file)
+            set(tidy_target tidy-target-${TARGET}-${tidy_file})
+            add_custom_target(${tidy_target}
+                # for some targets clang-tidy not able to get information from .clang-tidy
+                DEPENDS ${SOURCE}
+                COMMAND ${CLANG_TIDY_COMMAND} "-config=\{CheckOptions: \[\{key: bugprone-reserved-identifier.AllowedIdentifiers,value: __HIP_PLATFORM_HCC__\; __HIP_ROCclr__\}\]\}" ${SOURCE} "-export-fixes=${CLANG_TIDY_FIXIT_DIR}/${TARGET}-${tidy_file}.yaml"
+                WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+                COMMENT "clang-tidy: Running clang-tidy on target ${SOURCE}..."
+            )
+            add_dependencies(${tidy_target} ${TARGET})
+            add_dependencies(${tidy_target} tidy-base)
+            add_dependencies(tidy ${tidy_target})
+        endif()
+    endforeach()
+endfunction()
--- a/cmake/CppCheck.cmake
+++ b/cmake/CppCheck.cmake
+################################################################################
+# 
+# MIT License
+# 
+# Copyright (c) 2017 Advanced Micro Devices, Inc.
+# 
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# 
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+# 
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+# 
+################################################################################
+include(CMakeParseArguments)
+include(ProcessorCount)
+include(Analyzers)
+find_program(CPPCHECK_EXE 
+    NAMES 
+        cppcheck
+    PATHS
+        /opt/rocm/bin
+)
+ProcessorCount(CPPCHECK_JOBS)
+set(CPPCHECK_BUILD_DIR ${CMAKE_BINARY_DIR}/cppcheck-build)
+file(MAKE_DIRECTORY ${CPPCHECK_BUILD_DIR})
+set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${CPPCHECK_BUILD_DIR})
+macro(enable_cppcheck)
+    set(options FORCE)
+    set(oneValueArgs)
+    set(multiValueArgs CHECKS SUPPRESS DEFINE UNDEFINE INCLUDE SOURCES)
+    cmake_parse_arguments(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    string(REPLACE ";" "," CPPCHECK_CHECKS "${PARSE_CHECKS}")
+    string(REPLACE ";" "\n" CPPCHECK_SUPPRESS "${PARSE_SUPPRESS};*:/usr/*")
+    file(WRITE ${CMAKE_BINARY_DIR}/cppcheck-supressions "${CPPCHECK_SUPPRESS}")
+    set(CPPCHECK_DEFINES)
+    foreach(DEF ${PARSE_DEFINE})
+        set(CPPCHECK_DEFINES "${CPPCHECK_DEFINES} -D${DEF}")
+    endforeach()
+    set(CPPCHECK_UNDEFINES)
+    foreach(DEF ${PARSE_UNDEFINE})
+        set(CPPCHECK_UNDEFINES "${CPPCHECK_UNDEFINES} -U${DEF}")
+    endforeach()
+    set(CPPCHECK_INCLUDES)
+    foreach(INC ${PARSE_INCLUDE})
+        set(CPPCHECK_INCLUDES "${CPPCHECK_INCLUDES} -I${INC}")
+    endforeach()
+    # set(CPPCHECK_FORCE)
+    set(CPPCHECK_FORCE "--project=${CMAKE_BINARY_DIR}/compile_commands.json")
+    if(PARSE_FORCE)
+        set(CPPCHECK_FORCE --force)
+    endif()
+    set(SOURCES)
+    set(GLOBS)
+    foreach(SOURCE ${PARSE_SOURCES})
+        get_filename_component(ABS_SOURCE ${SOURCE} ABSOLUTE)
+        if(EXISTS ${ABS_SOURCE})
+            if(IS_DIRECTORY ${ABS_SOURCE})
+                set(GLOBS "${GLOBS} ${ABS_SOURCE}/*.cpp ${ABS_SOURCE}/*.hpp ${ABS_SOURCE}/*.cxx ${ABS_SOURCE}/*.c ${ABS_SOURCE}/*.h")
+            else()
+                set(SOURCES "${SOURCES} ${ABS_SOURCE}")
+            endif()
+        else()
+            set(GLOBS "${GLOBS} ${ABS_SOURCE}")
+        endif()
+    endforeach()
+    file(WRITE ${CMAKE_BINARY_DIR}/cppcheck.cmake "
+        file(GLOB_RECURSE GSRCS ${GLOBS})
+        set(CPPCHECK_COMMAND
+            ${CPPCHECK_EXE}
+            -q
+            # -v
+            # --report-progress
+            ${CPPCHECK_FORCE}
+            --cppcheck-build-dir=${CPPCHECK_BUILD_DIR}
+            --platform=native
+            --template=gcc
+            --error-exitcode=1
+            -j ${CPPCHECK_JOBS}
+            ${CPPCHECK_DEFINES}
+            ${CPPCHECK_UNDEFINES}
+            ${CPPCHECK_INCLUDES}
+            --enable=${CPPCHECK_CHECKS}
+            --inline-suppr
+            --suppressions-list=${CMAKE_BINARY_DIR}/cppcheck-supressions
+            ${SOURCES} \${GSRCS}
+        )
+        string(REPLACE \";\" \" \" CPPCHECK_SHOW_COMMAND \"\${CPPCHECK_COMMAND}\")
+        message(\"\${CPPCHECK_SHOW_COMMAND}\")
+        execute_process(
+            COMMAND \${CPPCHECK_COMMAND}
+            WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+            RESULT_VARIABLE RESULT
+        )
+        if(NOT RESULT EQUAL 0)
+            message(FATAL_ERROR \"Cppcheck failed\")
+        endif()
+")
+    add_custom_target(cppcheck
+        COMMAND ${CMAKE_COMMAND} -P ${CMAKE_BINARY_DIR}/cppcheck.cmake
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+        COMMENT "cppcheck: Running cppcheck..."
+    )
+    mark_as_analyzer(cppcheck)
+endmacro()
--- a/cmake/DoxygenDoc.cmake
+++ b/cmake/DoxygenDoc.cmake
+################################################################################
+# 
+# MIT License
+# 
+# Copyright (c) 2017 Advanced Micro Devices, Inc.
+# 
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# 
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+# 
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+# 
+################################################################################
+include(CMakeParseArguments)
+include(MainDoc)
+find_program(DOXYGEN_EXECUTABLE NAMES doxygen
+    PATH_SUFFIXES bin
+    DOC "Doxygen documentation generator"
+)
+mark_as_advanced(DOXYGEN_EXECUTABLE)
+find_path(DOT_EXECUTABLE NAMES dot
+    PATH_SUFFIXES bin
+    DOC "Graphviz"
+)
+mark_as_advanced(DOT_EXECUTABLE)
+set(DOXYGEN_ARGS
+ABBREVIATE_BRIEF
+ALIASES
+ALLEXTERNALS
+ALLOW_UNICODE_NAMES
+ALPHABETICAL_INDEX
+ALWAYS_DETAILED_SEC
+AUTOLINK_SUPPORT
+BINARY_TOC
+BRIEF_MEMBER_DESC
+BUILTIN_STL_SUPPORT
+CALLER_GRAPH
+CALL_GRAPH
+CASE_SENSE_NAMES
+CHM_FILE
+CHM_INDEX_ENCODING
+CITE_BIB_FILES
+CLANG_ASSISTED_PARSING
+CLANG_OPTIONS
+CLASS_DIAGRAMS
+CLASS_GRAPH
+COLLABORATION_GRAPH
+COLS_IN_ALPHA_INDEX
+COMPACT_LATEX
+COMPACT_RTF
+CPP_CLI_SUPPORT
+CREATE_SUBDIRS
+DIAFILE_DIRS
+DIA_PATH
+DIRECTORY_GRAPH
+DISABLE_INDEX
+DISTRIBUTE_GROUP_DOC
+DOCBOOK_OUTPUT
+DOCBOOK_PROGRAMLISTING
+DOCSET_BUNDLE_ID
+DOCSET_FEEDNAME
+DOCSET_PUBLISHER_ID
+DOCSET_PUBLISHER_NAME
+DOTFILE_DIRS
+DOT_CLEANUP
+DOT_FONTNAME
+DOT_FONTPATH
+DOT_FONTSIZE
+DOT_GRAPH_MAX_NODES
+DOT_IMAGE_FORMAT
+DOT_MULTI_TARGETS
+DOT_NUM_THREADS
+# DOT_PATH
+DOT_TRANSPARENT
+DOXYFILE_ENCODING
+ECLIPSE_DOC_ID
+ENABLED_SECTIONS
+ENABLE_PREPROCESSING
+ENUM_VALUES_PER_LINE
+EXAMPLE_PATH
+EXAMPLE_PATTERNS
+EXAMPLE_RECURSIVE
+EXCLUDE
+EXCLUDE_PATTERNS
+EXCLUDE_SYMBOLS
+EXCLUDE_SYMLINKS
+EXPAND_AS_DEFINED
+EXPAND_ONLY_PREDEF
+EXTENSION_MAPPING
+EXTERNAL_GROUPS
+EXTERNAL_PAGES
+EXTERNAL_SEARCH
+EXTERNAL_SEARCH_ID
+EXTRACT_ALL
+EXTRACT_ANON_NSPACES
+EXTRACT_LOCAL_CLASSES
+EXTRACT_LOCAL_METHODS
+EXTRACT_PACKAGE
+EXTRACT_PRIVATE
+EXTRACT_STATIC
+EXTRA_PACKAGES
+EXTRA_SEARCH_MAPPINGS
+EXT_LINKS_IN_WINDOW
+FILE_PATTERNS
+FILE_VERSION_FILTER
+FILTER_PATTERNS
+FILTER_SOURCE_FILES
+FILTER_SOURCE_PATTERNS
+FORCE_LOCAL_INCLUDES
+FORMULA_FONTSIZE
+FORMULA_TRANSPARENT
+FULL_PATH_NAMES
+GENERATE_AUTOGEN_DEF
+GENERATE_BUGLIST
+GENERATE_CHI
+GENERATE_DEPRECATEDLIST
+GENERATE_DOCBOOK
+GENERATE_DOCSET
+GENERATE_ECLIPSEHELP
+GENERATE_HTML
+GENERATE_HTMLHELP
+GENERATE_LATEX
+GENERATE_LEGEND
+GENERATE_MAN
+GENERATE_PERLMOD
+GENERATE_QHP
+GENERATE_RTF
+GENERATE_TAGFILE
+GENERATE_TESTLIST
+GENERATE_TODOLIST
+GENERATE_TREEVIEW
+GENERATE_XML
+GRAPHICAL_HIERARCHY
+GROUP_GRAPHS
+GROUP_NESTED_COMPOUNDS
+# HAVE_DOT
+HHC_LOCATION
+HIDE_COMPOUND_REFERENCE
+HIDE_FRIEND_COMPOUNDS
+HIDE_IN_BODY_DOCS
+HIDE_SCOPE_NAMES
+HIDE_UNDOC_CLASSES
+HIDE_UNDOC_MEMBERS
+HIDE_UNDOC_RELATIONS
+HTML_COLORSTYLE_GAMMA
+HTML_COLORSTYLE_HUE
+HTML_COLORSTYLE_SAT
+HTML_DYNAMIC_SECTIONS
+HTML_EXTRA_FILES
+HTML_EXTRA_STYLESHEET
+HTML_FILE_EXTENSION
+HTML_FOOTER
+HTML_HEADER
+HTML_INDEX_NUM_ENTRIES
+HTML_OUTPUT
+HTML_STYLESHEET
+HTML_TIMESTAMP
+IDL_PROPERTY_SUPPORT
+IGNORE_PREFIX
+IMAGE_PATH
+INCLUDED_BY_GRAPH
+INCLUDE_FILE_PATTERNS
+INCLUDE_GRAPH
+INCLUDE_PATH
+INHERIT_DOCS
+INLINE_GROUPED_CLASSES
+INLINE_INFO
+INLINE_INHERITED_MEMB
+INLINE_SIMPLE_STRUCTS
+INLINE_SOURCES
+INPUT
+INPUT_ENCODING
+INPUT_FILTER
+INTERACTIVE_SVG
+INTERNAL_DOCS
+JAVADOC_AUTOBRIEF
+LATEX_BATCHMODE
+LATEX_BIB_STYLE
+LATEX_CMD_NAME
+LATEX_EXTRA_FILES
+LATEX_EXTRA_STYLESHEET
+LATEX_FOOTER
+LATEX_HEADER
+LATEX_HIDE_INDICES
+LATEX_OUTPUT
+LATEX_SOURCE_CODE
+LATEX_TIMESTAMP
+LAYOUT_FILE
+LOOKUP_CACHE_SIZE
+MACRO_EXPANSION
+MAKEINDEX_CMD_NAME
+MAN_EXTENSION
+MAN_LINKS
+MAN_OUTPUT
+MAN_SUBDIR
+MARKDOWN_SUPPORT
+MATHJAX_CODEFILE
+MATHJAX_EXTENSIONS
+MATHJAX_FORMAT
+MATHJAX_RELPATH
+MAX_DOT_GRAPH_DEPTH
+MAX_INITIALIZER_LINES
+MSCFILE_DIRS
+MSCGEN_PATH
+MULTILINE_CPP_IS_BRIEF
+OPTIMIZE_FOR_FORTRAN
+OPTIMIZE_OUTPUT_FOR_C
+OPTIMIZE_OUTPUT_JAVA
+OPTIMIZE_OUTPUT_VHDL
+OUTPUT_DIRECTORY
+OUTPUT_LANGUAGE
+PAPER_TYPE
+PDF_HYPERLINKS
+PERLMOD_LATEX
+PERLMOD_MAKEVAR_PREFIX
+PERLMOD_PRETTY
+PERL_PATH
+PLANTUML_CFG_FILE
+PLANTUML_INCLUDE_PATH
+PLANTUML_JAR_PATH
+PREDEFINED
+PROJECT_BRIEF
+PROJECT_LOGO
+PROJECT_NAME
+PROJECT_NUMBER
+QCH_FILE
+QHG_LOCATION
+QHP_CUST_FILTER_ATTRS
+QHP_CUST_FILTER_NAME
+QHP_NAMESPACE
+QHP_SECT_FILTER_ATTRS
+QHP_VIRTUAL_FOLDER
+QT_AUTOBRIEF
+QUIET
+RECURSIVE
+REFERENCED_BY_RELATION
+REFERENCES_LINK_SOURCE
+REFERENCES_RELATION
+REPEAT_BRIEF
+RTF_EXTENSIONS_FILE
+RTF_HYPERLINKS
+RTF_OUTPUT
+RTF_SOURCE_CODE
+RTF_STYLESHEET_FILE
+SEARCHDATA_FILE
+SEARCHENGINE
+SEARCHENGINE_URL
+SEARCH_INCLUDES
+SEPARATE_MEMBER_PAGES
+SERVER_BASED_SEARCH
+SHORT_NAMES
+SHOW_FILES
+SHOW_GROUPED_MEMB_INC
+SHOW_INCLUDE_FILES
+SHOW_NAMESPACES
+SHOW_USED_FILES
+SIP_SUPPORT
+SKIP_FUNCTION_MACROS
+SORT_BRIEF_DOCS
+SORT_BY_SCOPE_NAME
+SORT_GROUP_NAMES
+SORT_MEMBERS_CTORS_1ST
+SORT_MEMBER_DOCS
+SOURCE_BROWSER
+SOURCE_TOOLTIPS
+STRICT_PROTO_MATCHING
+STRIP_CODE_COMMENTS
+STRIP_FROM_INC_PATH
+STRIP_FROM_PATH
+SUBGROUPING
+TAB_SIZE
+TAGFILES
+TCL_SUBST
+TEMPLATE_RELATIONS
+TOC_EXPAND
+TOC_INCLUDE_HEADINGS
+TREEVIEW_WIDTH
+TYPEDEF_HIDES_STRUCT
+UML_LIMIT_NUM_FIELDS
+UML_LOOK
+USE_HTAGS
+USE_MATHJAX
+USE_MDFILE_AS_MAINPAGE
+USE_PDFLATEX
+VERBATIM_HEADERS
+WARNINGS
+WARN_AS_ERROR
+WARN_FORMAT
+WARN_IF_DOC_ERROR
+WARN_IF_UNDOCUMENTED
+WARN_LOGFILE
+WARN_NO_PARAMDOC
+XML_OUTPUT
+XML_PROGRAMLISTING
+)
+set(DOXYGEN_CONFIG_FILE "${CMAKE_CURRENT_BINARY_DIR}/doxygen/doxygen.conf" CACHE PATH "Path to generated doxygen configuration file")
+function(add_doxygen_doc)
+    set(options)
+    set(oneValueArgs)
+    set(multiValueArgs DEPENDS ${DOXYGEN_ARGS})
+    cmake_parse_arguments(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    file(WRITE ${DOXYGEN_CONFIG_FILE} "# Auto-generated doxygen configuration file\n")
+    foreach(ARG ${DOXYGEN_ARGS})
+        if(PARSE_${ARG})
+            string(REPLACE ";" " " ARG_VALUE ${PARSE_${ARG}})
+            file(APPEND ${DOXYGEN_CONFIG_FILE} "\n${ARG} = ${ARG_VALUE}\n")
+        endif()
+    endforeach()
+    if(PARSE_OUTPUT_DIRECTORY)
+        if(NOT EXISTS ${PARSE_OUTPUT_DIRECTORY})
+            file(MAKE_DIRECTORY ${PARSE_OUTPUT_DIRECTORY})
+        endif()
+    endif()
+    if(DOT_EXECUTABLE)
+        file(APPEND ${DOXYGEN_CONFIG_FILE} "\nDOT_PATH = \"${DOT_EXECUTABLE}\"\n")
+        file(APPEND ${DOXYGEN_CONFIG_FILE} "\nHAVE_DOT = YES\n")
+    else()
+        file(APPEND ${DOXYGEN_CONFIG_FILE} "\nHAVE_DOT = NO\n")
+    endif()
+    add_custom_target(doxygen
+        ${DOXYGEN_EXECUTABLE} ${DOXYGEN_CONFIG_FILE}
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+        COMMENT "Building documentation with doxygen"
+    )
+    if(PARSE_OUTPUT_DIRECTORY)
+        clean_doc_output(${PARSE_OUTPUT_DIRECTORY})
+    endif()
+    mark_as_doc(doxygen)
+    if(PARSE_DEPENDS)
+        add_dependencies(doxygen ${PARSE_DEPENDS})
+    endif()
+endfunction()
--- a/cmake/EnableCompilerWarnings.cmake
+++ b/cmake/EnableCompilerWarnings.cmake
+################################################################################
+#
+# MIT License
+#
+# Copyright (c) 2017 Advanced Micro Devices, Inc.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+################################################################################
+# - Enable warning all for gcc/clang or use /W4 for visual studio
+## Strict warning level
+if (MSVC)
+    # Use the highest warning level for visual studio.
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /w")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /w")
+    # set(CMAKE_CXX_WARNING_LEVEL 4)
+    # if (CMAKE_CXX_FLAGS MATCHES "/W[0-4]")
+    #     string(REGEX REPLACE "/W[0-4]" "/W4" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+    # else ()
+    #     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4")
+    # endif ()
+    # set(CMAKE_C_WARNING_LEVEL 4)
+    # if (CMAKE_C_FLAGS MATCHES "/W[0-4]")
+    #     string(REGEX REPLACE "/W[0-4]" "/W4" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
+    # else ()
+    #     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /W4")
+    # endif ()
+else()
+    foreach(COMPILER C CXX)
+        set(CMAKE_COMPILER_WARNINGS)
+        # use -Wall for gcc and clang
+        list(APPEND CMAKE_COMPILER_WARNINGS
+            -Wall
+            -Wextra
+            -Wcomment
+            -Wendif-labels
+            -Wformat
+            -Winit-self
+            -Wreturn-type
+            -Wsequence-point
+            # Shadow is broken on gcc when using lambdas
+            # -Wshadow
+            -Wswitch
+            -Wtrigraphs
+            -Wundef
+            -Wuninitialized
+            -Wunreachable-code
+            -Wunused
+            -Wno-sign-compare
+            -Wno-extra-semi-stmt
+        )
+        if (CMAKE_${COMPILER}_COMPILER_ID MATCHES "Clang")
+            list(APPEND CMAKE_COMPILER_WARNINGS
+                -Weverything
+                -Wno-c++98-compat
+                -Wno-c++98-compat-pedantic
+                -Wno-conversion
+                -Wno-double-promotion
+                -Wno-exit-time-destructors
+                -Wno-extra-semi
+                -Wno-float-conversion
+                -Wno-gnu-anonymous-struct
+                -Wno-gnu-zero-variadic-macro-arguments
+                -Wno-missing-prototypes
+                -Wno-nested-anon-types
+                -Wno-padded
+                -Wno-return-std-move-in-c++11
+                -Wno-shorten-64-to-32
+                -Wno-sign-conversion
+                -Wno-unknown-warning-option
+                -Wno-unused-command-line-argument
+                -Wno-weak-vtables
+                -Wno-covered-switch-default
+            )
+        else()
+            if (CMAKE_${COMPILER}_COMPILER_ID MATCHES "GNU" AND ${COMPILER} MATCHES "CXX")
+                # cmake 3.5.2 does not support >=.
+                if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS "6.1")
+                    list(APPEND CMAKE_COMPILER_WARNINGS
+                        -Wno-ignored-attributes)
+                endif()
+            endif()
+            list(APPEND CMAKE_COMPILER_WARNINGS
+                -Wno-missing-field-initializers
+                -Wno-deprecated-declarations
+            )
+        endif()
+        add_definitions(${CMAKE_COMPILER_WARNINGS})
+    endforeach()
+endif ()
--- a/composable_kernel/include/gridwise_operation_wrapper.hpp
+++ b/composable_kernel/include/gridwise_operation_wrapper.hpp
+#ifndef CK_GRIDWISE_OPERATION_KERNEL_WRAPPER
+#define CK_GRIDWISE_OPERATION_KERNEL_WRAPPER
+template <typename GridwiseOp, typename... Xs>
+__global__ void
+#if CK_USE_LAUNCH_BOUNDS
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+#endif
+        run_gridwise_operation(Xs... xs)
+{
+    GridwiseOp{}.Run(xs...);
+}
+#endif
--- a/composable_kernel/include/problem_transform/transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk.hpp
+++ b/composable_kernel/include/problem_transform/transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk.hpp
+#ifndef CK_TRANSFORM_BACKWARD_DATA_CONVOLUTION_INTO_GEMM_V4R1_NHWC_KYXC_NHWK_HPP
+#define CK_TRANSFORM_BACKWARD_DATA_CONVOLUTION_INTO_GEMM_V4R1_NHWC_KYXC_NHWK_HPP
+#include "common_header.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"
+namespace ck {
+// Number of GEMMs = YTilda * XTilda
+// GemmM = C
+// GemmN = N * HTildaSlice * WTildaSlice
+// GemmK = K * YDotSlice * XDotSlice
+template <typename... Wei,
+          typename... In,
+          typename... Out,
+          typename ConvStrides,
+          typename ConvDilations,
+          typename InLeftPads,
+          typename InRightPads,
+          index_t IYTildaValue,
+          index_t IXTildaValue,
+          index_t GemmK1Value>
+__host__ __device__ constexpr auto
+transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk(
+    const TensorDescriptor<Wei...>& wei_k_y_x_c_grid_desc,
+    const TensorDescriptor<Out...>& out_n_ho_wo_k_grid_desc,
+    const TensorDescriptor<In...>& in_n_hi_wi_c_grid_desc,
+    const ConvStrides& conv_strides,
+    const ConvDilations& conv_dilations,
+    const InLeftPads& in_left_pads,
+    const InRightPads& in_right_pads,
+    Number<IYTildaValue>,
+    Number<IXTildaValue>,
+    Number<GemmK1Value>)
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+    constexpr auto GemmK1  = Number<GemmK1Value>{};
+    constexpr auto IYTilda = Number<IYTildaValue>{};
+    constexpr auto IXTilda = Number<IXTildaValue>{};
+    const auto N = in_n_hi_wi_c_grid_desc.GetLength(I0);
+    const auto C = in_n_hi_wi_c_grid_desc.GetLength(I3);
+    const auto K = out_n_ho_wo_k_grid_desc.GetLength(I3);
+    const auto Hi = in_n_hi_wi_c_grid_desc.GetLength(I1);
+    const auto Wi = in_n_hi_wi_c_grid_desc.GetLength(I2);
+    const auto Ho = out_n_ho_wo_k_grid_desc.GetLength(I1);
+    const auto Wo = out_n_ho_wo_k_grid_desc.GetLength(I2);
+    const auto Y = wei_k_y_x_c_grid_desc.GetLength(I1);
+    const auto X = wei_k_y_x_c_grid_desc.GetLength(I2);
+    const auto ConvStrideH = conv_strides[I0];
+    const auto ConvStrideW = conv_strides[I1];
+    const auto ConvDilationH = conv_dilations[I0];
+    const auto ConvDilationW = conv_dilations[I1];
+    const auto InLeftPadH = in_left_pads[I0];
+    const auto InLeftPadW = in_left_pads[I1];
+    const auto InRightPadH = in_right_pads[I0];
+    const auto InRightPadW = in_right_pads[I1];
+    const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH);
+    const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW);
+    const auto YTilda = ConvStrideH / GcdStrideDilationH;
+    const auto XTilda = ConvStrideW / GcdStrideDilationW;
+    const auto YDot = math::integer_divide_ceil(Y, YTilda);
+    const auto XDot = math::integer_divide_ceil(X, XTilda);
+    const auto HTilda = Ho + math::integer_divide_ceil(ConvDilationH * (Y - I1), ConvStrideH);
+    const auto WTilda = Wo + math::integer_divide_ceil(ConvDilationW * (X - I1), ConvStrideW);
+    // only work on HTilda and WTilda that contribute to non-padding area of input tensor
+    const auto IHTildaSliceBegin = math::integer_divide_floor(
+        math::max(I0, InLeftPadH - ConvDilationH * (YTilda - I1)), ConvStrideH);
+    const auto IWTildaSliceBegin = math::integer_divide_floor(
+        math::max(I0, InLeftPadW - ConvDilationW * (XTilda - I1)), ConvStrideW);
+    const auto IHTildaSliceEnd =
+        math::min(HTilda, math::integer_divide_ceil(InLeftPadH + Hi - I1, ConvStrideH) + I1);
+    const auto IWTildaSliceEnd =
+        math::min(WTilda, math::integer_divide_ceil(InLeftPadW + Wi - I1, ConvStrideW) + I1);
+    const auto HTildaSlice = IHTildaSliceEnd - IHTildaSliceBegin;
+    const auto WTildaSlice = IWTildaSliceEnd - IWTildaSliceBegin;
+    // GemmK is different for each GEMM
+    const auto YDotSlice = math::integer_divide_ceil(Y - IYTilda, YTilda);
+    const auto XDotSlice = math::integer_divide_ceil(X - IXTilda, XTilda);
+    const auto K1 = GemmK1;
+    const auto K0 = K / K1;
+    // weight tensor
+    const auto wei_k_ydot_ytilda_xdot_xtilda_c_grid_desc = transform_tensor_descriptor(
+        wei_k_y_x_c_grid_desc,
+        make_tuple(make_pass_through_transform(K),
+                   make_embed_transform(make_tuple(YDot, YTilda),
+                                        make_tuple(ConvStrideH / GcdStrideDilationH, I1)),
+                   make_embed_transform(make_tuple(XDot, XTilda),
+                                        make_tuple(ConvStrideW / GcdStrideDilationW, I1)),
+                   make_pass_through_transform(C)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
+    const auto wei_k0_k1_ydotslice_xdotslice_c_grid_desc =
+        transform_tensor_descriptor(wei_k_ydot_ytilda_xdot_xtilda_c_grid_desc,
+                                    make_tuple(make_unmerge_transform(make_tuple(K0, K1)),
+                                               make_slice_transform(YDot, I0, YDotSlice),
+                                               make_slice_transform(XDot, I0, XDotSlice),
+                                               make_freeze_transform(IYTilda),
+                                               make_freeze_transform(IXTilda),
+                                               make_pass_through_transform(C)),
+                                    make_tuple(Sequence<0>{},
+                                               Sequence<1>{},
+                                               Sequence<3>{},
+                                               Sequence<2>{},
+                                               Sequence<4>{},
+                                               Sequence<5>{}),
+                                    make_tuple(Sequence<0, 1>{},
+                                               Sequence<2>{},
+                                               Sequence<3>{},
+                                               Sequence<>{},
+                                               Sequence<>{},
+                                               Sequence<4>{}));
+#if 1
+    const auto wei_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor(
+        wei_k0_k1_ydotslice_xdotslice_c_grid_desc,
+        make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K0)),
+                   make_pass_through_transform(C),
+                   make_pass_through_transform(K1)),
+        make_tuple(Sequence<2, 3, 0>{}, Sequence<4>{}, Sequence<1>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+#else
+    const auto wei_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor(
+        wei_k0_k1_ydotslice_xdotslice_c_grid_desc,
+        make_tuple(make_merge_transform(make_tuple(K0, YDotSlice, XDotSlice)),
+                   make_pass_through_transform(C),
+                   make_pass_through_transform(K1)),
+        make_tuple(Sequence<0, 2, 3>{}, Sequence<4>{}, Sequence<1>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+#endif
+    // output tensor
+    // this add padding check
+    const auto out_n_hop_wop_k_grid_desc = transform_tensor_descriptor(
+        out_n_ho_wo_k_grid_desc,
+        make_tuple(make_pass_through_transform(N),
+                   make_pad_transform(Ho, I0, I0),
+                   make_pad_transform(Wo, I0, I0),
+                   make_pass_through_transform(K)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+    const auto out_n_ydot_htilda_xdot_wtilda_k_grid_desc = transform_tensor_descriptor(
+        out_n_hop_wop_k_grid_desc,
+        make_tuple(make_pass_through_transform(N),
+                   make_embed_transform(make_tuple(YDot, HTilda),
+                                        make_tuple(-ConvDilationH / GcdStrideDilationH, I1)),
+                   make_embed_transform(make_tuple(XDot, WTilda),
+                                        make_tuple(-ConvDilationW / GcdStrideDilationW, I1)),
+                   make_pass_through_transform(K)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
+    const auto out_n_ydotslice_htildaslice_xdotslice_wtildaslice_k0_k1_grid_desc =
+        transform_tensor_descriptor(
+            out_n_ydot_htilda_xdot_wtilda_k_grid_desc,
+            make_tuple(make_pass_through_transform(N),
+                       make_slice_transform(YDot, I0, YDotSlice),
+                       make_slice_transform(HTilda, IHTildaSliceBegin, HTildaSlice),
+                       make_slice_transform(XDot, I0, XDotSlice),
+                       make_slice_transform(WTilda, IWTildaSliceBegin, WTildaSlice),
+                       make_unmerge_transform(make_tuple(K0, K1))),
+            make_tuple(Sequence<0>{},
+                       Sequence<1>{},
+                       Sequence<2>{},
+                       Sequence<3>{},
+                       Sequence<4>{},
+                       Sequence<5>{}),
+            make_tuple(Sequence<0>{},
+                       Sequence<1>{},
+                       Sequence<2>{},
+                       Sequence<3>{},
+                       Sequence<4>{},
+                       Sequence<5, 6>{}));
+#if 1
+    const auto out_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
+        out_n_ydotslice_htildaslice_xdotslice_wtildaslice_k0_k1_grid_desc,
+        make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K0)),
+                   make_merge_transform(make_tuple(N, HTildaSlice, WTildaSlice)),
+                   make_pass_through_transform(K1)),
+        make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}, Sequence<6>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+#else
+    const auto out_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
+        out_n_ydotslice_htildaslice_xdotslice_wtildaslice_k0_k1_grid_desc,
+        make_tuple(make_merge_transform(make_tuple(K0, YDotSlice, XDotSlice)),
+                   make_merge_transform(make_tuple(N, HTildaSlice, WTildaSlice)),
+                   make_pass_through_transform(K1)),
+        make_tuple(Sequence<5, 1, 3>{}, Sequence<0, 2, 4>{}, Sequence<6>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+#endif
+    // input tensor
+    const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor(
+        in_n_hi_wi_c_grid_desc,
+        make_tuple(make_pass_through_transform(N),
+                   make_pad_transform(Hi, InLeftPadH, InRightPadH),
+                   make_pad_transform(Wi, InLeftPadW, InRightPadW),
+                   make_pass_through_transform(C)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+    const auto in_n_ytilda_htilda_xtilda_wtilda_c_grid_desc = transform_tensor_descriptor(
+        in_n_hip_wip_c_grid_desc,
+        make_tuple(make_pass_through_transform(N),
+                   make_embed_transform(make_tuple(YTilda, HTilda),
+                                        make_tuple(ConvDilationH, ConvStrideH)),
+                   make_embed_transform(make_tuple(XTilda, WTilda),
+                                        make_tuple(ConvDilationW, ConvStrideW)),
+                   make_pass_through_transform(C)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
+    const auto in_n_htildaslice_wtildaslice_c_grid_desc = transform_tensor_descriptor(
+        in_n_ytilda_htilda_xtilda_wtilda_c_grid_desc,
+        make_tuple(make_pass_through_transform(N),
+                   make_freeze_transform(IYTilda),
+                   make_slice_transform(HTilda, IHTildaSliceBegin, HTildaSlice),
+                   make_freeze_transform(IXTilda),
+                   make_slice_transform(WTilda, IWTildaSliceBegin, WTildaSlice),
+                   make_pass_through_transform(C)),
+        make_tuple(Sequence<0>{},
+                   Sequence<1>{},
+                   Sequence<2>{},
+                   Sequence<3>{},
+                   Sequence<4>{},
+                   Sequence<5>{}),
+        make_tuple(Sequence<0>{},
+                   Sequence<>{},
+                   Sequence<1>{},
+                   Sequence<>{},
+                   Sequence<2>{},
+                   Sequence<3>{}));
+    const auto in_gemmm_gemmn_grid_desc = transform_tensor_descriptor(
+        in_n_htildaslice_wtildaslice_c_grid_desc,
+        make_tuple(make_pass_through_transform(C),
+                   make_merge_transform(make_tuple(N, HTildaSlice, WTildaSlice))),
+        make_tuple(Sequence<3>{}, Sequence<0, 1, 2>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}));
+    return make_tuple(wei_gemmk0_gemmm_gemmk1_grid_desc,
+                      out_gemmk0_gemmn_gemmk1_grid_desc,
+                      in_gemmm_gemmn_grid_desc);
+}
+} // namespace ck
+#endif
--- a/composable_kernel/include/problem_transform/transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk.hpp
+++ b/composable_kernel/include/problem_transform/transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk.hpp
+#ifndef CK_TRANSFORM_BACKWARD_DATA_CONVOLUTION_INTO_GEMM_V4R1R2_NHWC_KYXC_NHWK_HPP
+#define CK_TRANSFORM_BACKWARD_DATA_CONVOLUTION_INTO_GEMM_V4R1R2_NHWC_KYXC_NHWK_HPP
+#include "common_header.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"
+namespace ck {
+// A: out
+// B: wei
+// C: in
+// Number of GEMMs = YTilda * XTilda
+// GemmM = N * HTildaSlice * WTildaSlice
+// GemmN = C
+// GemmK = K * YDotSlice * XDotSlice
+template <typename... Wei,
+          typename... In,
+          typename... Out,
+          typename ConvStrides,
+          typename ConvDilations,
+          typename InLeftPads,
+          typename InRightPads,
+          index_t IYTildaValue,
+          index_t IXTildaValue,
+          index_t GemmK1Value>
+__host__ __device__ constexpr auto
+transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk(
+    const TensorDescriptor<Out...>& out_n_ho_wo_k_grid_desc,
+    const TensorDescriptor<Wei...>& wei_k_y_x_c_grid_desc,
+    const TensorDescriptor<In...>& in_n_hi_wi_c_grid_desc,
+    const ConvStrides& conv_strides,
+    const ConvDilations& conv_dilations,
+    const InLeftPads& in_left_pads,
+    const InRightPads& in_right_pads,
+    Number<IYTildaValue>,
+    Number<IXTildaValue>,
+    Number<GemmK1Value>)
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+    constexpr auto GemmK1  = Number<GemmK1Value>{};
+    constexpr auto IYTilda = Number<IYTildaValue>{};
+    constexpr auto IXTilda = Number<IXTildaValue>{};
+    const auto N = in_n_hi_wi_c_grid_desc.GetLength(I0);
+    const auto C = in_n_hi_wi_c_grid_desc.GetLength(I3);
+    const auto K = out_n_ho_wo_k_grid_desc.GetLength(I3);
+    const auto Hi = in_n_hi_wi_c_grid_desc.GetLength(I1);
+    const auto Wi = in_n_hi_wi_c_grid_desc.GetLength(I2);
+    const auto Ho = out_n_ho_wo_k_grid_desc.GetLength(I1);
+    const auto Wo = out_n_ho_wo_k_grid_desc.GetLength(I2);
+    const auto Y = wei_k_y_x_c_grid_desc.GetLength(I1);
+    const auto X = wei_k_y_x_c_grid_desc.GetLength(I2);
+    const auto ConvStrideH = conv_strides[I0];
+    const auto ConvStrideW = conv_strides[I1];
+    const auto ConvDilationH = conv_dilations[I0];
+    const auto ConvDilationW = conv_dilations[I1];
+    const auto InLeftPadH = in_left_pads[I0];
+    const auto InLeftPadW = in_left_pads[I1];
+    const auto InRightPadH = in_right_pads[I0];
+    const auto InRightPadW = in_right_pads[I1];
+    const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH);
+    const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW);
+    const auto YTilda = ConvStrideH / GcdStrideDilationH;
+    const auto XTilda = ConvStrideW / GcdStrideDilationW;
+    const auto YDot = math::integer_divide_ceil(Y, YTilda);
+    const auto XDot = math::integer_divide_ceil(X, XTilda);
+    const auto HTilda = Ho + math::integer_divide_ceil(ConvDilationH * (Y - I1), ConvStrideH);
+    const auto WTilda = Wo + math::integer_divide_ceil(ConvDilationW * (X - I1), ConvStrideW);
+    // only work on HTilda and WTilda that contribute to non-padding area of input tensor
+    const auto IHTildaSliceBegin = math::integer_divide_floor(
+        math::max(I0, InLeftPadH - ConvDilationH * (YTilda - I1)), ConvStrideH);
+    const auto IWTildaSliceBegin = math::integer_divide_floor(
+        math::max(I0, InLeftPadW - ConvDilationW * (XTilda - I1)), ConvStrideW);
+    const auto IHTildaSliceEnd =
+        math::min(HTilda, math::integer_divide_ceil(InLeftPadH + Hi - I1, ConvStrideH) + I1);
+    const auto IWTildaSliceEnd =
+        math::min(WTilda, math::integer_divide_ceil(InLeftPadW + Wi - I1, ConvStrideW) + I1);
+    const auto HTildaSlice = IHTildaSliceEnd - IHTildaSliceBegin;
+    const auto WTildaSlice = IWTildaSliceEnd - IWTildaSliceBegin;
+    // GemmK is different for each GEMM
+    const auto YDotSlice = math::integer_divide_ceil(Y - IYTilda, YTilda);
+    const auto XDotSlice = math::integer_divide_ceil(X - IXTilda, XTilda);
+    const auto K1 = GemmK1;
+    const auto K0 = K / K1;
+    // A: output tensor
+    // this add padding check
+    const auto out_n_hop_wop_k_grid_desc = transform_tensor_descriptor(
+        out_n_ho_wo_k_grid_desc,
+        make_tuple(make_pass_through_transform(N),
+                   make_pad_transform(Ho, I0, I0),
+                   make_pad_transform(Wo, I0, I0),
+                   make_pass_through_transform(K)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+    const auto out_n_ydot_htilda_xdot_wtilda_k_grid_desc = transform_tensor_descriptor(
+        out_n_hop_wop_k_grid_desc,
+        make_tuple(make_pass_through_transform(N),
+                   make_embed_transform(make_tuple(YDot, HTilda),
+                                        make_tuple(-ConvDilationH / GcdStrideDilationH, I1)),
+                   make_embed_transform(make_tuple(XDot, WTilda),
+                                        make_tuple(-ConvDilationW / GcdStrideDilationW, I1)),
+                   make_pass_through_transform(K)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
+    const auto out_n_ydotslice_htildaslice_xdotslice_wtildaslice_k0_k1_grid_desc =
+        transform_tensor_descriptor(
+            out_n_ydot_htilda_xdot_wtilda_k_grid_desc,
+            make_tuple(make_pass_through_transform(N),
+                       make_slice_transform(YDot, I0, YDotSlice),
+                       make_slice_transform(HTilda, IHTildaSliceBegin, HTildaSlice),
+                       make_slice_transform(XDot, I0, XDotSlice),
+                       make_slice_transform(WTilda, IWTildaSliceBegin, WTildaSlice),
+                       make_unmerge_transform(make_tuple(K0, K1))),
+            make_tuple(Sequence<0>{},
+                       Sequence<1>{},
+                       Sequence<2>{},
+                       Sequence<3>{},
+                       Sequence<4>{},
+                       Sequence<5>{}),
+            make_tuple(Sequence<0>{},
+                       Sequence<1>{},
+                       Sequence<2>{},
+                       Sequence<3>{},
+                       Sequence<4>{},
+                       Sequence<5, 6>{}));
+#if 1
+    const auto out_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor(
+        out_n_ydotslice_htildaslice_xdotslice_wtildaslice_k0_k1_grid_desc,
+        make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K0)),
+                   make_merge_transform(make_tuple(N, HTildaSlice, WTildaSlice)),
+                   make_pass_through_transform(K1)),
+        make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}, Sequence<6>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+#else
+    const auto out_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor(
+        out_n_ydotslice_htildaslice_xdotslice_wtildaslice_k0_k1_grid_desc,
+        make_tuple(make_merge_transform(make_tuple(K0, YDotSlice, XDotSlice)),
+                   make_merge_transform(make_tuple(N, HTildaSlice, WTildaSlice)),
+                   make_pass_through_transform(K1)),
+        make_tuple(Sequence<5, 1, 3>{}, Sequence<0, 2, 4>{}, Sequence<6>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+#endif
+    // B: weight tensor
+    const auto wei_k_ydot_ytilda_xdot_xtilda_c_grid_desc = transform_tensor_descriptor(
+        wei_k_y_x_c_grid_desc,
+        make_tuple(make_pass_through_transform(K),
+                   make_embed_transform(make_tuple(YDot, YTilda),
+                                        make_tuple(ConvStrideH / GcdStrideDilationH, I1)),
+                   make_embed_transform(make_tuple(XDot, XTilda),
+                                        make_tuple(ConvStrideW / GcdStrideDilationW, I1)),
+                   make_pass_through_transform(C)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
+    const auto wei_k0_k1_ydotslice_xdotslice_c_grid_desc =
+        transform_tensor_descriptor(wei_k_ydot_ytilda_xdot_xtilda_c_grid_desc,
+                                    make_tuple(make_unmerge_transform(make_tuple(K0, K1)),
+                                               make_slice_transform(YDot, I0, YDotSlice),
+                                               make_slice_transform(XDot, I0, XDotSlice),
+                                               make_freeze_transform(IYTilda),
+                                               make_freeze_transform(IXTilda),
+                                               make_pass_through_transform(C)),
+                                    make_tuple(Sequence<0>{},
+                                               Sequence<1>{},
+                                               Sequence<3>{},
+                                               Sequence<2>{},
+                                               Sequence<4>{},
+                                               Sequence<5>{}),
+                                    make_tuple(Sequence<0, 1>{},
+                                               Sequence<2>{},
+                                               Sequence<3>{},
+                                               Sequence<>{},
+                                               Sequence<>{},
+                                               Sequence<4>{}));
+#if 1
+    const auto wei_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
+        wei_k0_k1_ydotslice_xdotslice_c_grid_desc,
+        make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K0)),
+                   make_pass_through_transform(C),
+                   make_pass_through_transform(K1)),
+        make_tuple(Sequence<2, 3, 0>{}, Sequence<4>{}, Sequence<1>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+#else
+    const auto wei_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
+        wei_k0_k1_ydotslice_xdotslice_c_grid_desc,
+        make_tuple(make_merge_transform(make_tuple(K0, YDotSlice, XDotSlice)),
+                   make_pass_through_transform(C),
+                   make_pass_through_transform(K1)),
+        make_tuple(Sequence<0, 2, 3>{}, Sequence<4>{}, Sequence<1>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+#endif
+    // C: input tensor
+    const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor(
+        in_n_hi_wi_c_grid_desc,
+        make_tuple(make_pass_through_transform(N),
+                   make_pad_transform(Hi, InLeftPadH, InRightPadH),
+                   make_pad_transform(Wi, InLeftPadW, InRightPadW),
+                   make_pass_through_transform(C)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+    const auto in_n_ytilda_htilda_xtilda_wtilda_c_grid_desc = transform_tensor_descriptor(
+        in_n_hip_wip_c_grid_desc,
+        make_tuple(make_pass_through_transform(N),
+                   make_embed_transform(make_tuple(YTilda, HTilda),
+                                        make_tuple(ConvDilationH, ConvStrideH)),
+                   make_embed_transform(make_tuple(XTilda, WTilda),
+                                        make_tuple(ConvDilationW, ConvStrideW)),
+                   make_pass_through_transform(C)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
+    const auto in_n_htildaslice_wtildaslice_c_grid_desc = transform_tensor_descriptor(
+        in_n_ytilda_htilda_xtilda_wtilda_c_grid_desc,
+        make_tuple(make_pass_through_transform(N),
+                   make_freeze_transform(IYTilda),
+                   make_slice_transform(HTilda, IHTildaSliceBegin, HTildaSlice),
+                   make_freeze_transform(IXTilda),
+                   make_slice_transform(WTilda, IWTildaSliceBegin, WTildaSlice),
+                   make_pass_through_transform(C)),
+        make_tuple(Sequence<0>{},
+                   Sequence<1>{},
+                   Sequence<2>{},
+                   Sequence<3>{},
+                   Sequence<4>{},
+                   Sequence<5>{}),
+        make_tuple(Sequence<0>{},
+                   Sequence<>{},
+                   Sequence<1>{},
+                   Sequence<>{},
+                   Sequence<2>{},
+                   Sequence<3>{}));
+    const auto in_gemmm_gemmn_grid_desc = transform_tensor_descriptor(
+        in_n_htildaslice_wtildaslice_c_grid_desc,
+        make_tuple(make_merge_transform(make_tuple(N, HTildaSlice, WTildaSlice)),
+                   make_pass_through_transform(C)),
+        make_tuple(Sequence<0, 1, 2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}));
+    return make_tuple(out_gemmk0_gemmm_gemmk1_grid_desc,
+                      wei_gemmk0_gemmn_gemmk1_grid_desc,
+                      in_gemmm_gemmn_grid_desc);
+}
+} // namespace ck
+#endif
--- a/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp
+++ b/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp
+#ifndef CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4_NCHW_KCYX_NKHW_HPP
+#define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4_NCHW_KCYX_NKHW_HPP
+#include "common_header.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"
+namespace ck {
+// GemmM = K
+// GemmN = N * Ho * Wo
+// GemmK = C * Y * X
+template <typename... Wei,
+          typename... In,
+          typename... Out,
+          typename ConvStrides,
+          typename ConvDilations,
+          typename InLeftPads,
+          typename InRightPads>
+__host__ __device__ constexpr auto transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_pad(
+    const TensorDescriptor<Wei...>& wei_k_c_y_x_global_desc,
+    const TensorDescriptor<In...>& in_n_c_hi_wi_global_desc,
+    const TensorDescriptor<Out...>& out_n_k_ho_wo_global_desc,
+    const ConvStrides& conv_strides,
+    const ConvDilations& conv_dilations,
+    const InLeftPads& in_left_pads,
+    const InRightPads& in_right_pads)
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+    const auto N = in_n_c_hi_wi_global_desc.GetLength(I0);
+    const auto C = in_n_c_hi_wi_global_desc.GetLength(I1);
+    const auto K = out_n_k_ho_wo_global_desc.GetLength(I1);
+    const auto Hi = in_n_c_hi_wi_global_desc.GetLength(I2);
+    const auto Wi = in_n_c_hi_wi_global_desc.GetLength(I3);
+    const auto Ho = out_n_k_ho_wo_global_desc.GetLength(I2);
+    const auto Wo = out_n_k_ho_wo_global_desc.GetLength(I3);
+    const auto Y = wei_k_c_y_x_global_desc.GetLength(I2);
+    const auto X = wei_k_c_y_x_global_desc.GetLength(I3);
+    const auto ConvStrideH = conv_strides[I0];
+    const auto ConvStrideW = conv_strides[I1];
+    const auto ConvDilationH = conv_dilations[I0];
+    const auto ConvDilationW = conv_dilations[I1];
+    const auto InLeftPadH = in_left_pads[I0];
+    const auto InLeftPadW = in_left_pads[I1];
+    const auto InRightPadH = in_right_pads[I0];
+    const auto InRightPadW = in_right_pads[I1];
+    // weight tensor
+    const auto wei_gemmk_gemmm_global_desc = transform_tensor_descriptor(
+        make_naive_tensor_descriptor_packed(make_tuple(K, C * Y * X)),
+        make_tuple(make_pass_through_transform(K), make_pass_through_transform(C * Y * X)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}),
+        make_tuple(Sequence<1>{}, Sequence<0>{}));
+    // input tensor
+    const auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor(
+        in_n_c_hi_wi_global_desc,
+        make_tuple(make_pass_through_transform(N),
+                   make_pass_through_transform(C),
+                   make_pad_transform(Hi, InLeftPadH, InRightPadH),
+                   make_pad_transform(Wi, InLeftPadW, InRightPadW)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+    const auto in_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor(
+        in_n_c_hip_wip_global_desc,
+        make_tuple(make_pass_through_transform(N),
+                   make_pass_through_transform(C),
+                   make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
+                   make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW))),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
+    const auto in_gemmk_gemmn_global_desc =
+        transform_tensor_descriptor(in_n_c_y_ho_x_wo_global_desc,
+                                    make_tuple(make_merge_transform(make_tuple(C, Y, X)),
+                                               make_merge_transform(make_tuple(N, Ho, Wo))),
+                                    make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
+                                    make_tuple(Sequence<0>{}, Sequence<1>{}));
+    // output tensor
+    const auto out_gemmm_gemmn_global_desc = transform_tensor_descriptor(
+        make_naive_tensor_descriptor_packed(make_tuple(N, K, Ho * Wo)),
+        make_tuple(make_pass_through_transform(K), make_merge_transform(make_tuple(N, Ho * Wo))),
+        make_tuple(Sequence<1>{}, Sequence<0, 2>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}));
+    return make_tuple(
+        wei_gemmk_gemmm_global_desc, in_gemmk_gemmn_global_desc, out_gemmm_gemmn_global_desc);
+}
+template <typename... Wei,
+          typename... In,
+          typename... Out,
+          typename ConvStrides,
+          typename ConvDilations,
+          typename InLeftPads,
+          typename InRightPads>
+__host__ __device__ constexpr auto
+transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_no_pad(
+    const TensorDescriptor<Wei...>& wei_k_c_y_x_global_desc,
+    const TensorDescriptor<In...>& in_n_c_hi_wi_global_desc,
+    const TensorDescriptor<Out...>& out_n_k_ho_wo_global_desc,
+    const ConvStrides& conv_strides,
+    const ConvDilations& conv_dilations,
+    const InLeftPads& in_left_pads,
+    const InRightPads& in_right_pads)
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+    const auto N = in_n_c_hi_wi_global_desc.GetLength(I0);
+    const auto C = in_n_c_hi_wi_global_desc.GetLength(I1);
+    const auto K = out_n_k_ho_wo_global_desc.GetLength(I1);
+    const auto Ho = out_n_k_ho_wo_global_desc.GetLength(I2);
+    const auto Wo = out_n_k_ho_wo_global_desc.GetLength(I3);
+    const auto Y = wei_k_c_y_x_global_desc.GetLength(I2);
+    const auto X = wei_k_c_y_x_global_desc.GetLength(I3);
+    const auto ConvStrideH = conv_strides[I0];
+    const auto ConvStrideW = conv_strides[I1];
+    const auto ConvDilationH = conv_dilations[I0];
+    const auto ConvDilationW = conv_dilations[I1];
+    const auto InLeftPadH = in_left_pads[I0];
+    const auto InLeftPadW = in_left_pads[I1];
+    const auto InRightPadH = in_right_pads[I0];
+    const auto InRightPadW = in_right_pads[I1];
+    assert(InLeftPadH == 0 && InLeftPadW == 0 && InRightPadH == 0 && InRightPadW == 0);
+    // weight tensor
+    const auto wei_gemmk_gemmm_global_desc = transform_tensor_descriptor(
+        make_naive_tensor_descriptor_packed(make_tuple(K, C * Y * X)),
+        make_tuple(make_pass_through_transform(K), make_pass_through_transform(C * Y * X)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}),
+        make_tuple(Sequence<1>{}, Sequence<0>{}));
+    // input tensor
+    const auto in_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor(
+        in_n_c_hi_wi_global_desc,
+        make_tuple(make_pass_through_transform(N),
+                   make_pass_through_transform(C),
+                   make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
+                   make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW))),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
+    const auto in_gemmk_gemmn_global_desc =
+        transform_tensor_descriptor(in_n_c_y_ho_x_wo_global_desc,
+                                    make_tuple(make_merge_transform(make_tuple(C, Y, X)),
+                                               make_merge_transform(make_tuple(N, Ho, Wo))),
+                                    make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
+                                    make_tuple(Sequence<0>{}, Sequence<1>{}));
+    // output tensor
+    const auto out_gemmm_gemmn_global_desc = transform_tensor_descriptor(
+        make_naive_tensor_descriptor_packed(make_tuple(N, K, Ho * Wo)),
+        make_tuple(make_pass_through_transform(K), make_merge_transform(make_tuple(N, Ho * Wo))),
+        make_tuple(Sequence<1>{}, Sequence<0, 2>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}));
+    return make_tuple(
+        wei_gemmk_gemmm_global_desc, in_gemmk_gemmn_global_desc, out_gemmm_gemmn_global_desc);
+}
+template <typename... Wei,
+          typename... In,
+          typename... Out,
+          typename ConvStrides,
+          typename ConvDilations,
+          typename InLeftPads,
+          typename InRightPads>
+__host__ __device__ constexpr auto transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_1x1(
+    const TensorDescriptor<Wei...>& wei_k_c_y_x_global_desc,
+    const TensorDescriptor<In...>& in_n_c_hi_wi_global_desc,
+    const TensorDescriptor<Out...>& out_n_k_ho_wo_global_desc,
+    const ConvStrides& conv_strides,
+    const ConvDilations& conv_dilations,
+    const InLeftPads& in_left_pads,
+    const InRightPads& in_right_pads)
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+    const auto N = in_n_c_hi_wi_global_desc.GetLength(I0);
+    const auto C = in_n_c_hi_wi_global_desc.GetLength(I1);
+    const auto K = out_n_k_ho_wo_global_desc.GetLength(I1);
+    const auto Ho = out_n_k_ho_wo_global_desc.GetLength(I2);
+    const auto Wo = out_n_k_ho_wo_global_desc.GetLength(I3);
+    const auto Y = wei_k_c_y_x_global_desc.GetLength(I2);
+    const auto X = wei_k_c_y_x_global_desc.GetLength(I3);
+    const auto ConvStrideH = conv_strides[I0];
+    const auto ConvStrideW = conv_strides[I1];
+    const auto ConvDilationH = conv_dilations[I0];
+    const auto ConvDilationW = conv_dilations[I1];
+    const auto InLeftPadH = in_left_pads[I0];
+    const auto InLeftPadW = in_left_pads[I1];
+    const auto InRightPadH = in_right_pads[I0];
+    const auto InRightPadW = in_right_pads[I1];
+    assert(Y == 1 && X == 1 && ConvStrideH == 1 && ConvStrideW == 1 && ConvDilationH == 1 &&
+           ConvDilationW == 1 && InLeftPadH == 0 && InLeftPadW == 0 && InRightPadH == 0 &&
+           InRightPadW == 0);
+    // weight tensor
+    const auto wei_gemmk_gemmm_global_desc = transform_tensor_descriptor(
+        make_naive_tensor_descriptor_packed(make_tuple(K, C)),
+        make_tuple(make_pass_through_transform(K), make_pass_through_transform(C)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}),
+        make_tuple(Sequence<1>{}, Sequence<0>{}));
+    // input tensor
+    const auto in_gemmk_gemmn_global_desc = transform_tensor_descriptor(
+        in_n_c_hi_wi_global_desc,
+        make_tuple(make_pass_through_transform(C), make_merge_transform(make_tuple(N, Ho, Wo))),
+        make_tuple(Sequence<1>{}, Sequence<0, 2, 3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}));
+    // output tensor
+    const auto out_gemmm_gemmn_global_desc = transform_tensor_descriptor(
+        make_naive_tensor_descriptor_packed(make_tuple(N, K, Ho * Wo)),
+        make_tuple(make_pass_through_transform(K), make_merge_transform(make_tuple(N, Ho * Wo))),
+        make_tuple(Sequence<1>{}, Sequence<0, 2>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}));
+    return make_tuple(
+        wei_gemmk_gemmm_global_desc, in_gemmk_gemmn_global_desc, out_gemmm_gemmn_global_desc);
+}
+} // namespace ck
+#endif
--- a/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4_nhwc_kyxc_nhwk.hpp
+++ b/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4_nhwc_kyxc_nhwk.hpp
+#ifndef CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4_NHWC_KYXC_NHWK_HPP
+#define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4_NHWC_KYXC_NHWK_HPP
+#include "common_header.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"
+namespace ck {
+// GemmM = K
+// GemmN = N * Ho * Wo
+// GemmK = C * Y * X
+template <typename... Wei,
+          typename... In,
+          typename... Out,
+          typename ConvStrides,
+          typename ConvDilations,
+          typename InLeftPads,
+          typename InRightPads>
+__host__ __device__ constexpr auto transform_forward_convolution_into_gemm_v4r4_nhwc_kyxc_nhwk_pad(
+    const TensorDescriptor<Wei...>& wei_k_y_x_c_grid_desc,
+    const TensorDescriptor<In...>& in_n_hi_wi_c_grid_desc,
+    const TensorDescriptor<Out...>& out_n_ho_wo_k_grid_desc,
+    const ConvStrides& conv_strides,
+    const ConvDilations& conv_dilations,
+    const InLeftPads& in_left_pads,
+    const InRightPads& in_right_pads)
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+    const auto N = in_n_hi_wi_c_grid_desc.GetLength(I0);
+    const auto C = in_n_hi_wi_c_grid_desc.GetLength(I3);
+    const auto K = out_n_ho_wo_k_grid_desc.GetLength(I3);
+    const auto Hi = in_n_hi_wi_c_grid_desc.GetLength(I1);
+    const auto Wi = in_n_hi_wi_c_grid_desc.GetLength(I2);
+    const auto Ho = out_n_ho_wo_k_grid_desc.GetLength(I1);
+    const auto Wo = out_n_ho_wo_k_grid_desc.GetLength(I2);
+    const auto Y = wei_k_y_x_c_grid_desc.GetLength(I1);
+    const auto X = wei_k_y_x_c_grid_desc.GetLength(I2);
+    const auto ConvStrideH = conv_strides[I0];
+    const auto ConvStrideW = conv_strides[I1];
+    const auto ConvDilationH = conv_dilations[I0];
+    const auto ConvDilationW = conv_dilations[I1];
+    const auto InLeftPadH = in_left_pads[I0];
+    const auto InLeftPadW = in_left_pads[I1];
+    const auto InRightPadH = in_right_pads[I0];
+    const auto InRightPadW = in_right_pads[I1];
+    // weight tensor
+    const auto wei_gemmk_gemmm_grid_desc = transform_tensor_descriptor(
+        make_naive_tensor_descriptor_packed(make_tuple(K, Y * X * C)),
+        make_tuple(make_pass_through_transform(K), make_pass_through_transform(Y * X * C)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}),
+        make_tuple(Sequence<1>{}, Sequence<0>{}));
+    // input tensor
+    const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor(
+        in_n_hi_wi_c_grid_desc,
+        make_tuple(make_pass_through_transform(N),
+                   make_pad_transform(Hi, InLeftPadH, InRightPadH),
+                   make_pad_transform(Wi, InLeftPadW, InRightPadW),
+                   make_pass_through_transform(C)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+    const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor(
+        in_n_hip_wip_c_grid_desc,
+        make_tuple(make_pass_through_transform(N),
+                   make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
+                   make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)),
+                   make_pass_through_transform(C)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
+    const auto in_gemmk_gemmn_grid_desc =
+        transform_tensor_descriptor(in_n_y_ho_x_wo_c_grid_desc,
+                                    make_tuple(make_merge_transform(make_tuple(Y, X, C)),
+                                               make_merge_transform(make_tuple(N, Ho, Wo))),
+                                    make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}),
+                                    make_tuple(Sequence<0>{}, Sequence<1>{}));
+    // output tensor
+    const auto out_gemmm_gemmn_grid_desc = transform_tensor_descriptor(
+        make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)),
+        make_tuple(make_pass_through_transform(N * Ho * Wo), make_pass_through_transform(K)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}),
+        make_tuple(Sequence<1>{}, Sequence<0>{}));
+    return make_tuple(
+        wei_gemmk_gemmm_grid_desc, in_gemmk_gemmn_grid_desc, out_gemmm_gemmn_grid_desc);
+}
+template <typename... Wei,
+          typename... In,
+          typename... Out,
+          typename ConvStrides,
+          typename ConvDilations,
+          typename InLeftPads,
+          typename InRightPads>
+__host__ __device__ constexpr auto transform_forward_convolution_into_gemm_v4r4_nhwc_kyxc_nhwk_1x1(
+    const TensorDescriptor<Wei...>& wei_k_y_x_c_grid_desc,
+    const TensorDescriptor<In...>& in_n_hi_wi_c_grid_desc,
+    const TensorDescriptor<Out...>& out_n_ho_wo_k_grid_desc,
+    const ConvStrides& conv_strides,
+    const ConvDilations& conv_dilations,
+    const InLeftPads& in_left_pads,
+    const InRightPads& in_right_pads)
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+    const auto N = in_n_hi_wi_c_grid_desc.GetLength(I0);
+    const auto C = in_n_hi_wi_c_grid_desc.GetLength(I3);
+    const auto K = out_n_ho_wo_k_grid_desc.GetLength(I3);
+    const auto Ho = out_n_ho_wo_k_grid_desc.GetLength(I1);
+    const auto Wo = out_n_ho_wo_k_grid_desc.GetLength(I2);
+    const auto Y = wei_k_y_x_c_grid_desc.GetLength(I1);
+    const auto X = wei_k_y_x_c_grid_desc.GetLength(I2);
+    const auto ConvStrideH = conv_strides[I0];
+    const auto ConvStrideW = conv_strides[I1];
+    const auto ConvDilationH = conv_dilations[I0];
+    const auto ConvDilationW = conv_dilations[I1];
+    const auto InLeftPadH = in_left_pads[I0];
+    const auto InLeftPadW = in_left_pads[I1];
+    const auto InRightPadH = in_right_pads[I0];
+    const auto InRightPadW = in_right_pads[I1];
+    assert(Y == 1 && X == 1 && ConvStrideH == 1 && ConvStrideW == 1 && ConvDilationH == 1 &&
+           ConvDilationW == 1 && InLeftPadH == 0 && InLeftPadW == 0 && InRightPadH == 0 &&
+           InRightPadW == 0);
+    // weight tensor
+    const auto wei_gemmk_gemmm_grid_desc = transform_tensor_descriptor(
+        make_naive_tensor_descriptor_packed(make_tuple(K, C)),
+        make_tuple(make_pass_through_transform(K), make_pass_through_transform(C)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}),
+        make_tuple(Sequence<1>{}, Sequence<0>{}));
+    // input tensor
+    const auto in_gemmk_gemmn_grid_desc = transform_tensor_descriptor(
+        make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, C)),
+        make_tuple(make_pass_through_transform(N * Ho * Wo), make_pass_through_transform(C)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}),
+        make_tuple(Sequence<1>{}, Sequence<0>{}));
+    // output tensor
+    const auto out_gemmm_gemmn_grid_desc = transform_tensor_descriptor(
+        make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)),
+        make_tuple(make_pass_through_transform(N * Ho * Wo), make_pass_through_transform(K)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}),
+        make_tuple(Sequence<1>{}, Sequence<0>{}));
+    return make_tuple(
+        wei_gemmk_gemmm_grid_desc, in_gemmk_gemmn_grid_desc, out_gemmm_gemmn_grid_desc);
+}
+} // namespace ck
+#endif
--- a/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp
+++ b/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp
+#ifndef CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R2_NCHW_KCYX_NKHW_HPP
+#define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R2_NCHW_KCYX_NKHW_HPP
+#include "common_header.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"
+namespace ck {
+// GemmM = K
+// GemmN = N * Ho * Wo
+// GemmK = C * Y * X
+template <typename... Wei,
+          typename... In,
+          typename... Out,
+          typename ConvStrides,
+          typename ConvDilations,
+          typename InLeftPads,
+          typename InRightPads,
+          index_t GemmK1Value>
+__host__ __device__ constexpr auto
+transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw_pad(
+    const TensorDescriptor<Wei...>& wei_k_c_y_x_grid_desc,
+    const TensorDescriptor<In...>& in_n_c_hi_wi_grid_desc,
+    const TensorDescriptor<Out...>& out_n_k_ho_wo_grid_desc,
+    const ConvStrides& conv_strides,
+    const ConvDilations& conv_dilations,
+    const InLeftPads& in_left_pads,
+    const InRightPads& in_right_pads,
+    Number<GemmK1Value>)
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+    constexpr auto GemmK1 = Number<GemmK1Value>{};
+    const auto N = in_n_c_hi_wi_grid_desc.GetLength(I0);
+    const auto C = in_n_c_hi_wi_grid_desc.GetLength(I1);
+    const auto K = out_n_k_ho_wo_grid_desc.GetLength(I1);
+    const auto Hi = in_n_c_hi_wi_grid_desc.GetLength(I2);
+    const auto Wi = in_n_c_hi_wi_grid_desc.GetLength(I3);
+    const auto Ho = out_n_k_ho_wo_grid_desc.GetLength(I2);
+    const auto Wo = out_n_k_ho_wo_grid_desc.GetLength(I3);
+    const auto Y = wei_k_c_y_x_grid_desc.GetLength(I2);
+    const auto X = wei_k_c_y_x_grid_desc.GetLength(I3);
+    const auto ConvStrideH = conv_strides[I0];
+    const auto ConvStrideW = conv_strides[I1];
+    const auto ConvDilationH = conv_dilations[I0];
+    const auto ConvDilationW = conv_dilations[I1];
+    const auto InLeftPadH = in_left_pads[I0];
+    const auto InLeftPadW = in_left_pads[I1];
+    const auto InRightPadH = in_right_pads[I0];
+    const auto InRightPadW = in_right_pads[I1];
+    const auto GemmM  = K;
+    const auto GemmN  = N * Ho * Wo;
+    const auto GemmK  = C * Y * X;
+    const auto GemmK0 = GemmK / GemmK1;
+    // weight tensor
+    const auto wei_gemmk_gemmm_grid_desc = transform_tensor_descriptor(
+        make_naive_tensor_descriptor_packed(make_tuple(K, C * Y * X)),
+        make_tuple(make_pass_through_transform(K), make_pass_through_transform(C * Y * X)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}),
+        make_tuple(Sequence<1>{}, Sequence<0>{}));
+    const auto wei_gemmk0_gemmm_gemmk1_grid_desc =
+        transform_tensor_descriptor(wei_gemmk_gemmm_grid_desc,
+                                    make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)),
+                                               make_pass_through_transform(GemmM)),
+                                    make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                    make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+    // input tensor
+    const auto in_n_c_hip_wip_grid_desc = transform_tensor_descriptor(
+        in_n_c_hi_wi_grid_desc,
+        make_tuple(make_pass_through_transform(N),
+                   make_pass_through_transform(C),
+                   make_pad_transform(Hi, InLeftPadH, InRightPadH),
+                   make_pad_transform(Wi, InLeftPadW, InRightPadW)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+    const auto in_n_c_y_ho_x_wo_grid_desc = transform_tensor_descriptor(
+        in_n_c_hip_wip_grid_desc,
+        make_tuple(make_pass_through_transform(N),
+                   make_pass_through_transform(C),
+                   make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
+                   make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW))),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
+    const auto in_gemmk_gemmn_grid_desc =
+        transform_tensor_descriptor(in_n_c_y_ho_x_wo_grid_desc,
+                                    make_tuple(make_merge_transform(make_tuple(C, Y, X)),
+                                               make_merge_transform(make_tuple(N, Ho, Wo))),
+                                    make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
+                                    make_tuple(Sequence<0>{}, Sequence<1>{}));
+    const auto in_gemmk0_gemmn_gemmk1_grid_desc =
+        transform_tensor_descriptor(in_gemmk_gemmn_grid_desc,
+                                    make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)),
+                                               make_pass_through_transform(GemmN)),
+                                    make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                    make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+    // output tensor
+    const auto out_gemmm_gemmn_grid_desc = transform_tensor_descriptor(
+        make_naive_tensor_descriptor_packed(make_tuple(N, K, Ho * Wo)),
+        make_tuple(make_pass_through_transform(K), make_merge_transform(make_tuple(N, Ho * Wo))),
+        make_tuple(Sequence<1>{}, Sequence<0, 2>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}));
+    return make_tuple(wei_gemmk0_gemmm_gemmk1_grid_desc,
+                      in_gemmk0_gemmn_gemmk1_grid_desc,
+                      out_gemmm_gemmn_grid_desc);
+}
+} // namespace ck
+#endif
--- a/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp
+++ b/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp
+#ifndef CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R2_NHWC_KYXC_NHWK_HPP
+#define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R2_NHWC_KYXC_NHWK_HPP
+#include "common_header.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"
+namespace ck {
+// GemmM = K
+// GemmN = N * Ho * Wo
+// GemmK = C * Y * X
+template <typename... Wei,
+          typename... In,
+          typename... Out,
+          typename ConvStrides,
+          typename ConvDilations,
+          typename InLeftPads,
+          typename InRightPads,
+          index_t GemmK1Value>
+__host__ __device__ constexpr auto
+transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk_pad(
+    const TensorDescriptor<Wei...>& wei_k_y_x_c_grid_desc,
+    const TensorDescriptor<In...>& in_n_hi_wi_c_grid_desc,
+    const TensorDescriptor<Out...>& out_n_ho_wo_k_grid_desc,
+    const ConvStrides& conv_strides,
+    const ConvDilations& conv_dilations,
+    const InLeftPads& in_left_pads,
+    const InRightPads& in_right_pads,
+    Number<GemmK1Value>)
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+    constexpr auto GemmK1 = Number<GemmK1Value>{};
+    const auto N = in_n_hi_wi_c_grid_desc.GetLength(I0);
+    const auto C = in_n_hi_wi_c_grid_desc.GetLength(I3);
+    const auto K = out_n_ho_wo_k_grid_desc.GetLength(I3);
+    const auto Hi = in_n_hi_wi_c_grid_desc.GetLength(I1);
+    const auto Wi = in_n_hi_wi_c_grid_desc.GetLength(I2);
+    const auto Ho = out_n_ho_wo_k_grid_desc.GetLength(I1);
+    const auto Wo = out_n_ho_wo_k_grid_desc.GetLength(I2);
+    const auto Y = wei_k_y_x_c_grid_desc.GetLength(I1);
+    const auto X = wei_k_y_x_c_grid_desc.GetLength(I2);
+    const auto ConvStrideH = conv_strides[I0];
+    const auto ConvStrideW = conv_strides[I1];
+    const auto ConvDilationH = conv_dilations[I0];
+    const auto ConvDilationW = conv_dilations[I1];
+    const auto InLeftPadH = in_left_pads[I0];
+    const auto InLeftPadW = in_left_pads[I1];
+    const auto InRightPadH = in_right_pads[I0];
+    const auto InRightPadW = in_right_pads[I1];
+    const auto GemmM  = K;
+    const auto GemmN  = N * Ho * Wo;
+    const auto GemmK  = C * Y * X;
+    const auto GemmK0 = GemmK / GemmK1;
+    // weight tensor
+    const auto wei_gemmk_gemmm_grid_desc = transform_tensor_descriptor(
+        make_naive_tensor_descriptor_packed(make_tuple(K, Y * X * C)),
+        make_tuple(make_pass_through_transform(K), make_pass_through_transform(Y * X * C)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}),
+        make_tuple(Sequence<1>{}, Sequence<0>{}));
+    const auto wei_gemmk0_gemmm_gemmk1_grid_desc =
+        transform_tensor_descriptor(wei_gemmk_gemmm_grid_desc,
+                                    make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)),
+                                               make_pass_through_transform(GemmM)),
+                                    make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                    make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+    // input tensor
+    const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor(
+        in_n_hi_wi_c_grid_desc,
+        make_tuple(make_pass_through_transform(N),
+                   make_pad_transform(Hi, InLeftPadH, InRightPadH),
+                   make_pad_transform(Wi, InLeftPadW, InRightPadW),
+                   make_pass_through_transform(C)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+    const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor(
+        in_n_hip_wip_c_grid_desc,
+        make_tuple(make_pass_through_transform(N),
+                   make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
+                   make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)),
+                   make_pass_through_transform(C)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
+    const auto in_gemmk_gemmn_grid_desc =
+        transform_tensor_descriptor(in_n_y_ho_x_wo_c_grid_desc,
+                                    make_tuple(make_merge_transform(make_tuple(Y, X, C)),
+                                               make_merge_transform(make_tuple(N, Ho, Wo))),
+                                    make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}),
+                                    make_tuple(Sequence<0>{}, Sequence<1>{}));
+    const auto in_gemmk0_gemmn_gemmk1_grid_desc =
+        transform_tensor_descriptor(in_gemmk_gemmn_grid_desc,
+                                    make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)),
+                                               make_pass_through_transform(GemmN)),
+                                    make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                    make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+    // output tensor
+    const auto out_gemmm_gemmn_grid_desc = transform_tensor_descriptor(
+        make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)),
+        make_tuple(make_pass_through_transform(N * Ho * Wo), make_pass_through_transform(K)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}),
+        make_tuple(Sequence<1>{}, Sequence<0>{}));
+    return make_tuple(wei_gemmk0_gemmm_gemmk1_grid_desc,
+                      in_gemmk0_gemmn_gemmk1_grid_desc,
+                      out_gemmm_gemmn_grid_desc);
+}
+} // namespace ck
+#endif
--- a/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp
+++ b/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp
+#ifndef CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R4_NHWC_KYXC_NHWK_HPP
+#define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R4_NHWC_KYXC_NHWK_HPP
+#include "common_header.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"
+namespace ck {
+// A: in
+// B: wei
+// C: out
+// GemmM = N * Ho * Wo
+// GemmN = K
+// GemmK = Y * X * C
+template <typename... In,
+          typename... Wei,
+          typename... Out,
+          typename ConvStrides,
+          typename ConvDilations,
+          typename InLeftPads,
+          typename InRightPads,
+          index_t GemmK1Value>
+__host__ __device__ constexpr auto
+transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk_pad(
+    const TensorDescriptor<In...>& in_n_hi_wi_c_grid_desc,
+    const TensorDescriptor<Wei...>& wei_k_y_x_c_grid_desc,
+    const TensorDescriptor<Out...>& out_n_ho_wo_k_grid_desc,
+    const ConvStrides& conv_strides,
+    const ConvDilations& conv_dilations,
+    const InLeftPads& in_left_pads,
+    const InRightPads& in_right_pads,
+    Number<GemmK1Value>)
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+    constexpr auto GemmK1 = Number<GemmK1Value>{};
+    const auto N = in_n_hi_wi_c_grid_desc.GetLength(I0);
+    const auto C = in_n_hi_wi_c_grid_desc.GetLength(I3);
+    const auto K = out_n_ho_wo_k_grid_desc.GetLength(I3);
+    const auto Hi = in_n_hi_wi_c_grid_desc.GetLength(I1);
+    const auto Wi = in_n_hi_wi_c_grid_desc.GetLength(I2);
+    const auto Ho = out_n_ho_wo_k_grid_desc.GetLength(I1);
+    const auto Wo = out_n_ho_wo_k_grid_desc.GetLength(I2);
+    const auto Y = wei_k_y_x_c_grid_desc.GetLength(I1);
+    const auto X = wei_k_y_x_c_grid_desc.GetLength(I2);
+    const auto ConvStrideH = conv_strides[I0];
+    const auto ConvStrideW = conv_strides[I1];
+    const auto ConvDilationH = conv_dilations[I0];
+    const auto ConvDilationW = conv_dilations[I1];
+    const auto InLeftPadH = in_left_pads[I0];
+    const auto InLeftPadW = in_left_pads[I1];
+    const auto InRightPadH = in_right_pads[I0];
+    const auto InRightPadW = in_right_pads[I1];
+    const auto GemmM  = N * Ho * Wo;
+    const auto GemmN  = K;
+    const auto GemmK  = Y * X * C;
+    const auto GemmK0 = GemmK / GemmK1;
+    // A: input tensor
+    const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor(
+        in_n_hi_wi_c_grid_desc,
+        make_tuple(make_pass_through_transform(N),
+                   make_pad_transform(Hi, InLeftPadH, InRightPadH),
+                   make_pad_transform(Wi, InLeftPadW, InRightPadW),
+                   make_pass_through_transform(C)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+    const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor(
+        in_n_hip_wip_c_grid_desc,
+        make_tuple(make_pass_through_transform(N),
+                   make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
+                   make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)),
+                   make_pass_through_transform(C)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
+    const auto in_gemmk_gemmm_grid_desc =
+        transform_tensor_descriptor(in_n_y_ho_x_wo_c_grid_desc,
+                                    make_tuple(make_merge_transform(make_tuple(Y, X, C)),
+                                               make_merge_transform(make_tuple(N, Ho, Wo))),
+                                    make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}),
+                                    make_tuple(Sequence<0>{}, Sequence<1>{}));
+    const auto in_gemmk0_gemmm_gemmk1_grid_desc =
+        transform_tensor_descriptor(in_gemmk_gemmm_grid_desc,
+                                    make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)),
+                                               make_pass_through_transform(GemmM)),
+                                    make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                    make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+    // B: weight tensor
+    const auto wei_gemmk_gemmn_grid_desc = transform_tensor_descriptor(
+        make_naive_tensor_descriptor_packed(make_tuple(K, Y * X * C)),
+        make_tuple(make_pass_through_transform(K), make_pass_through_transform(Y * X * C)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}),
+        make_tuple(Sequence<1>{}, Sequence<0>{}));
+    const auto wei_gemmk0_gemmn_gemmk1_grid_desc =
+        transform_tensor_descriptor(wei_gemmk_gemmn_grid_desc,
+                                    make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)),
+                                               make_pass_through_transform(GemmN)),
+                                    make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                    make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+    // C: output tensor
+    const auto out_gemmm_gemmn_grid_desc = transform_tensor_descriptor(
+        make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)),
+        make_tuple(make_pass_through_transform(N * Ho * Wo), make_pass_through_transform(K)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}));
+    return make_tuple(in_gemmk0_gemmm_gemmk1_grid_desc,
+                      wei_gemmk0_gemmn_gemmk1_grid_desc,
+                      out_gemmm_gemmn_grid_desc);
+}
+} // namespace ck
+#endif
--- a/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp
+++ b/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp
+#ifndef CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_CONTRACTION_V6R1_NCHW_KCYX_NKHW_HPP
+#define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_CONTRACTION_V6R1_NCHW_KCYX_NKHW_HPP
+#include "common_header.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"
+namespace ck {
+// GemmM0 = 1
+// GemmM1 = K
+// GemmN0 = N0
+// GemmN1 = (N / N0) * Ho * Wo
+// GemmK0 = (C / C0) * Y * X
+// GemmK1 = C0
+template <typename... Wei,
+          typename... In,
+          typename... Out,
+          typename ConvStrides,
+          typename ConvDilations,
+          typename InLeftPads,
+          typename InRightPads,
+          typename N0Type,
+          typename C0Type>
+__host__ __device__ constexpr auto
+transform_forward_convolution_into_contraction_v6r1_nchw_kcyx_nkhw_pad(
+    const TensorDescriptor<Wei...>& wei_k_c_y_x_grid_desc,
+    const TensorDescriptor<In...>& in_n_c_hi_wi_grid_desc,
+    const TensorDescriptor<Out...>& out_n_k_ho_wo_grid_desc,
+    const ConvStrides& conv_strides,
+    const ConvDilations& conv_dilations,
+    const InLeftPads& in_left_pads,
+    const InRightPads& in_right_pads,
+    const N0Type& N0,
+    const C0Type& C0)
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+    const auto N = in_n_c_hi_wi_grid_desc.GetLength(I0);
+    const auto C = in_n_c_hi_wi_grid_desc.GetLength(I1);
+    const auto K = out_n_k_ho_wo_grid_desc.GetLength(I1);
+    const auto Hi = in_n_c_hi_wi_grid_desc.GetLength(I2);
+    const auto Wi = in_n_c_hi_wi_grid_desc.GetLength(I3);
+    const auto Ho = out_n_k_ho_wo_grid_desc.GetLength(I2);
+    const auto Wo = out_n_k_ho_wo_grid_desc.GetLength(I3);
+    const auto Y = wei_k_c_y_x_grid_desc.GetLength(I2);
+    const auto X = wei_k_c_y_x_grid_desc.GetLength(I3);
+    const auto ConvStrideH = conv_strides[I0];
+    const auto ConvStrideW = conv_strides[I1];
+    const auto ConvDilationH = conv_dilations[I0];
+    const auto ConvDilationW = conv_dilations[I1];
+    const auto InLeftPadH = in_left_pads[I0];
+    const auto InLeftPadW = in_left_pads[I1];
+    const auto InRightPadH = in_right_pads[I0];
+    const auto InRightPadW = in_right_pads[I1];
+    const auto N1 = N / N0;
+    const auto C1 = C / C0;
+    // weight tensor
+    const auto wei_gk0_gm0_gm1_gk1_grid_desc =
+        transform_tensor_descriptor(make_naive_tensor_descriptor_packed(make_tuple(K, C * Y * X)),
+                                    make_tuple(make_unmerge_transform(make_tuple(I1, K)),
+                                               make_unmerge_transform(make_tuple(C0, C1 * Y * X))),
+                                    make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                    make_tuple(Sequence<1, 2>{}, Sequence<3, 0>{}));
+    // input tensor
+    const auto in_n_c_hip_wip_grid_desc = transform_tensor_descriptor(
+        in_n_c_hi_wi_grid_desc,
+        make_tuple(make_pass_through_transform(N),
+                   make_pass_through_transform(C),
+                   make_pad_transform(Hi, InLeftPadH, InRightPadH),
+                   make_pad_transform(Wi, InLeftPadW, InRightPadW)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+    const auto in_n0_n1_c0_c1_y_ho_x_wo_grid_desc = transform_tensor_descriptor(
+        in_n_c_hip_wip_grid_desc,
+        make_tuple(make_unmerge_transform(make_tuple(N0, N1)),
+                   make_unmerge_transform(make_tuple(C0, C1)),
+                   make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
+                   make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW))),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}, Sequence<6, 7>{}));
+    const auto in_gk0_gn0_gn1_gk1_grid_desc = transform_tensor_descriptor(
+        in_n0_n1_c0_c1_y_ho_x_wo_grid_desc,
+        make_tuple(make_merge_transform(make_tuple(C1, Y, X)),
+                   make_pass_through_transform(N0),
+                   make_merge_transform(make_tuple(N1, Ho, Wo)),
+                   make_pass_through_transform(C0)),
+        make_tuple(Sequence<3, 4, 6>{}, Sequence<0>{}, Sequence<1, 5, 7>{}, Sequence<2>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+    // output tensor
+    const auto out_n_k_howo_grid_desc =
+        make_naive_tensor_descriptor_packed(make_tuple(N, K, Ho * Wo));
+    const auto out_n0_n1_1_k_howo_grid_desc =
+        transform_tensor_descriptor(out_n_k_howo_grid_desc,
+                                    make_tuple(make_unmerge_transform(make_tuple(N0, N1)),
+                                               make_unmerge_transform(make_tuple(I1, K)),
+                                               make_pass_through_transform(Ho * Wo)),
+                                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+                                    make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}, Sequence<4>{}));
+    const auto out_gm0_gm1_gn0_gn1_grid_desc = transform_tensor_descriptor(
+        out_n0_n1_1_k_howo_grid_desc,
+        make_tuple(make_pass_through_transform(I1),
+                   make_pass_through_transform(K),
+                   make_pass_through_transform(N0),
+                   make_merge_transform_v2_magic_division(make_tuple(N1, Ho * Wo))),
+        make_tuple(Sequence<2>{}, Sequence<3>{}, Sequence<0>{}, Sequence<1, 4>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+    return make_tuple(
+        wei_gk0_gm0_gm1_gk1_grid_desc, in_gk0_gn0_gn1_gk1_grid_desc, out_gm0_gm1_gn0_gn1_grid_desc);
+}
+} // namespace ck
+#endif
--- a/composable_kernel/include/tensor_description/cluster_descriptor.hpp
+++ b/composable_kernel/include/tensor_description/cluster_descriptor.hpp
+#ifndef CK_CLUSTER_DESCRIPTOR_HPP
+#define CK_CLUSTER_DESCRIPTOR_HPP
+#include "common_header.hpp"
+#include "tensor_adaptor.hpp"
+namespace ck {
+template <typename Lengths,
+          typename ArrangeOrder = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type>
+__host__ __device__ constexpr auto make_cluster_descriptor(
+    const Lengths& lengths,
+    ArrangeOrder order = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type{})
+{
+    constexpr index_t ndim_low = Lengths::Size();
+    const auto reordered_lengths = container_reorder_given_new2old(lengths, order);
+    const auto low_lengths = generate_tuple(
+        [&](auto idim_low) { return reordered_lengths[idim_low]; }, Number<ndim_low>{});
+    const auto transform = make_merge_transform(low_lengths);
+    constexpr auto low_dim_old_top_ids = ArrangeOrder{};
+    constexpr auto up_dim_new_top_ids = Sequence<0>{};
+    return make_single_stage_tensor_adaptor(
+        make_tuple(transform), make_tuple(low_dim_old_top_ids), make_tuple(up_dim_new_top_ids));
+}
+} // namespace ck
+#endif
--- a/composable_kernel/include/tensor_description/multi_index_transform.hpp
+++ b/composable_kernel/include/tensor_description/multi_index_transform.hpp
+#ifndef CK_MULTI_INDEX_TRANSFORM_HPP
+#define CK_MULTI_INDEX_TRANSFORM_HPP
+#include "common_header.hpp"
+#include "multi_index.hpp"
+namespace ck {
+template <typename LowLength>
+struct PassThrough
+{
+    using LowerIndex = MultiIndex<1>;
+    using UpperIndex = MultiIndex<1>;
+    using UpLengths = decltype(make_tuple(LowLength{}));
+    UpLengths up_lengths_;
+    __host__ __device__ constexpr PassThrough() = default;
+    __host__ __device__ constexpr PassThrough(const LowLength& low_length)
+        : up_lengths_{make_tuple(low_length)}
+    {
+    }
+    __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
+    __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; }
+    __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; }
+    template <typename LowIdx, typename UpIdx>
+    __host__ __device__ static void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up)
+    {
+        static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1,
+                      "wrong! inconsistent # of dimension");
+        idx_low(Number<0>{}) = idx_up[Number<0>{}];
+    }
+    template <typename LowIdxDiff,
+              typename UpIdxDiff,
+              typename LowIdx,
+              typename UpIdx,
+              index_t Hack>
+    __host__ __device__ static void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
+                                                     const UpIdxDiff& idx_diff_up,
+                                                     LowIdx& idx_low,
+                                                     const UpIdx&,
+                                                     Number<Hack>)
+    {
+        static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
+                          UpIdx::Size() == 1,
+                      "wrong! inconsistent # of dimension");
+        constexpr auto I0 = Number<0>{};
+        idx_diff_low(I0) = idx_diff_up[I0];
+        idx_low += idx_diff_low;
+    }
+    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }
+    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
+    {
+        return true;
+    }
+    template <typename UpIdx>
+    __host__ __device__ static constexpr bool
+    IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */)
+    {
+        return true;
+    }
+    __host__ __device__ static constexpr bool IsKnownAtCompileTime()
+    {
+        return is_known_at_compile_time<UpLengths>::value;
+    }
+    __host__ __device__ void Print() const
+    {
+        printf("{");
+        printf("PassThrough, ");
+        printf("up_lengths_");
+        print_multi_index(up_lengths_);
+        printf("}");
+    }
+};
+template <typename LowLength,
+          typename LeftPadLength,
+          typename RightPadLength,
+          bool SkipIsValidCheck = false>
+struct Pad
+{
+    using LowerIndex = MultiIndex<1>;
+    using UpperIndex = MultiIndex<1>;
+    using UpLengths = decltype(make_tuple(LowLength{} + LeftPadLength{} + RightPadLength{}));
+    UpLengths up_lengths_;
+    LeftPadLength left_pad_length_;
+    RightPadLength right_pad_length_;
+    __host__ __device__ constexpr Pad() = default;
+    __host__ __device__ constexpr Pad(const LowLength& low_length,
+                                      const LeftPadLength& left_pad_length,
+                                      const RightPadLength& right_pad_length)
+        : up_lengths_{make_tuple(low_length + left_pad_length + right_pad_length)},
+          left_pad_length_{left_pad_length},
+          right_pad_length_{right_pad_length}
+    {
+    }
+    __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
+    __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; }
+    __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; }
+    template <typename LowIdx, typename UpIdx>
+    __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
+                                                           const UpIdx& idx_up) const
+    {
+        static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1,
+                      "wrong! inconsistent # of dimension");
+        idx_low(Number<0>{}) = idx_up[Number<0>{}] - left_pad_length_;
+    }
+    template <typename LowIdxDiff,
+              typename UpIdxDiff,
+              typename LowIdx,
+              typename UpIdx,
+              index_t Hack>
+    __host__ __device__ static void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
+                                                     const UpIdxDiff& idx_diff_up,
+                                                     LowIdx& idx_low,
+                                                     const UpIdx&,
+                                                     Number<Hack>)
+    {
+        static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
+                          UpIdx::Size() == 1,
+                      "wrong! inconsistent # of dimension");
+        constexpr auto I0 = Number<0>{};
+        idx_diff_low(I0) = idx_diff_up[I0];
+        idx_low += idx_diff_low;
+    }
+    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }
+    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
+    {
+        return SkipIsValidCheck;
+    }
+    template <typename UpIdx>
+    __host__ __device__ constexpr bool
+    IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& idx_up) const
+    {
+        return SkipIsValidCheck ||
+               ((idx_up[Number<0>{}] >= left_pad_length_) &&
+                (idx_up[Number<0>{}] < up_lengths_[Number<0>{}] - right_pad_length_));
+    }
+    __host__ __device__ static constexpr bool IsKnownAtCompileTime()
+    {
+        return is_known_at_compile_time<UpLengths>::value &&
+               is_known_at_compile_time<LeftPadLength>::value &&
+               is_known_at_compile_time<RightPadLength>::value;
+    }
+    __host__ __device__ void Print() const
+    {
+        printf("{");
+        printf("Pad, ");
+        printf("up_lengths_");
+        print_multi_index(up_lengths_);
+        printf("left_pad_length %d", index_t{left_pad_length_});
+        printf("right_pad_length %d", index_t{right_pad_length_});
+        printf("}");
+    }
+};
+template <typename LowLength, typename LeftPadLength, bool SkipIsValidCheck = false>
+struct LeftPad
+{
+    using LowerIndex = MultiIndex<1>;
+    using UpperIndex = MultiIndex<1>;
+    using UpLengths = decltype(make_tuple(LowLength{} + LeftPadLength{}));
+    UpLengths up_lengths_;
+    LeftPadLength left_pad_length_;
+    __host__ __device__ constexpr LeftPad() = default;
+    __host__ __device__ constexpr LeftPad(const LowLength& low_length,
+                                          const LeftPadLength& left_pad_length)
+        : up_lengths_{make_tuple(low_length + left_pad_length)}, left_pad_length_{left_pad_length}
+    {
+    }
+    __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
+    __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; }
+    __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; }
+    template <typename LowIdx, typename UpIdx>
+    __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
+                                                           const UpIdx& idx_up) const
+    {
+        static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1,
+                      "wrong! inconsistent # of dimension");
+        idx_low(Number<0>{}) = idx_up[Number<0>{}] - left_pad_length_;
+    }
+    template <typename LowIdxDiff,
+              typename UpIdxDiff,
+              typename LowIdx,
+              typename UpIdx,
+              index_t Hack>
+    __host__ __device__ static void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
+                                                     const UpIdxDiff& idx_diff_up,
+                                                     LowIdx& idx_low,
+                                                     const UpIdx&,
+                                                     Number<Hack>)
+    {
+        static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
+                          UpIdx::Size() == 1,
+                      "wrong! inconsistent # of dimension");
+        constexpr auto I0 = Number<0>{};
+        idx_diff_low(I0) = idx_diff_up[I0];
+        idx_low += idx_diff_low;
+    }
+    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }
+    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
+    {
+        return SkipIsValidCheck;
+    }
+    template <typename UpIdx>
+    __host__ __device__ constexpr bool
+    IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& idx_up) const
+    {
+        return SkipIsValidCheck || (idx_up[Number<0>{}] >= left_pad_length_);
+    }
+    __host__ __device__ static constexpr bool IsKnownAtCompileTime()
+    {
+        return is_known_at_compile_time<UpLengths>::value &&
+               is_known_at_compile_time<LeftPadLength>::value;
+    }
+    __host__ __device__ void Print() const
+    {
+        printf("{");
+        printf("LeftPad, ");
+        printf("up_lengths_");
+        print_multi_index(up_lengths_);
+        printf("left_pad_length_ %d", index_t{left_pad_length_});
+        printf("}");
+    }
+};
+template <typename LowLength, typename RightPadLength, bool SkipIsValidCheck = false>
+struct RightPad
+{
+    using LowerIndex = MultiIndex<1>;
+    using UpperIndex = MultiIndex<1>;
+    using UpLengths = decltype(make_tuple(LowLength{} + RightPadLength{}));
+    UpLengths up_lengths_;
+    LowLength low_length_;
+    RightPadLength right_pad_length_;
+    __host__ __device__ constexpr RightPad() = default;
+    __host__ __device__ constexpr RightPad(const LowLength& low_length,
+                                           const RightPadLength& right_pad_length)
+        : up_lengths_{make_tuple(low_length + right_pad_length)},
+          low_length_{low_length},
+          right_pad_length_{right_pad_length}
+    {
+    }
+    __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
+    __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; }
+    __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; }
+    template <typename LowIdx, typename UpIdx>
+    __host__ __device__ static constexpr void CalculateLowerIndex(LowIdx& idx_low,
+                                                                  const UpIdx& idx_up)
+    {
+        static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1,
+                      "wrong! inconsistent # of dimension");
+        idx_low(Number<0>{}) = idx_up[Number<0>{}];
+    }
+    template <typename LowIdxDiff,
+              typename UpIdxDiff,
+              typename LowIdx,
+              typename UpIdx,
+              index_t Hack>
+    __host__ __device__ static void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
+                                                     const UpIdxDiff& idx_diff_up,
+                                                     LowIdx& idx_low,
+                                                     const UpIdx&,
+                                                     Number<Hack>)
+    {
+        static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
+                          UpIdx::Size() == 1,
+                      "wrong! inconsistent # of dimension");
+        constexpr auto I0 = Number<0>{};
+        idx_diff_low(I0) = idx_diff_up[I0];
+        idx_low += idx_diff_low;
+    }
+    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }
+    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
+    {
+        return SkipIsValidCheck;
+    }
+    template <typename UpIdx>
+    __host__ __device__ constexpr bool
+    IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& idx_up) const
+    {
+        return SkipIsValidCheck || (idx_up[Number<0>{}] < low_length_);
+    }
+    __host__ __device__ static constexpr bool IsKnownAtCompileTime()
+    {
+        return is_known_at_compile_time<UpLengths>::value &&
+               is_known_at_compile_time<LowLength>::value &&
+               is_known_at_compile_time<RightPadLength>::value;
+    }
+    __host__ __device__ void Print() const
+    {
+        printf("{");
+        printf("RightPad, ");
+        printf("up_lengths_");
+        print_multi_index(up_lengths_);
+        printf("low_length_ %d", index_t{low_length_});
+        printf("left_pad_length_ %d", index_t{right_pad_length_});
+        printf("}");
+    }
+};
+// idx_low = coefficients[0, ...nDimUp-1] * idx_up[0, ...nDimUp-1]
+// UpLengths and Coefficients can be either of the followings:
+//   1) Tuple of index_t, which is known at run-time, or
+//   2) Tuple of Number, which is known at compile-time, or
+//   3) Tuple of mixture of index_t and Number, which is known partially at run-time and partially
+//   at compile-time
+template <typename UpLengths,
+          typename Coefficients,
+          typename enable_if<UpLengths::Size() == Coefficients::Size(), bool>::type = false>
+struct Embed
+{
+    static constexpr index_t NDimUp = UpLengths::Size();
+    using LowerIndex = MultiIndex<1>;
+    using UpperIndex = MultiIndex<NDimUp>;
+    UpLengths up_lengths_;
+    Coefficients coefficients_;
+    __host__ __device__ constexpr Embed() = default;
+    __host__ __device__ constexpr Embed(const UpLengths& up_lengths,
+                                        const Coefficients& coefficients)
+        : up_lengths_{up_lengths}, coefficients_{coefficients}
+    {
+    }
+    __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
+    __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return NDimUp; }
+    __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; }
+    template <typename LowIdx, typename UpIdx>
+    __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
+                                                           const UpIdx& idx_up) const
+    {
+        static_assert(LowIdx::Size() == 1 && UpIdx::Size() == NDimUp,
+                      "wrong! inconsistent # of dimension");
+        idx_low(Number<0>{}) = 0;
+        static_for<0, NDimUp, 1>{}([&idx_low, &idx_up, this](auto i) {
+            idx_low(Number<0>{}) += idx_up[i] * this->coefficients_[i];
+        });
+    }
+    template <typename LowIdxDiff,
+              typename UpIdxDiff,
+              typename LowIdx,
+              typename UpIdx,
+              index_t Hack>
+    __host__ __device__ void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
+                                              const UpIdxDiff& idx_diff_up,
+                                              LowIdx& idx_low,
+                                              const UpIdx&,
+                                              Number<Hack>) const
+    {
+        static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == NDimUp &&
+                          LowIdx::Size() == 1 && UpIdx::Size() == NDimUp,
+                      "wrong! inconsistent # of dimension");
+        idx_diff_low(Number<0>{}) = 0;
+        static_for<0, NDimUp, 1>{}(
+            [&](auto i) { idx_diff_low(Number<0>{}) += idx_diff_up[i] * coefficients_[i]; });
+        idx_low += idx_diff_low;
+    }
+    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }
+    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
+    {
+        return true;
+    }
+    template <typename UpIdx>
+    __host__ __device__ static constexpr bool
+    IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */)
+    {
+        return true;
+    }
+    __host__ __device__ static constexpr bool IsKnownAtCompileTime()
+    {
+        return is_known_at_compile_time<UpLengths>::value &&
+               is_known_at_compile_time<Coefficients>::value;
+    }
+    __host__ __device__ void Print() const
+    {
+        printf("{");
+        printf("Embed, ");
+        printf("up_lengths_ ");
+        print_multi_index(up_lengths_);
+        printf("coefficients_ ");
+        print_multi_index(coefficients_);
+        printf("}");
+    }
+};
+// Implementation of "Merge" transformation primitive that uses regular to do lowering of
+// multi-index and use carry-and-borrow check to do lowering of multi-index delta
+template <typename LowLengths>
+struct Merge_v1_carry_check
+{
+    static constexpr index_t NDimLow = LowLengths::Size();
+    using LowerIndex = MultiIndex<NDimLow>;
+    using UpperIndex = MultiIndex<1>;
+    using LowLengthsScan =
+        decltype(container_reverse_exclusive_scan(LowLengths{}, math::multiplies{}, Number<1>{}));
+    using UpLengths =
+        decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies{}, Number<1>{})));
+    LowLengths low_lengths_;
+    LowLengthsScan low_lengths_scan_;
+    UpLengths up_lengths_;
+    __host__ __device__ constexpr Merge_v1_carry_check() = default;
+    __host__ __device__ constexpr Merge_v1_carry_check(const LowLengths& low_lengths)
+        : low_lengths_{low_lengths},
+          low_lengths_scan_{
+              container_reverse_exclusive_scan(low_lengths, math::multiplies{}, Number<1>{})},
+          up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies{}, Number<1>{}))}
+    {
+        static_assert(LowerIndex::Size() == NDimLow, "wrong!");
+    }
+    __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return NDimLow; }
+    __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; }
+    __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; }
+    template <typename LowIdx, typename UpIdx>
+    __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
+                                                           const UpIdx& idx_up) const
+    {
+        static_assert(LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
+                      "wrong! inconsistent # of dimension");
+        index_t tmp = idx_up[Number<0>{}];
+        // normal division
+        static_for<0, NDimLow - 1, 1>{}([&](auto i) {
+            idx_low(i) = tmp / this->low_lengths_scan_[i];
+            tmp -= idx_low[i] * this->low_lengths_scan_[i];
+        });
+        idx_low(Number<NDimLow - 1>{}) = tmp;
+    }
+    template <typename LowIdxDiff,
+              typename UpIdxDiff,
+              typename LowIdx,
+              typename UpIdx,
+              index_t Hack>
+    __host__ __device__ void UpdateLowerIndex_1a(LowIdxDiff& idx_diff_low,
+                                                 const UpIdxDiff& idx_diff_up,
+                                                 LowIdx& idx_low,
+                                                 const UpIdx& /* idx_up_new */,
+                                                 Number<Hack>) const
+    {
+        static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
+                          LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
+                      "wrong! inconsistent # of dimension");
+        // CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions.
+        // However,
+        //   1) If idx_diff_up is known at compile-time, then idx_diff_low_const
+        //   can be calculated at compile-time.
+        //   2) If idx_diff_up is not known at compile-time, but its value
+        //   doesn't change during the whole kernel execution, then
+        //   idx_diff_low_const also
+        //   doesn't change during the whole kernel execution. Compiler generated
+        //   ISA should
+        //   only caclculate idx_diff_low_const once and save it durinng the whole
+        //   kernel execution
+        // If neither 1) nor 2) is satisfied, then the calculation will also be
+        // computed at
+        //   run-time each time this function is called, and can be very expensive.
+        LowerIndex idx_diff_low_const;
+        LowerIndex idx_low_length_minus_idx_diff_low_const;
+        LowerIndex idx_low_length_plus_idx_diff_low_const;
+#if !CK_HACK_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
+        index_t tmp = idx_diff_up[Number<0>{}];
+        static_for<0, NDimLow - 1, 1>{}([&](auto i) {
+            idx_diff_low_const(i) = tmp / low_lengths_scan_[i];
+            tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
+        });
+        idx_diff_low_const(Number<NDimLow - 1>{}) = tmp;
+        static_for<0, NDimLow, 1>{}([&](auto i) {
+            idx_low_length_minus_idx_diff_low_const(i) = low_lengths_[i] - idx_diff_low_const[i];
+            idx_low_length_plus_idx_diff_low_const(i) = low_lengths_[i] + idx_diff_low_const[i];
+        });
+#else
+        // Hack: this force result into SGPR. Need to make sure the result is thread invariant
+        index_t tmp = idx_diff_up[Number<0>{}];
+        static_for<0, NDimLow - 1, 1>{}([&](auto i) {
+            idx_diff_low_const(i) = __builtin_amdgcn_readfirstlane(tmp / low_lengths_scan_[i]);
+            tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
+        });
+        idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);
+        static_for<0, NDimLow, 1>{}([&](auto i) {
+            idx_low_length_minus_idx_diff_low_const(i) =
+                __builtin_amdgcn_readfirstlane(low_lengths_[i] - idx_diff_low_const[i]);
+            idx_low_length_plus_idx_diff_low_const(i) =
+                __builtin_amdgcn_readfirstlane(low_lengths_[i] + idx_diff_low_const[i]);
+        });
+#endif
+        if constexpr(Hack == 1)
+        {
+            // do carry check on each low dimension in reversed order
+            // do not need to check the first dimension
+            index_t carry = 0;
+            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
+                index_t idx_low_tmp = idx_low[i] + carry;
+                bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
+                idx_diff_low(i) =
+                    do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
+                idx_diff_low(i) += carry;
+                carry = do_carry ? 1 : 0;
+            });
+            idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
+            idx_low += idx_diff_low;
+        }
+        else if constexpr(Hack == 2)
+        {
+            // do carry check on each low dimension in reversed order
+            // do not need to check the first dimension
+            index_t borrow = 0;
+            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
+                index_t idx_low_tmp = idx_low[i] - borrow;
+                bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
+                idx_diff_low(i) =
+                    do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low_const[i];
+                idx_diff_low(i) -= borrow;
+                borrow = do_borrow ? 1 : 0;
+            });
+            idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] - borrow;
+            idx_low += idx_diff_low;
+        }
+        else
+        {
+            // do carry check on each low dimension in reversed order
+            // do not need to check the first dimension
+            index_t carry = 0;
+            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
+                index_t idx_low_tmp = idx_low[i] + carry;
+                bool do_carry  = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
+                bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
+                idx_diff_low(i) =
+                    do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
+                idx_diff_low(i) =
+                    do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low[i];
+                idx_diff_low(i) += carry;
+                carry = do_carry ? 1 : 0;
+                carry = do_borrow ? -1 : carry;
+            });
+            idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
+            idx_low += idx_diff_low;
+        }
+    }
+    template <typename LowIdxDiff,
+              typename UpIdxDiff,
+              typename LowIdx,
+              typename UpIdx,
+              index_t Hack>
+    __host__ __device__ void UpdateLowerIndex_1b(LowIdxDiff& idx_diff_low,
+                                                 const UpIdxDiff& idx_diff_up,
+                                                 LowIdx& idx_low,
+                                                 const UpIdx& /* idx_up_new */,
+                                                 Number<Hack>) const
+    {
+        static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
+                          LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
+                      "wrong! inconsistent # of dimension");
+        // CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions.
+        // However,
+        //   1) If idx_diff_up is known at compile-time, then idx_diff_low_const
+        //   can be calculated at compile-time.
+        //   2) If idx_diff_up is not known at compile-time, but its value
+        //   doesn't change during the whole kernel execution, then
+        //   idx_diff_low_const also
+        //   doesn't change during the whole kernel execution. Compiler generated
+        //   ISA should
+        //   only caclculate idx_diff_low_const once and save it durinng the whole
+        //   kernel execution
+        // If neither 1) nor 2) is satisfied, then the calculation will also be
+        // computed at
+        //   run-time each time this function is called, and can be very expensive.
+        LowerIndex idx_diff_low_const;
+        LowerIndex idx_low_length_minus_idx_diff_low_const;
+        LowerIndex idx_low_length_plus_idx_diff_low_const;
+#if !CK_HACK_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
+        index_t tmp = idx_diff_up[Number<0>{}];
+        static_for<0, NDimLow - 1, 1>{}([&](auto i) {
+            idx_diff_low_const(i) = tmp / low_lengths_scan_[i];
+            tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
+        });
+        idx_diff_low_const(Number<NDimLow - 1>{}) = tmp;
+        static_for<0, NDimLow, 1>{}([&](auto i) {
+            idx_low_length_minus_idx_diff_low_const(i) = low_lengths_[i] - idx_diff_low_const[i];
+            idx_low_length_plus_idx_diff_low_const(i) = low_lengths_[i] + idx_diff_low_const[i];
+        });
+#else
+        // Hack: this force result into SGPR. Need to make sure the result is thread invariant
+        index_t tmp = idx_diff_up[Number<0>{}];
+        static_for<0, NDimLow - 1, 1>{}([&](auto i) {
+            idx_diff_low_const(i) = __builtin_amdgcn_readfirstlane(tmp / low_lengths_scan_[i]);
+            tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
+        });
+        idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);
+        static_for<0, NDimLow, 1>{}([&](auto i) {
+            idx_low_length_minus_idx_diff_low_const(i) =
+                __builtin_amdgcn_readfirstlane(low_lengths_[i] - idx_diff_low_const[i]);
+            idx_low_length_plus_idx_diff_low_const(i) = low_lengths_[i] + idx_diff_low_const[i];
+        });
+#endif
+        if constexpr(Hack == 1)
+        {
+            // do carry check on each low dimension in reversed order
+            // do not need to check the first dimension
+            index_t carry = 0;
+            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
+                index_t idx_low_tmp = idx_low[i] + carry;
+                bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
+                idx_diff_low(i) =
+                    do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
+                idx_diff_low(i) += carry;
+                carry = do_carry ? 1 : 0;
+            });
+            idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
+            idx_low += idx_diff_low;
+        }
+        else if constexpr(Hack == 2)
+        {
+            // do carry check on each low dimension in reversed order
+            // do not need to check the first dimension
+            index_t borrow = 0;
+            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
+                index_t negative_idx_low_tmp = borrow - idx_low[i];
+                bool do_borrow = negative_idx_low_tmp > idx_diff_low_const[i];
+                idx_diff_low(i) =
+                    do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low_const[i];
+                idx_diff_low(i) -= borrow;
+                borrow = do_borrow ? 1 : 0;
+            });
+            idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] - borrow;
+            idx_low += idx_diff_low;
+        }
+        else
+        {
+            // do carry check on each low dimension in reversed order
+            // do not need to check the first dimension
+            index_t carry = 0;
+            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
+                index_t idx_low_tmp = idx_low[i] + carry;
+                bool do_carry  = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
+                bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
+                idx_diff_low(i) =
+                    do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
+                idx_diff_low(i) =
+                    do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low[i];
+                idx_diff_low(i) += carry;
+                carry = do_carry ? 1 : 0;
+                carry = do_borrow ? -1 : carry;
+            });
+            idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
+            idx_low += idx_diff_low;
+        }
+    }
+    template <typename LowIdxDiff,
+              typename UpIdxDiff,
+              typename LowIdx,
+              typename UpIdx,
+              index_t Hack>
+    __host__ __device__ void UpdateLowerIndex_2(LowIdxDiff& idx_diff_low,
+                                                const UpIdxDiff& idx_diff_up,
+                                                LowIdx& idx_low,
+                                                const UpIdx& /* idx_up_new */,
+                                                Number<Hack>) const
+    {
+        static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
+                          LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
+                      "wrong! inconsistent # of dimension");
+        // CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions.
+        // However,
+        //   1) If idx_diff_up is known at compile-time, then idx_diff_low_const
+        //   can be calculated at compile-time.
+        //   2) If idx_diff_up is not known at compile-time, but its value
+        //   doesn't change during the whole kernel execution, then
+        //   idx_diff_low_const also
+        //   doesn't change during the whole kernel execution. Compiler generated
+        //   ISA should
+        //   only caclculate idx_diff_low_const once and save it durinng the whole
+        //   kernel execution
+        // If neither 1) nor 2) is satisfied, then the calculation will also be
+        //   computed at run-time each time this function is called, and can be
+        //   very expensive.
+        LowerIndex idx_diff_low_const;
+#if !CK_HACK_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
+        index_t tmp = idx_diff_up[Number<0>{}];
+        static_for<0, NDimLow - 1, 1>{}([&](auto i) {
+            idx_diff_low_const(i) = tmp / low_lengths_scan_[i];
+            tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
+        });
+        idx_diff_low_const(Number<NDimLow - 1>{}) = tmp;
+#else
+        // Hack: this force result into SGPR. Need to make sure the result is thread invariant
+        index_t tmp = idx_diff_up[Number<0>{}];
+        static_for<0, NDimLow - 1, 1>{}([&](auto i) {
+            idx_diff_low_const(i) = __builtin_amdgcn_readfirstlane(tmp / low_lengths_scan_[i]);
+            tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
+        });
+        idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);
+#endif
+        if constexpr(Hack == 1)
+        {
+            // do carry check on each low dimension in reversed order
+            // do not need to check the first dimension
+            bool do_carry = 0;
+            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
+                idx_diff_low(i) = idx_diff_low_const[i] + do_carry;
+                index_t idx_low_tmp = idx_low[i] + idx_diff_low[i];
+                do_carry = idx_low_tmp >= low_lengths_[i];
+#if 0
+                // TODO: use exec-mask inline asm, which use 1 VALU
+                if(do_carry)
+                {
+                    idx_diff_low(i) -= low_lengths_[i];
+                }
+#elif 1
+                // this use 2 VALU
+                idx_diff_low(i) = do_carry ? idx_diff_low[i] - low_lengths_[i] : idx_diff_low[i];
+#elif 1
+                // this use 2 VALU
+                index_t idx_diff_low_tmp = idx_diff_low[i] - low_lengths_[i];
+                idx_diff_low(i)          = do_carry ? idx_diff_low_tmp : idx_diff_low[i];
+#endif
+                idx_low(i) += idx_diff_low[i];
+            });
+            constexpr auto I0 = Number<0>{};
+            idx_diff_low(I0) = idx_diff_low_const[I0] + do_carry;
+            idx_low(I0) += idx_diff_low[I0];
+        }
+        else if constexpr(Hack == 2)
+        {
+            // do borrow check on each low dimension in reversed order
+            // do not need to check the first dimension
+            bool do_borrow = 0;
+            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
+                idx_diff_low(i) = idx_diff_low_const[i] - do_borrow;
+                index_t idx_low_tmp = idx_low[i] + idx_diff_low[i];
+                do_borrow = idx_low_tmp < 0;
+#if 0
+                // TODO: use exec-mask inline asm
+                if(do_borrow)
+                {
+                    idx_diff_low(i) += low_lengths_[i];
+                }
+#elif 1
+                idx_diff_low(i) = do_borrow ? idx_diff_low[i] + low_lengths_[i] : idx_diff_low[i];
+#elif 1
+                index_t idx_diff_low_tmp = idx_diff_low[i] + low_lengths_[i];
+                idx_diff_low(i)          = do_borrow ? idx_diff_low_tmp : idx_diff_low[i];
+#endif
+                idx_low(i) += idx_diff_low[i];
+            });
+            constexpr auto I0 = Number<0>{};
+            idx_diff_low(I0) = idx_diff_low_const[I0] - do_borrow;
+            idx_low(I0) += idx_diff_low[I0];
+        }
+        else
+        {
+            // not implemented
+        }
+    }
+    template <typename LowIdxDiff,
+              typename UpIdxDiff,
+              typename LowIdx,
+              typename UpIdx,
+              index_t Hack>
+    __host__ __device__ void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
+                                              const UpIdxDiff& idx_diff_up,
+                                              LowIdx& idx_low,
+                                              const UpIdx& idx_up_new,
+                                              Number<Hack>) const
+    {
+#if 1
+        UpdateLowerIndex_1a(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number<Hack>{});
+#elif 0
+        UpdateLowerIndex_1b(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number<Hack>{});
+#else
+        UpdateLowerIndex_2(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number<Hack>{});
+#endif
+    }
+    __host__ __device__ static constexpr bool IsLinearTransform() { return false; }
+    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
+    {
+        return true;
+    }
+    __host__ __device__ static constexpr bool IsKnownAtCompileTime()
+    {
+        return is_known_at_compile_time<LowLengths>::value &&
+               is_known_at_compile_time<LowLengthsScan>::value &&
+               is_known_at_compile_time<UpLengths>::value;
+    }
+    template <typename UpIdx>
+    __host__ __device__ static constexpr bool
+    IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */)
+    {
+        return true;
+    }
+    __host__ __device__ void Print() const
+    {
+        printf("{");
+        printf("Merge_v1_carry_check, ");
+        printf("low_lengths_ ");
+        print_multi_index(low_lengths_);
+        printf("low_lengths_scan_ ");
+        print_multi_index(low_lengths_scan_);
+        printf("up_lengths_ ");
+        print_multi_index(up_lengths_);
+        printf("}");
+    }
+};
+template <typename LowLengths>
+struct lambda_merge_generate_MagicDivision_calculate_magic_multiplier
+{
+    template <index_t I>
+    __host__ __device__ constexpr auto operator()(Number<I> i) const
+    {
+        return MagicDivision::CalculateMagicMultiplier(LowLengths{}[i]);
+    }
+};
+template <typename LowLengths>
+struct lambda_merge_generate_MagicDivision_calculate_magic_shift
+{
+    template <index_t I>
+    __host__ __device__ constexpr auto operator()(Number<I> i) const
+    {
+        return MagicDivision::CalculateMagicShift(LowLengths{}[i]);
+    }
+};
+// Implementation of "Merge" transformation primitive that uses magic-number-division to do lowering
+// of both multi-index and delta of multi-index
+// Caution:
+//   1. The magic number division implementation being used would produce correct result if the
+//   dividended is uint32_t and its value is with in 31-bit value range of uint32_t.
+//   2. The magic number division for int32_t dividened has not been implemented, the int32_t
+//   dividend would be bit-wise interpreted as uint32_t and magic number division implementation for
+//   uint32_t is then used.
+//   3. For Merge primitive, upper-index is the dividend.
+//   4. When upper-index is uint32_t, its value need to be within 31-bit range.
+//   5. When upper-index is int32_t type (when index_t is int32_t), its value need to be
+//   non-negative.
+template <typename LowLengths>
+struct Merge_v2_magic_division
+{
+    static constexpr index_t NDimLow = LowLengths::Size();
+    using LowerIndex = MultiIndex<NDimLow>;
+    using UpperIndex = MultiIndex<1>;
+    using UpLengths =
+        decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies{}, Number<1>{})));
+    using LowLengthsMagicDivisorMultipiler = decltype(
+        generate_tuple(lambda_merge_generate_MagicDivision_calculate_magic_multiplier<LowLengths>{},
+                       Number<NDimLow>{}));
+    using LowLengthsMagicDivisorShift = decltype(
+        generate_tuple(lambda_merge_generate_MagicDivision_calculate_magic_shift<LowLengths>{},
+                       Number<NDimLow>{}));
+    LowLengths low_lengths_;
+    LowLengthsMagicDivisorMultipiler low_lengths_magic_divisor_multiplier_;
+    LowLengthsMagicDivisorShift low_lengths_magic_divisor_shift_;
+    UpLengths up_lengths_;
+    __host__ __device__ constexpr Merge_v2_magic_division() = default;
+    __host__ __device__ constexpr Merge_v2_magic_division(const LowLengths& low_lengths)
+        : low_lengths_{low_lengths},
+          low_lengths_magic_divisor_multiplier_{generate_tuple(
+              [&](auto i) { return MagicDivision::CalculateMagicMultiplier(low_lengths[i]); },
+              Number<NDimLow>{})},
+          low_lengths_magic_divisor_shift_{generate_tuple(
+              [&](auto i) { return MagicDivision::CalculateMagicShift(low_lengths[i]); },
+              Number<NDimLow>{})},
+          up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies{}, Number<1>{}))}
+    {
+        static_assert(LowerIndex::Size() == NDimLow, "wrong!");
+    }
+    __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return NDimLow; }
+    __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; }
+    __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; }
+    template <typename LowIdx, typename UpIdx>
+    __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
+                                                           const UpIdx& idx_up) const
+    {
+        static_assert(LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
+                      "wrong! inconsistent # of dimension");
+        index_t tmp = idx_up[Number<0>{}];
+        static_for<NDimLow - 1, 0, -1>{}([&, this](auto i) {
+            index_t tmp2 =
+                MagicDivision::DoMagicDivision(tmp,
+                                               this->low_lengths_magic_divisor_multiplier_[i],
+                                               this->low_lengths_magic_divisor_shift_[i]);
+            idx_low(i) = tmp - tmp2 * this->low_lengths_[i];
+            tmp        = tmp2;
+        });
+        idx_low(Number<0>{}) = tmp;
+    }
+    template <typename LowIdxDiff,
+              typename UpIdxDiff,
+              typename LowIdx,
+              typename UpIdx,
+              index_t Hack>
+    __host__ __device__ void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
+                                              const UpIdxDiff&,
+                                              LowIdx& idx_low,
+                                              const UpIdx& idx_up_new,
+                                              Number<Hack>) const
+    {
+        static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
+                          LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
+                      "wrong! inconsistent # of dimension");
+        index_t tmp = idx_up_new[Number<0>{}];
+        static_for<NDimLow - 1, 0, -1>{}([&, this](auto i) {
+            index_t tmp2 =
+                MagicDivision::DoMagicDivision(tmp,
+                                               this->low_lengths_magic_divisor_multiplier_[i],
+                                               this->low_lengths_magic_divisor_shift_[i]);
+            index_t idx_low_old = idx_low[i];
+            idx_low(i) = tmp - tmp2 * this->low_lengths_[i];
+            tmp        = tmp2;
+            idx_diff_low(i) = idx_low[i] - idx_low_old;
+        });
+        idx_diff_low(Number<0>{}) = tmp - idx_low(Number<0>{});
+        idx_low(Number<0>{}) = tmp;
+    }
+    __host__ __device__ static constexpr bool IsLinearTransform() { return false; }
+    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
+    {
+        return true;
+    }
+    __host__ __device__ static constexpr bool IsKnownAtCompileTime()
+    {
+        return is_known_at_compile_time<LowLengths>::value &&
+               is_known_at_compile_time<LowLengthsMagicDivisorMultipiler>::value &&
+               is_known_at_compile_time<LowLengthsMagicDivisorShift>::value &&
+               is_known_at_compile_time<UpLengths>::value;
+    }
+    template <typename UpIdx>
+    __host__ __device__ static constexpr bool
+    IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */)
+    {
+        return true;
+    }
+    __host__ __device__ void Print() const
+    {
+        printf("{");
+        printf("Merge_v2_magic_division, ");
+        printf("low_lengths_ ");
+        print_multi_index(low_lengths_);
+        printf("low_lengths_magic_divisor_multiplier_ ");
+        print_multi_index(low_lengths_magic_divisor_multiplier_);
+        printf("low_lengths_magic_divisor_shift_ ");
+        print_multi_index(low_lengths_magic_divisor_shift_);
+        printf("up_lengths_ ");
+        print_multi_index(up_lengths_);
+        printf("}");
+    }
+};
+// Implementation of "Merge" transformation primitive that uses magic-number-division to do lowering
+// of both multi-index and delta of multi-index
+// Caution:
+//   1. The magic number division implementation being used would produce correct result if the
+//   dividended is uint32_t and its value is with in 31-bit value range of uint32_t.
+//   2. The magic number division for int32_t dividened has not been implemented, the int32_t
+//   dividend would be bit-wise interpreted as uint32_t and magic number division implementation for
+//   uint32_t is then used.
+//   3. For Merge primitive, upper-index is the dividend.
+//   4. When upper-index is uint32_t, its value need to be within 31-bit range.
+//   5. When upper-index is int32_t type (when index_t is int32_t), its value need to be
+//   non-negative.
+template <typename LowLengths>
+struct Merge_v2r2_magic_division
+{
+    static constexpr index_t NDimLow = LowLengths::Size();
+    using LowerIndex = MultiIndex<NDimLow>;
+    using UpperIndex = MultiIndex<1>;
+    using LowLengthsScan =
+        decltype(container_reverse_exclusive_scan(LowLengths{}, math::multiplies{}, Number<1>{}));
+    using UpLengths =
+        decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies{}, Number<1>{})));
+    using LowLengthsScanMagicDivisorMultipiler = decltype(generate_tuple(
+        lambda_merge_generate_MagicDivision_calculate_magic_multiplier<LowLengthsScan>{},
+        Number<NDimLow>{}));
+    using LowLengthsScanMagicDivisorShift = decltype(
+        generate_tuple(lambda_merge_generate_MagicDivision_calculate_magic_shift<LowLengthsScan>{},
+                       Number<NDimLow>{}));
+    LowLengths low_lengths_;
+    LowLengthsScan low_lengths_scan_;
+    LowLengthsScanMagicDivisorMultipiler low_lengths_scan_magic_divisor_multiplier_;
+    LowLengthsScanMagicDivisorShift low_lengths_scan_magic_divisor_shift_;
+    UpLengths up_lengths_;
+    __host__ __device__ constexpr Merge_v2r2_magic_division() = default;
+    __host__ __device__ constexpr Merge_v2r2_magic_division(const LowLengths& low_lengths)
+        : low_lengths_{low_lengths},
+          low_lengths_scan_{
+              container_reverse_exclusive_scan(low_lengths, math::multiplies{}, Number<1>{})},
+          low_lengths_scan_magic_divisor_multiplier_{generate_tuple(
+              [&](auto i) { return MagicDivision::CalculateMagicMultiplier(low_lengths_scan_[i]); },
+              Number<NDimLow>{})},
+          low_lengths_scan_magic_divisor_shift_{generate_tuple(
+              [&](auto i) { return MagicDivision::CalculateMagicShift(low_lengths_scan_[i]); },
+              Number<NDimLow>{})},
+          up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies{}, Number<1>{}))}
+    {
+        static_assert(LowerIndex::Size() == NDimLow, "wrong!");
+    }
+    __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return NDimLow; }
+    __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; }
+    __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; }
+    template <typename LowIdx, typename UpIdx>
+    __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
+                                                           const UpIdx& idx_up) const
+    {
+        static_assert(LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
+                      "wrong! inconsistent # of dimension");
+        index_t tmp = idx_up[Number<0>{}];
+        static_for<0, NDimLow - 1, 1>{}([&, this](auto i) {
+            idx_low(i) =
+                MagicDivision::DoMagicDivision(tmp,
+                                               this->low_lengths_scan_magic_divisor_multiplier_[i],
+                                               this->low_lengths_scan_magic_divisor_shift_[i]);
+            tmp -= idx_low[i] * this->low_lengths_scan_[i];
+        });
+        idx_low(Number<NDimLow - 1>{}) = tmp;
+    }
+    template <typename LowIdxDiff,
+              typename UpIdxDiff,
+              typename LowIdx,
+              typename UpIdx,
+              index_t Hack>
+    __host__ __device__ void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
+                                              const UpIdxDiff&,
+                                              LowIdx& idx_low,
+                                              const UpIdx& idx_up_new,
+                                              Number<Hack>) const
+    {
+        static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
+                          LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
+                      "wrong! inconsistent # of dimension");
+        index_t tmp = idx_up_new[Number<0>{}];
+        static_for<0, NDimLow - 1, 1>{}([&, this](auto i) {
+            index_t idx_low_old = idx_low[i];
+            idx_low(i) =
+                MagicDivision::DoMagicDivision(tmp,
+                                               this->low_lengths_scan_magic_divisor_multiplier_[i],
+                                               this->low_lengths_scan_magic_divisor_shift_[i]);
+            idx_diff_low(i) = idx_low[i] - idx_low_old;
+            tmp -= idx_low[i] * this->low_lengths_scan_[i];
+        });
+        idx_diff_low(Number<NDimLow - 1>{}) = tmp - idx_low[Number<NDimLow - 1>{}];
+        idx_low(Number<NDimLow - 1>{}) = tmp;
+    }
+    __host__ __device__ static constexpr bool IsLinearTransform() { return false; }
+    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
+    {
+        return true;
+    }
+    __host__ __device__ static constexpr bool IsKnownAtCompileTime()
+    {
+        return is_known_at_compile_time<LowLengths>::value &&
+               is_known_at_compile_time<LowLengthsScanMagicDivisorMultipiler>::value &&
+               is_known_at_compile_time<LowLengthsScanMagicDivisorShift>::value &&
+               is_known_at_compile_time<UpLengths>::value;
+    }
+    template <typename UpIdx>
+    __host__ __device__ static constexpr bool
+    IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */)
+    {
+        return true;
+    }
+    __host__ __device__ void Print() const
+    {
+        printf("{");
+        printf("Merge_v2r2_magic_division, ");
+        printf("low_lengths_ ");
+        print_multi_index(low_lengths_);
+        printf("low_lengths_scan ");
+        print_multi_index(low_lengths_scan_);
+        printf("low_lengths_scan_magic_divisor_multiplier_ ");
+        print_multi_index(low_lengths_scan_magic_divisor_multiplier_);
+        printf("low_lengths_scan_magic_divisor_shift_ ");
+        print_multi_index(low_lengths_scan_magic_divisor_shift_);
+        printf("up_lengths_ ");
+        print_multi_index(up_lengths_);
+        printf("}");
+    }
+};
+template <typename UpLengths, bool Use24BitIntegerCalculation>
+struct UnMerge
+{
+    static constexpr index_t NDimUp = UpLengths::Size();
+    using LowerIndex = MultiIndex<1>;
+    using UpperIndex = MultiIndex<NDimUp>;
+    using UpLengthsScan =
+        decltype(container_reverse_exclusive_scan(UpLengths{}, math::multiplies{}, Number<1>{}));
+    UpLengths up_lengths_;
+    UpLengthsScan up_lengths_scan_;
+    __host__ __device__ constexpr UnMerge() = default;
+    __host__ __device__ constexpr UnMerge(const UpLengths& up_lengths)
+        : up_lengths_{up_lengths},
+          up_lengths_scan_{
+              container_reverse_exclusive_scan(up_lengths, math::multiplies{}, Number<1>{})}
+    {
+    }
+    __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
+    __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return NDimUp; }
+    __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; }
+    template <typename LowIdx, typename UpIdx>
+    __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
+                                                           const UpIdx& idx_up) const
+    {
+        if constexpr(!Use24BitIntegerCalculation)
+        {
+            idx_low(Number<0>{}) = idx_up[Number<NDimUp - 1>{}];
+            static_for<0, NDimUp - 1, 1>{}(
+                [&](auto i) { idx_low(Number<0>{}) += idx_up[i] * up_lengths_scan_[i]; });
+        }
+        else
+        {
+            idx_low(Number<0>{}) = idx_up[Number<NDimUp - 1>{}];
+            static_for<0, NDimUp - 1, 1>{}([&](auto i) {
+                idx_low(Number<0>{}) =
+                    (0x00ffffff & idx_low[Number<0>{}]) +
+                    (0x00ffffff & idx_up[i]) * (0x00ffffff & up_lengths_scan_[i]);
+            });
+        }
+    }
+    template <typename LowIdxDiff,
+              typename UpIdxDiff,
+              typename LowIdx,
+              typename UpIdx,
+              index_t Hack>
+    __host__ __device__ void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
+                                              const UpIdxDiff& idx_diff_up,
+                                              LowIdx& idx_low,
+                                              const UpIdx&,
+                                              Number<Hack>) const
+    {
+        CalculateLowerIndex(idx_diff_low, idx_diff_up);
+        idx_low += idx_diff_low;
+    }
+    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }
+    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
+    {
+        return true;
+    }
+    template <typename UpIdx>
+    __host__ __device__ static constexpr bool
+    IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */)
+    {
+        return true;
+    }
+    __host__ __device__ static constexpr bool IsKnownAtCompileTime()
+    {
+        return is_known_at_compile_time<UpLengths>::value &&
+               is_known_at_compile_time<UpLengthsScan>::value;
+    }
+    __host__ __device__ void Print() const
+    {
+        printf("{");
+        printf("UnMerge, ");
+        printf("up_lengths_");
+        print_multi_index(up_lengths_);
+        printf("up_lengths_scan_");
+        print_multi_index(up_lengths_scan_);
+        printf("}");
+    }
+};
+template <typename LowerIndex>
+struct Freeze
+{
+    LowerIndex low_idx_;
+    __host__ __device__ constexpr Freeze() = default;
+    __host__ __device__ constexpr Freeze(const LowerIndex& low_idx) : low_idx_{low_idx} {}
+    __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
+    __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 0; }
+    __host__ __device__ static constexpr auto GetUpperLengths() { return Tuple<>{}; }
+    template <typename LowIdx, typename UpIdx>
+    __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
+                                                           const UpIdx& /* idx_up */) const
+    {
+        static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 0,
+                      "wrong! inconsistent # of dimension");
+        idx_low(Number<0>{}) = low_idx_;
+    }
+    template <typename LowIdxDiff,
+              typename UpIdxDiff,
+              typename LowIdx,
+              typename UpIdx,
+              index_t Hack>
+    __host__ __device__ static void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
+                                                     const UpIdxDiff& /* idx_diff_up */,
+                                                     LowIdx& /* idx_low */,
+                                                     const UpIdx& /* idx_up_new */,
+                                                     Number<Hack>)
+    {
+        idx_diff_low(Number<0>{}) = 0;
+    }
+    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }
+    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
+    {
+        return true;
+    }
+    template <typename UpIdx>
+    __host__ __device__ static constexpr bool
+    IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */)
+    {
+        return true;
+    }
+    __host__ __device__ static constexpr bool IsKnownAtCompileTime()
+    {
+        return is_known_at_compile_time<LowerIndex>::value;
+    }
+    __host__ __device__ void Print() const
+    {
+        printf("Freeze");
+        printf("low_idx_ %d", index_t{low_idx_});
+    }
+};
+// Insert a dangling upper dimension without lower dimension
+template <typename UpperLength>
+struct Insert
+{
+    using UpLengths = decltype(make_tuple(UpperLength{}));
+    UpLengths up_lengths_;
+    __host__ __device__ constexpr Insert() = default;
+    __host__ __device__ constexpr Insert(const UpperLength& up_length)
+        : up_lengths_{make_tuple(up_length)}
+    {
+    }
+    __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 0; }
+    __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; }
+    __host__ __device__ constexpr auto GetUpperLengths() const { return up_lengths_; }
+    template <typename LowIdx, typename UpIdx>
+    __host__ __device__ constexpr void CalculateLowerIndex(LowIdx&, const UpIdx&) const
+    {
+        static_assert(LowIdx::Size() == 0 && UpIdx::Size() == 1,
+                      "wrong! inconsistent # of dimension");
+    }
+    template <typename LowIdxDiff,
+              typename UpIdxDiff,
+              typename LowIdx,
+              typename UpIdx,
+              index_t Hack>
+    __host__ __device__ static void
+    UpdateLowerIndex(LowIdxDiff&, const UpIdxDiff&, LowIdx&, const UpIdx&, Number<Hack>)
+    {
+        static_assert(LowIdxDiff::Size() == 0 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 0 &&
+                          UpIdx::Size() == 1,
+                      "wrong! inconsistent # of dimension");
+    }
+    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }
+    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
+    {
+        return true;
+    }
+    template <typename UpIdx>
+    __host__ __device__ static constexpr bool
+    IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */)
+    {
+        return true;
+    }
+    __host__ __device__ static constexpr bool IsKnownAtCompileTime()
+    {
+        return is_known_at_compile_time<UpperLength>::value;
+    }
+    __host__ __device__ void Print() const
+    {
+        printf("Insert");
+        print_multi_index(up_lengths_);
+    }
+};
+template <typename VectorSize, typename UpLength>
+struct Vectorize
+{
+    using LowerIndex = MultiIndex<1>;
+    using UpperIndex = MultiIndex<1>;
+    using UpLengths = decltype(make_tuple(UpLength{}));
+    UpLengths up_lengths_;
+    VectorSize vector_size_;
+    __host__ __device__ constexpr Vectorize() = default;
+    __host__ __device__ constexpr Vectorize(const VectorSize& vector_size,
+                                            const UpLength& up_length)
+        : vector_size_{vector_size}, up_lengths_{make_tuple(up_length)}
+    {
+    }
+    __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
+    __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; }
+    __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; }
+    template <typename LowIdx, typename UpIdx>
+    __host__ __device__ void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up) const
+    {
+        static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1,
+                      "wrong! inconsistent # of dimension");
+        idx_low(Number<0>{}) = vector_size_ * idx_up[Number<0>{}];
+    }
+    template <typename LowIdxDiff,
+              typename UpIdxDiff,
+              typename LowIdx,
+              typename UpIdx,
+              index_t Hack>
+    __host__ __device__ void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
+                                              const UpIdxDiff& idx_diff_up,
+                                              LowIdx& idx_low,
+                                              const UpIdx&,
+                                              Number<Hack>) const
+    {
+        static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
+                          UpIdx::Size() == 1,
+                      "wrong! inconsistent # of dimension");
+        constexpr auto I0 = Number<0>{};
+        idx_diff_low(I0) = vector_size_ * idx_diff_up[I0];
+        idx_low += idx_diff_low;
+    }
+    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }
+    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
+    {
+        return true;
+    }
+    template <typename UpIdx>
+    __host__ __device__ static constexpr bool
+    IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */)
+    {
+        return true;
+    }
+    __host__ __device__ static constexpr bool IsKnownAtCompileTime()
+    {
+        return is_known_at_compile_time<UpLengths>::value;
+    }
+    __host__ __device__ void Print() const
+    {
+        printf("{");
+        printf("Vectorize, ");
+        printf("up_lengths_");
+        print_multi_index(up_lengths_);
+        printf("}");
+    }
+};
+template <typename LowLength, typename SliceBegin, typename SliceEnd>
+struct Slice
+{
+    using LowerIndex = MultiIndex<1>;
+    using UpperIndex = MultiIndex<1>;
+    using UpLengths = decltype(make_tuple(SliceEnd{} - SliceBegin{}));
+    UpLengths up_lengths_;
+    SliceBegin slice_begin_;
+    SliceEnd slice_end_;
+    __host__ __device__ constexpr Slice() = default;
+    __host__ __device__ constexpr Slice(const LowLength&,
+                                        const SliceBegin& slice_begin,
+                                        const SliceEnd& slice_end)
+        : up_lengths_{make_tuple(slice_end - slice_begin)},
+          slice_begin_{slice_begin},
+          slice_end_{slice_end}
+    {
+    }
+    __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
+    __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; }
+    __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; }
+    template <typename LowIdx, typename UpIdx>
+    __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
+                                                           const UpIdx& idx_up) const
+    {
+        static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1,
+                      "wrong! inconsistent # of dimension");
+        idx_low(Number<0>{}) = idx_up[Number<0>{}] + slice_begin_;
+    }
+    template <typename LowIdxDiff,
+              typename UpIdxDiff,
+              typename LowIdx,
+              typename UpIdx,
+              index_t Hack>
+    __host__ __device__ static void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
+                                                     const UpIdxDiff& idx_diff_up,
+                                                     LowIdx& idx_low,
+                                                     const UpIdx&,
+                                                     Number<Hack>)
+    {
+        static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
+                          UpIdx::Size() == 1,
+                      "wrong! inconsistent # of dimension");
+        constexpr auto I0 = Number<0>{};
+        idx_diff_low(I0) = idx_diff_up[I0];
+        idx_low += idx_diff_low;
+    }
+    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }
+    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
+    {
+        return true;
+    }
+    template <typename UpIdx>
+    __host__ __device__ constexpr bool IsValidUpperIndexMappedToValidLowerIndex(const UpIdx&) const
+    {
+        return true;
+    }
+    __host__ __device__ static constexpr bool IsKnownAtCompileTime()
+    {
+        return is_known_at_compile_time<UpLengths>::value &&
+               is_known_at_compile_time<SliceBegin>::value &&
+               is_known_at_compile_time<SliceEnd>::value;
+    }
+    __host__ __device__ void Print() const
+    {
+        printf("{");
+        printf("Slice, ");
+        printf("up_lengths_");
+        print_multi_index(up_lengths_);
+        printf("slice_begin_ %d", index_t{slice_begin_});
+        printf("slice_end %d", index_t{slice_end_});
+        printf("}");
+    }
+};
+} // namespace ck
+#endif