"tests/data/vscode:/vscode.git/clone" did not exist on "6356cbdc4f652aa6004b08b0fc05f6a4ec9d0b97"
Commit 8f9c0243 authored by Alan Turner's avatar Alan Turner
Browse files

Merge branch 'develop' into migx-jit-lib

parents 181ea79a c8a8385f
repos:
- repo: local
hooks:
- id: clang-format
name: clang-format
entry: clang-format-12 -i --style=file
language: system
types_or: [c++, inc]
- id: copyright-year-checker
name: copyright-year-checker
entry: script/check_copyright_year.sh
verbose: false
language: script
types: [c++]
......@@ -12,6 +12,11 @@ Full documentation for Composable Kernel is not yet available.
- Improved performance of the normalization kernel.
### Added
- Added new cmake flag "DL_KERNELS" must be set to "ON" in order to build the gemm_dl and batched_gemm_multi_d_dl instances.
- Added new cmake flag "DTYPES" which could be set to any subset of "fp64;fp32;fp16;fp8;bf16;int8" to build instance of select data types.
- Added new cmake flag "INSTANCES_ONLY" which will only build CK library and instances without the tests, examples, or profiler.
- Added new feature: if GPU_TARGETS is not set on cmake command line, CK will be built for all targets supported by compiler.
- Added support for MI300A/MI300X.
- Added support for NAVI3x.
- Added user tutorial (#563).
- Added more instances for irregular GEMM sizes (#560).
......@@ -20,6 +25,8 @@ Full documentation for Composable Kernel is not yet available.
- Added multi-embeddings support (#542).
- Added Navi3x blockwise GEMM and real GEMM support (#541).
- Added Navi grouped ConvBwdWeight support (#505).
- Added MaxPool, AvgPool forward (#815).
- Added MaxPool backward (#750).
### Changed
- Changed ...
cmake_minimum_required(VERSION 3.14)
set(version 1.1.0)
# Check support for CUDA/HIP in CMake
project(composable_kernel VERSION ${version})
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
if (DTYPES)
add_definitions(-DDTYPES)
if (DTYPES MATCHES "int8")
add_definitions(-DCK_ENABLE_INT8)
set(CK_ENABLE_INT8 "ON")
endif()
if (DTYPES MATCHES "fp8")
add_definitions(-DCK_ENABLE_FP8)
set(CK_ENABLE_FP8 "ON")
endif()
if (DTYPES MATCHES "fp16")
add_definitions(-DCK_ENABLE_FP16)
set(CK_ENABLE_FP16 "ON")
endif()
if (DTYPES MATCHES "fp32")
add_definitions(-DCK_ENABLE_FP32)
set(CK_ENABLE_FP32 "ON")
endif()
if (DTYPES MATCHES "fp64")
add_definitions(-DCK_ENABLE_FP64)
set(CK_ENABLE_FP64 "ON")
endif()
if (DTYPES MATCHES "bf16")
add_definitions(-DCK_ENABLE_BF16)
set(CK_ENABLE_BF16 "ON")
endif()
message("DTYPES macro set to ${DTYPES}")
else()
add_definitions(-DCK_ENABLE_INT8 -DCK_ENABLE_FP8 -DCK_ENABLE_FP16 -DCK_ENABLE_FP32 -DCK_ENABLE_FP64 -DCK_ENABLE_BF16)
set(CK_ENABLE_ALL_DTYPES "ON")
endif()
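# Example: configuring with `cmake -DDTYPES="fp16;fp32" ..` defines only CK_ENABLE_FP16 and
# CK_ENABLE_FP32; leaving DTYPES unset enables all data types.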
if(DL_KERNELS)
add_definitions(-DDL_KERNELS)
set(CK_ENABLE_DL_KERNELS "ON")
endif()
if(INSTANCES_ONLY)
add_definitions(-DINSTANCES_ONLY)
set(CK_ENABLE_INSTANCES_ONLY "ON")
endif()
# CK config file to record supported datatypes, etc.
configure_file("${PROJECT_SOURCE_DIR}/include/ck/config.h.in" "${PROJECT_BINARY_DIR}/include/ck/config.h")
# CK version file to record release version as well as git commit hash
find_package(Git REQUIRED)
execute_process(COMMAND "${GIT_EXECUTABLE}" rev-parse HEAD OUTPUT_VARIABLE COMMIT_ID OUTPUT_STRIP_TRAILING_WHITESPACE)
configure_file("${PROJECT_SOURCE_DIR}/include/ck/version.h.in" "${PROJECT_BINARY_DIR}/include/ck/version.h")
enable_testing()
set(ROCM_SYMLINK_LIBS OFF)
......@@ -16,11 +67,77 @@ include(ROCMSetupVersion)
include(ROCMInstallSymlinks)
include(ROCMCreatePackage)
include(CheckCXXCompilerFlag)
include(ROCMCheckTargetIds)
include(TargetFlags)
rocm_setup_version(VERSION ${version})
list(APPEND CMAKE_PREFIX_PATH ${CMAKE_INSTALL_PREFIX} ${CMAKE_INSTALL_PREFIX}/llvm ${CMAKE_INSTALL_PREFIX}/hip /opt/rocm /opt/rocm/llvm /opt/rocm/hip)
message("GPU_TARGETS= ${GPU_TARGETS}")
message("checking which targets are supported")
#This is the list of targets to be used in case GPU_TARGETS is not set on command line
#These targets will be filtered and only supported ones will be used
#Setting GPU_TARGETS on command line will override this list
if(NOT PROFILER_ONLY)
rocm_check_target_ids(DEFAULT_GPU_TARGETS
TARGETS "gfx900;gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102")
else()
add_definitions(-DPROFILER_ONLY)
if(GPU_TARGETS)
message(FATAL_ERROR "For PROFILE_ONLY build, please do not set GPU_TARGETS, use GPU_ARCH = gfx9, gfx10, or gfx11")
endif()
if(GPU_ARCH MATCHES "gfx9")
rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx900;gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942")
elseif(GPU_ARCH MATCHES "gfx10")
rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx1030")
elseif(GPU_ARCH MATCHES "gfx11")
rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx1100;gfx1101;gfx1102")
else()
message(FATAL_ERROR "For PROFILE_ONLY build, please specify GPU_ARCH as gfx9, gfx10, or gfx11")
endif()
endif()
message("Supported GPU_TARGETS= ${DEFAULT_GPU_TARGETS}")
set(AMDGPU_TARGETS "${DEFAULT_GPU_TARGETS}" CACHE STRING " ")
if(GPU_TARGETS)
message("Building CK for the following targets: ${GPU_TARGETS}")
else()
message("Building CK for the following targets: ${AMDGPU_TARGETS}")
endif()
find_package(hip)
# No assumption that HIP kernels are launched with uniform block size for backward compatibility
# SWDEV-413293 and https://reviews.llvm.org/D155213
math(EXPR hip_VERSION_FLAT "(${hip_VERSION_MAJOR} * 1000 + ${hip_VERSION_MINOR}) * 100000 + ${hip_VERSION_PATCH}")
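# e.g. HIP 5.7 with patch 23302 flattens to (5 * 1000 + 7) * 100000 + 23302 = 500723302,
# which is the cutoff used below.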
message("hip_version_flat=${hip_VERSION_FLAT}")
if(${hip_VERSION_FLAT} GREATER 500723302)
message("Adding the fno-offload-uniform-block compiler flag")
add_compile_options(-fno-offload-uniform-block)
endif()
option(USE_BITINT_EXTENSION_INT4 "Whether to enable clang's BitInt extension to provide int4 data type." OFF)
option(USE_OPT_NAVI3X "Whether to enable LDS cumode and Wavefront32 mode for NAVI3X silicons." OFF)
if(USE_BITINT_EXTENSION_INT4)
add_compile_definitions(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4)
add_compile_options(-Wno-bit-int-extension)
message("CK compiled with USE_BITINT_EXTENSION_INT4 set to ${USE_BITINT_EXTENSION_INT4}")
endif()
if(USE_OPT_NAVI3X)
add_compile_options(-mcumode)
add_compile_options(-mno-wavefrontsize64)
message("CK compiled with USE_OPT_NAVI3X set to ${USE_OPT_NAVI3X}")
endif()
## Threads
set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)
link_libraries(Threads::Threads)
## C++
enable_language(CXX)
set(CMAKE_CXX_STANDARD 17)
......@@ -242,8 +359,9 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin)
# set CK project include directories
include_directories(BEFORE
${PROJECT_BINARY_DIR}/include
${PROJECT_SOURCE_DIR}/include
${PROJECT_SOURCE_DIR}/library/include
${HIP_INCLUDE_DIRS}
......@@ -259,18 +377,54 @@ if (NOT CK_BUILD_JIT_LIB)
endif()
message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
file(GLOB_RECURSE INSTANCE_FILES "${PROJECT_SOURCE_DIR}/*/device_*_instance.cpp")
file(GLOB dir_list RELATIVE ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/*)
set(CK_DEVICE_INSTANCES)
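# Scan each instance subdirectory's CMakeLists.txt for DTYPES guards; add the instance
# target only when it matches a requested data type (or carries no DTYPES guard at all).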
FOREACH(subdir_path ${dir_list})
set(target_dir)
IF(IS_DIRECTORY "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/${subdir_path}")
set(cmake_instance)
file(READ "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/${subdir_path}/CMakeLists.txt" cmake_instance)
set(add_inst 0)
if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp8\" " AND DTYPES MATCHES "fp8")
#message("fp8 instance found!")
set(add_inst 1)
endif()
if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp16\"" AND DTYPES MATCHES "fp16")
#message("fp16 instance found!")
set(add_inst 1)
endif()
if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp32\"" AND DTYPES MATCHES "fp32")
#message("fp32 instance found!")
set(add_inst 1)
endif()
if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp64\"" AND DTYPES MATCHES "fp64")
#message("fp64 instance found!")
set(add_inst 1)
endif()
if("${cmake_instance}" MATCHES "DTYPES MATCHES \"bf16\"" AND DTYPES MATCHES "bf16")
#message("bf16 instance found!")
set(add_inst 1)
endif()
if("${cmake_instance}" MATCHES "DTYPES MATCHES \"int8\"" AND DTYPES MATCHES "int8")
#message("int8 instance found!")
set(add_inst 1)
endif()
if(NOT "${cmake_instance}" MATCHES "DTYPES")
#message("instance should be built for all types!")
set(add_inst 1)
endif()
if(add_inst EQUAL 1 OR NOT DEFINED DTYPES)
list(APPEND CK_DEVICE_INSTANCES device_${subdir_path}_instance)
endif()
ENDIF()
ENDFOREACH()
add_custom_target(instances DEPENDS utility;${CK_DEVICE_INSTANCES} SOURCES ${INSTANCE_FILES})
add_subdirectory(library)
if(NOT DEFINED INSTANCES_ONLY)
if(NOT DEFINED PROFILER_ONLY)
rocm_package_setup_component(tests
LIBRARY_NAME composablekernel
PACKAGE_NAME tests # Prevent -static suffix on package name
......@@ -280,32 +434,35 @@ if (NOT CK_BUILD_JIT_LIB)
LIBRARY_NAME composablekernel
PACKAGE_NAME examples
)
add_subdirectory(example)
add_subdirectory(test)
rocm_package_setup_component(profiler
LIBRARY_NAME composablekernel
PACKAGE_NAME ckProfiler
)
add_subdirectory(profiler)
else()
#When building PROFILER_ONLY, label the package with GPU_ARCH
rocm_package_setup_component(profiler
LIBRARY_NAME composablekernel
PACKAGE_NAME ckProfiler_${GPU_ARCH}
)
add_subdirectory(profiler)
endif()
endif()
else()
rocm_package_setup_component(jit_library
LIBRARY_NAME composablekernel
PACKAGE_NAME jit_library
)
add_subdirectory(library)
add_subdirectory(test)
endif()
#Create an interface target for the include only files and call it "composablekernels"
include(CMakePackageConfigHelpers)
write_basic_package_version_file(
"${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfigVersion.cmake"
VERSION "${version}"
......@@ -313,9 +470,9 @@ write_basic_package_version_file(
)
configure_package_config_file(${CMAKE_CURRENT_SOURCE_DIR}/Config.cmake.in
"${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfig.cmake"
INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
NO_CHECK_REQUIRED_COMPONENTS_MACRO
"${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfig.cmake"
INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
NO_CHECK_REQUIRED_COMPONENTS_MACRO
)
rocm_install(FILES
......@@ -324,6 +481,13 @@ rocm_install(FILES
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
)
# Install CK version and configuration files
install(FILES
${PROJECT_BINARY_DIR}/include/ck/version.h
${PROJECT_BINARY_DIR}/include/ck/config.h
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck/
)
set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE")
set(CPACK_RPM_PACKAGE_LICENSE "MIT")
......
......@@ -6,9 +6,11 @@ This is the list of developers and contributors to Composable Kernel library
## Developers
[Chao Liu](https://github.com/asroy), [Jing Zhang](https://github.com/zjing14), 2018-2023
[Letao Qin](https://github.com/ltqin), [Qianfeng Zhang](https://github.com/qianfengz), [Liang Huang](https://github.com/carlushuang), [Shaojie Wang](https://github.com/shaojiewang), 2019-2023
[Anthony Chang](https://github.com/rosenrodt), [Chunyu Lai](https://github.com/rocking5566), [Illia Silin](https://github.com/illsilin), [Adam Osewski](https://github.com/aosewski), [Poyen Chen](https://github.com/poyenc), [Rosty Geyyer](https://github.com/geyyer), [Astha Rai](https://github.com/arai713), [Shi YanXing](https://github.com/Yanxing-Shi), 2022-2023
[Hari Sadasivan](https://github.com/hsadasiv), [Bartlomiej Kocot](https://github.com/bartekxk), [Bartlomiej Wroblewski](https://github.com/bwroblew), 2023
Hanwen Chang, 2019-2021
......
......@@ -12,24 +12,32 @@ RUN useradd -rm -d /home/jenkins -s /bin/bash -u 1004 jenkins
RUN chmod 1777 /tmp
RUN apt-get update
RUN apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl
RUN if [ "$ROCMVERSION" != "5.6" ]; then \
ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn
RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
RUN wget https://repo.radeon.com/amdgpu-install/5.6/ubuntu/focal/amdgpu-install_5.6.50600-1_all.deb --no-check-certificate
RUN apt-get update && \
DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
./amdgpu-install_5.6.50600-1_all.deb
RUN if [ "$ROCMVERSION" != "5.7" ]; then \
sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \
sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list'; \
elif [ "$ROCMVERSION" = "5.7" ] && [ "$compiler_version" = "" ] || [ "$compiler_version" = "amd-stg-open" ]; then \
sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_5.7-20.04-1_all.deb" && \
apt update && apt-get install -y ./amdgpu-install-internal_5.7-20.04-1_all.deb && \
amdgpu-repo --amdgpu-build=1609671 --rocm-build=compute-rocm-npi-mi300/1354; \
elif [ "$ROCMVERSION" = "5.7" ] && [ "$compiler_version" = "rc1" ]; then \
sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_5.7-20.04-1_all.deb" && \
apt update && apt-get install -y ./amdgpu-install-internal_5.7-20.04-1_all.deb && \
sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 5.7 rel-19 > /etc/apt/sources.list.d/rocm-build.list' && \
amdgpu-repo --amdgpu-build=1637781; \
fi
RUN wget --no-check-certificate -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add -
RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
RUN amdgpu-install -y --usecase=rocm --no-dkms
# Install dependencies
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
......@@ -45,6 +53,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
libpthread-stubs0-dev \
llvm-amdgpu \
pkg-config \
python \
python3 \
python3-dev \
python3-pip \
......@@ -54,12 +63,16 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
nano \
zlib1g-dev \
openssh-server \
clang-format-12 \
kmod && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
#Install latest version of cmake
RUN wget -qO /usr/local/bin/ninja.gz https://github.com/ninja-build/ninja/releases/latest/download/ninja-linux.zip
RUN gunzip /usr/local/bin/ninja.gz
RUN chmod a+x /usr/local/bin/ninja
RUN git clone https://github.com/nico/ninjatracing.git
RUN apt purge --auto-remove -y cmake
RUN apt update
RUN apt install -y software-properties-common lsb-release
......
......@@ -11,6 +11,20 @@ def show_node_info() {
"""
}
def nthreads() {
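// Use fewer threads than cores on large machines, since clang can use a lot of memory per job.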
def nproc = sh(returnStdout: true, script: 'nproc')
echo "Number of cores: ${nproc}"
def n = nproc.toInteger()
if (n > 32){
n /= 2
}
if (n > 64){
n = 64
}
echo "Number of threads used for building: ${n}"
return n
}
def runShell(String command){
def responseCode = sh returnStatus: true, script: "${command} > tmp.txt"
def output = readFile(file: "tmp.txt")
......@@ -19,7 +33,7 @@ def runShell(String command){
def getDockerImageName(){
def img
if (params.ROCMVERSION != "5.6"){
if (params.ROCMVERSION != "5.7"){
if (params.COMPILER_VERSION == "") {
img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}"
}
......@@ -219,7 +233,8 @@ def cmake_build(Map conf=[:]){
"""
def setup_cmd = conf.get("setup_cmd", "${cmake_envs} cmake ${setup_args} .. ")
// reduce parallelism when compiling, clang uses too much memory
def build_cmd = conf.get("build_cmd", "${build_envs} dumb-init make -j\$(( \$(nproc) / 2 )) ${config_targets}")
def nt = nthreads()
def build_cmd = conf.get("build_cmd", "${build_envs} dumb-init make -j${nt} ${config_targets}")
def execute_cmd = conf.get("execute_cmd", "")
def cmd = conf.get("cmd", """
......@@ -461,7 +476,7 @@ def Build_CK(Map conf=[:]){
else{
echo "GPU is OK"
}
if ( runShell('grep -n "gfx1030" clinfo.log') ){
if ( runShell('grep -n "gfx1030" clinfo.log') || runShell('grep -n "gfx1101" clinfo.log') ){
navi_node = 1
}
}
......@@ -482,7 +497,7 @@ def Build_CK(Map conf=[:]){
else{
echo "GPU is OK"
}
if ( runShell('grep -n "gfx1030" clinfo.log') ){
if ( runShell('grep -n "gfx1030" clinfo.log') || runShell('grep -n "gfx1101" clinfo.log') ){
navi_node = 1
}
}
......@@ -493,8 +508,8 @@ def Build_CK(Map conf=[:]){
{
cmake_build(conf)
dir("build"){
//run tests and examples
sh 'make -j check'
if (navi_node == 0 ){
//we only need the ckProfiler to run the performance tests, so we pack and stash it
//do not stash profiler on Navi nodes
......@@ -597,8 +612,8 @@ def process_results(Map conf=[:]){
}
//launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=5.7;COMPILER_VERSION=rc1
0 21 * * * % ROCMVERSION=5.6;COMPILER_VERSION=;COMPILER_COMMIT=
0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-stg-open;COMPILER_COMMIT=''' : ""
pipeline {
......@@ -674,7 +689,7 @@ pipeline {
-o -iname \'*.cpp.in\' \
-o -iname \'*.cl\' \
| grep -v 'build/' \
| xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-12 -style=file {} | diff - {}\'"
}
steps{
buildHipClangJobAndReboot(setup_cmd: "", build_cmd: "", execute_cmd: execute_cmd, no_reboot:true)
......@@ -695,8 +710,8 @@ pipeline {
}
agent{ label rocmnode("gfx908 || gfx90a") }
environment{
setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" """
execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941" """
execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
}
steps{
Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
......@@ -717,7 +732,7 @@ pipeline {
Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
}
}
stage("Build CK and run Tests on Navi")
stage("Build CK and run Tests on Navi21")
{
when {
beforeAgent true
......@@ -725,7 +740,7 @@ pipeline {
}
agent{ label rocmnode("navi21") }
environment{
setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1030" """
setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1030" -DDL_KERNELS=ON """
execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx1030" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
}
......@@ -733,6 +748,22 @@ pipeline {
Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
}
}
stage("Build CK and run Tests on Navi32")
{
when {
beforeAgent true
expression { !params.RUN_FULL_QA.toBoolean() }
}
agent{ label rocmnode("navi32") }
environment{
setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DDTYPES="fp16;fp32;bf16" -DGPU_TARGETS="gfx1101" """
execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx1101" -DDTYPES="fp16;fp32;bf16" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
}
steps{
Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
}
}
}
}
......
......@@ -52,6 +52,8 @@ CK is released under the MIT license. [License File](/LICENSE)
```bash
DOCKER_BUILDKIT=1 docker build -t ck:latest -f Dockerfile .
```
Pre-built Docker images are available from this public repository:
https://hub.docker.com/r/rocm/composable_kernel/tags
## Launch docker
......@@ -76,12 +78,26 @@ mkdir build && cd build
cmake \
-D CMAKE_PREFIX_PATH=/opt/rocm \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_CXX_FLAGS="-O3" \
-D CMAKE_BUILD_TYPE=Release \
-D GPU_TARGETS="gfx908;gfx90a" \
..
```
If GPU_TARGETS is not set on the cmake command line, CK will be built for all targets supported by the
current compiler.

Additional cmake flags can be used to significantly speed up the build:

INSTANCES_ONLY (OFF by default) must be set to ON in order to build only the library and instances,
skipping all tests, examples, and the profiler. This is useful for libraries that use CK as a dependency.

DTYPES (unset by default) can be set to any subset of "fp64;fp32;fp16;fp8;bf16;int8" to build instances
of the selected data types only. Currently, building the int8 instances takes a long time (a compiler fix is in the works).

DL_KERNELS (OFF by default) must be set to ON in order to build the gemm_dl and batched_gemm_multi_d_dl
instances. Those instances are only needed on the NAVI2x platforms. A sample invocation combining these
flags is shown below.
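For example, a configuration that builds only the fp16 and fp32 instances and skips the tests, examples,
and profiler could look like the following sketch (the GPU target here is only an illustration; substitute
your own):

```bash
mkdir build && cd build
cmake \
  -D CMAKE_PREFIX_PATH=/opt/rocm \
  -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
  -D CMAKE_BUILD_TYPE=Release \
  -D GPU_TARGETS="gfx90a" \
  -D DTYPES="fp16;fp32" \
  -D INSTANCES_ONLY=ON \
  ..
```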
### Build examples and tests
```bash
......@@ -109,6 +125,24 @@ make install
Instructions for using CK as a pre-built kernel library are under [client_example](/client_example)
## Contributing
When you contribute to Composable Kernel, make sure to run `clang-format` on all the changed files. We highly recommend using git hooks that are managed by the `pre-commit` framework. To install hooks, run:
```bash
sudo script/install_precommit.sh
```
This way, `pre-commit` will add the appropriate hooks to your local repository and automatically run `clang-format` (and possibly additional checks) before any commit is created.
If you need to uninstall hooks from the repository, you can do so by running the following command:
```bash
script/uninstall_precommit.sh
```
If, for any reason, you need to temporarily disable pre-commit hooks, you can add the `--no-verify` option to the `git commit` command.
## Caveat
### Kernel Timing and Verification
......
......@@ -172,18 +172,19 @@ int main()
BLayout,
CLayout>();
std::cout << "found " << gemm_reduce_ptrs.size()
<< " gemm_reduceMean_reduceSquareMean instances" << std::endl;
using NormalizeDeviceOp = ck::tensor_operation::device::DeviceElementwise<
ck::Tuple<CDataType, ReduceDataType, ReduceDataType, GammaDataType, BetaDataType>,
ck::Tuple<LayerNormOutDataType>,
ck::tensor_operation::element_wise::Normalize,
2>;
const auto normalize_ptrs =
ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
NormalizeDeviceOp>::GetInstances();
std::cout << "found " << normalize_ptrs.size() << " normalize instances" << std::endl;
auto f_matrix_space_size =
......
......@@ -53,12 +53,35 @@ int main(int argc, char* argv[])
SimpleDeviceMem in(sizeof(InDataType) * num_elements);
SimpleDeviceMem out(sizeof(OutDataType) * num_elements);
using DeviceOp = ck::tensor_operation::device::DeviceSoftmax<InDataType,
AccDataType,
OutDataType,
PassThrough,
PassThrough,
Rank,
NumReduceDim>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
auto& generic_op_ptr = op_ptrs[0];
auto generic_argument_ptr = generic_op_ptr->MakeArgumentPointer(in_lengths,
in_strides,
reduce_dims,
alpha,
beta,
in.GetDeviceBuffer(),
out.GetDeviceBuffer(),
PassThrough{},
PassThrough{});
if(!generic_op_ptr->IsSupportedArgument(generic_argument_ptr.get()))
{
throw std::runtime_error(
"The generic kernel instance should be able to support any input shapes");
};
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_op_name;
......@@ -74,11 +97,6 @@ int main(int argc, char* argv[])
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(in_lengths,
in_strides,
reduce_dims,
......
if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
add_executable(client_conv2d_fwd_bias_tanh_perchannel_quantization conv2d_fwd_bias_tanh_perchannel_quantization.cpp)
target_link_libraries(client_conv2d_fwd_bias_tanh_perchannel_quantization PRIVATE composable_kernel::device_operations)
......@@ -18,3 +19,4 @@ target_link_libraries(client_conv2d_fwd_perlayer_quantization PRIVATE composable
add_executable(client_gemm_quantization gemm_quantization.cpp)
target_link_libraries(client_gemm_quantization PRIVATE composable_kernel::device_operations)
endif()
......@@ -32,63 +32,49 @@ struct SimpleDeviceMem
};
template <ck::index_t NumDimSpatial>
std::size_t GetFlops(const std::array<ck::index_t, NumDimSpatial>& output_lengths,
const std::array<ck::index_t, NumDimSpatial>& filter_lengths)
{
constexpr ck::index_t spatial_offset = 3;
const auto C = filter_lengths[2];
// 2 * G * N * K * C * <output spatial lengths product> * <filter spatial lengths product>
return static_cast<std::size_t>(2) * C *
std::accumulate(std::begin(output_lengths),
std::end(output_lengths),
static_cast<std::size_t>(1),
std::multiplies<>()) *
std::accumulate(std::begin(filter_lengths) + spatial_offset,
std::end(filter_lengths),
static_cast<std::size_t>(1),
std::multiplies<>());
}
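// Note: output_lengths is the full {G, N, K, <spatial...>} array, so its product already
// carries G * N * K; filter_lengths contributes only its spatial entries (spatial_offset
// skips its G, K and C entries), and C is multiplied in explicitly, matching the formula above.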
template <typename InDataType, ck::index_t NumDimSpatial>
std::size_t GetInputByte(const std::array<ck::index_t, NumDimSpatial>& input_lengths)
{
// sizeof(InDataType) * (G * N * C * <input spatial lengths product>)
return sizeof(InDataType) * (std::accumulate(std::begin(input_lengths),
std::end(input_lengths),
static_cast<std::size_t>(1),
std::multiplies<>()));
}
template <typename WeiDataType, ck::index_t NumDimSpatial>
std::size_t GetWeightByte(const std::array<ck::index_t, NumDimSpatial>& filter_lengths)
{
// sizeof(WeiDataType) * (G * K * C * <filter spatial lengths product>)
return sizeof(WeiDataType) * (std::accumulate(std::begin(filter_lengths),
std::end(filter_lengths),
static_cast<std::size_t>(1),
std::multiplies<>()));
}
template <typename OutDataType, ck::index_t NumDimSpatial>
std::size_t GetOutputByte(const std::array<ck::index_t, NumDimSpatial>& output_lengths)
{
// sizeof(OutDataType) * (G * N * K * <output spatial lengths product>);
return sizeof(OutDataType) * (std::accumulate(std::begin(output_lengths),
std::end(output_lengths),
static_cast<std::size_t>(1),
std::multiplies<std::size_t>()));
}
......@@ -101,13 +87,12 @@ template <ck::index_t NumDimSpatial,
typename WeiLayout,
typename OutLayout>
bool run_grouped_conv_bwd_weight(
const std::array<ck::index_t, NumDimSpatial + 3>& input_lengths,
const std::array<ck::index_t, NumDimSpatial + 3>& input_strides,
const std::array<ck::index_t, NumDimSpatial + 3>& filter_lengths,
const std::array<ck::index_t, NumDimSpatial + 3>& weights_strides,
const std::array<ck::index_t, NumDimSpatial + 3>& output_lengths,
const std::array<ck::index_t, NumDimSpatial + 3>& output_strides,
const std::array<ck::index_t, NumDimSpatial>& conv_filter_strides,
const std::array<ck::index_t, NumDimSpatial>& conv_filter_dilations,
const std::array<ck::index_t, NumDimSpatial>& input_left_pads,
......@@ -115,9 +100,9 @@ bool run_grouped_conv_bwd_weight(
{
ck::index_t split_k = 2;
SimpleDeviceMem in(GetInputByte<InDataType, NumDimSpatial + 3>(input_lengths));
SimpleDeviceMem wei(GetWeightByte<WeiDataType, NumDimSpatial + 3>(filter_lengths));
SimpleDeviceMem out(GetOutputByte<OutDataType, NumDimSpatial + 3>(output_lengths));
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvBwdWeight<NumDimSpatial,
InLayout,
......@@ -141,6 +126,10 @@ bool run_grouped_conv_bwd_weight(
float best_gb_per_sec = 0;
float best_tflops = 0;
std::array<ck::index_t, NumDimSpatial + 3> a_g_n_c_wis_lengths{};
std::array<ck::index_t, NumDimSpatial + 3> a_g_n_c_wis_strides{};
std::array<ck::index_t, NumDimSpatial + 3> b_g_k_c_xs_lengths{};
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
......@@ -150,13 +139,12 @@ bool run_grouped_conv_bwd_weight(
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
wei.GetDeviceBuffer(),
out.GetDeviceBuffer(),
input_lengths,
input_strides,
filter_lengths,
weights_strides,
output_lengths,
output_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
......@@ -172,12 +160,10 @@ bool run_grouped_conv_bwd_weight(
{
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t flop = GetFlops<NumDimSpatial + 3>(output_lengths, filter_lengths);
std::size_t num_bytes = GetInputByte<InDataType, NumDimSpatial + 3>(input_lengths) +
GetWeightByte<WeiDataType, NumDimSpatial + 3>(filter_lengths) +
GetOutputByte<OutDataType, NumDimSpatial + 3>(output_lengths);
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_bytes / 1.E6 / avg_time;
......@@ -217,13 +203,12 @@ bool run_grouped_conv_bwd_weight(
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
wei.GetDeviceBuffer(),
out.GetDeviceBuffer(),
input_lengths,
input_strides,
filter_lengths,
weights_strides,
output_lengths,
output_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
......
......@@ -22,6 +22,16 @@ static constexpr ck::index_t C = 192;
static constexpr ck::index_t X = 3;
static constexpr ck::index_t Wi = 28;
static constexpr ck::index_t Wo = 28;
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_lengths{G, N, C, Wi};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> filter_lengths{G, K, C, X};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_lengths{G, N, K, Wo};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_strides{N * Wi * C, Wi* C, 1, C};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> weights_strides{K * X * C, X* C, 1, C};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_strides{N * Wo * K, Wo* K, 1, K};
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_strides{1};
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_dilations{1};
static constexpr std::array<ck::index_t, NumDimSpatial> input_left_pads{1};
static constexpr std::array<ck::index_t, NumDimSpatial> input_right_pads{1};
int main()
{
......@@ -31,7 +41,16 @@ int main()
OutDataType,
InLayout,
WeiLayout,
OutLayout>(input_lengths,
input_strides,
filter_lengths,
weights_strides,
output_lengths,
output_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads)
? EXIT_SUCCESS
: EXIT_FAILURE;
}
......@@ -25,6 +25,19 @@ static constexpr ck::index_t Hi = 28;
static constexpr ck::index_t Wi = 28;
static constexpr ck::index_t Ho = 28;
static constexpr ck::index_t Wo = 28;
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_lengths{G, N, C, Hi, Wi};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> filter_lengths{G, K, C, Y, X};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_lengths{G, N, K, Ho, Wo};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_strides{
N * Hi * Wi * C, Hi* Wi* C, 1, Wi* C, C};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> weights_strides{
K * Y * X * C, Y* X* C, 1, X* C, C};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_strides{
N * Ho * Wo * K, Ho* Wo* K, 1, Wo* K, K};
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_strides{1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_dilations{1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};
int main()
{
......@@ -34,8 +47,16 @@ int main()
OutDataType,
InLayout,
WeiLayout,
OutLayout>(input_lengths,
input_strides,
filter_lengths,
weights_strides,
output_lengths,
output_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads)
? EXIT_SUCCESS
: EXIT_FAILURE;
}
......@@ -28,6 +28,19 @@ static constexpr ck::index_t Wi = 3;
static constexpr ck::index_t Do = 28;
static constexpr ck::index_t Ho = 28;
static constexpr ck::index_t Wo = 3;
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_lengths{G, N, C, Di, Hi, Wi};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> filter_lengths{G, K, C, Z, Y, X};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_lengths{G, N, K, Do, Ho, Wo};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_strides{
N * Di * Hi * Wi * C, Di* Hi* Wi* C, 1, Hi* Wi* C, Wi* C, C};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> weights_strides{
K * Z * Y * X * C, Z* Y* X* C, 1, Y* X* C, X* C, C};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_strides{
N * Do * Ho * Wo * K, Do* Ho* Wo* K, 1, Ho* Wo* K, Wo* K, K};
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_strides{1, 1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_dilations{1, 1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1, 1};
int main()
{
......@@ -37,17 +50,16 @@ int main()
OutDataType,
InLayout,
WeiLayout,
OutLayout>(input_lengths,
input_strides,
filter_lengths,
weights_strides,
output_lengths,
output_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads)
? EXIT_SUCCESS
: EXIT_FAILURE;
}
......@@ -28,6 +28,19 @@ static constexpr ck::index_t Wi = 3;
static constexpr ck::index_t Do = 28;
static constexpr ck::index_t Ho = 28;
static constexpr ck::index_t Wo = 3;
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_lengths{G, N, C, Di, Hi, Wi};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> filter_lengths{G, K, C, Z, Y, X};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_lengths{G, N, K, Do, Ho, Wo};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_strides{
N * Di * Hi * Wi * C, Di* Hi* Wi* C, 1, Hi* Wi* C, Wi* C, C};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> weights_strides{
K * Z * Y * X * C, Z* Y* X* C, 1, Y* X* C, X* C, C};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_strides{
N * Do * Ho * Wo * K, Do* Ho* Wo* K, 1, Ho* Wo* K, Wo* K, K};
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_strides{1, 1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_dilations{1, 1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1, 1};
int main()
{
......@@ -37,17 +50,16 @@ int main()
OutDataType,
InLayout,
WeiLayout,
OutLayout>(input_lengths,
input_strides,
filter_lengths,
weights_strides,
output_lengths,
output_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads)
? EXIT_SUCCESS
: EXIT_FAILURE;
}
......@@ -191,6 +191,12 @@ int main(int argc, char* argv[])
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
SimpleDeviceMem workspace(workspace_sz);
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
}
......
......@@ -187,6 +187,12 @@ int main(int argc, char* argv[])
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
SimpleDeviceMem workspace(workspace_sz);
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
}
......
......@@ -72,6 +72,30 @@ int main(int argc, char* argv[])
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
const auto& generic_op_ptr = op_ptrs[0];
auto generic_argument_ptr =
generic_op_ptr->MakeArgumentPointer({N, H, W, G, C}, // lengths
xy_strides, // xStrides
gamma_beta_strides, // gammaStrides
gamma_beta_strides, // betaStrides
xy_strides, // yStrides
{1, 2, 4}, // reduceDims
1e-6,
x_device_buf.GetDeviceBuffer(),
gamma_device_buf.GetDeviceBuffer(),
beta_device_buf.GetDeviceBuffer(),
y_device_buf.GetDeviceBuffer(),
nullptr,
nullptr,
Swish{});
if(!generic_op_ptr->IsSupportedArgument(generic_argument_ptr.get()))
{
throw std::runtime_error(
"The generic kernel instance should be able to support any input shapes");
};
std::string best_op_name;
bool found = false;
int best_op_id = -1;
......
......@@ -16,6 +16,9 @@ using InDataType = ck::half_t;
using OutDataType = ck::half_t;
using IndexDataType = int32_t;
using InLayout = ck::tensor_layout::convolution::NDHWC;
using OutLayout = ck::tensor_layout::convolution::NDHWC;
constexpr ck::index_t InOutRank = 5;
constexpr ck::index_t WindowRank = 3;
#if 0
......@@ -44,33 +47,41 @@ struct SimpleDeviceMem
int main(int argc, char* argv[])
{
ck::index_t N = 2;
ck::index_t C = 32;
ck::index_t Z = 2;
ck::index_t Y = 2;
ck::index_t X = 2;
ck::index_t Di = 30;
ck::index_t Hi = 30;
ck::index_t Wi = 30;
ck::index_t window_stride_d = 2;
ck::index_t window_stride_h = 2;
ck::index_t window_stride_w = 2;
ck::index_t window_dilation_d = 1;
ck::index_t window_dilation_h = 1;
ck::index_t window_dilation_w = 1;
ck::index_t in_left_pad_d = 1;
ck::index_t in_left_pad_h = 1;
ck::index_t in_left_pad_w = 1;
ck::index_t in_right_pad_d = 1;
ck::index_t in_right_pad_h = 1;
ck::index_t in_right_pad_w = 1;
const ck::index_t Zs = (Z - 1) * window_dilation_d + 1;
const ck::index_t Ys = (Y - 1) * window_dilation_h + 1;
const ck::index_t Xs = (X - 1) * window_dilation_w + 1;
ck::index_t Do = (Di + in_left_pad_d + in_right_pad_d - Zs) / window_stride_d + 1;
ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Ys) / window_stride_h + 1;
ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - Xs) / window_stride_w + 1;
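// e.g. Di = 30, pads 1 + 1, window Z = 2 with dilation 1 (Zs = 2), stride 2:
// Do = (30 + 1 + 1 - 2) / 2 + 1 = 16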
// Pool API only supports the order of NCDHW
std::vector<ck::index_t> in_length = {N, C, Di, Hi, Wi};
std::vector<ck::index_t> out_length = {N, C, Do, Ho, Wo};
std::vector<ck::index_t> window_spatial_lengths = {Z, Y, X};
std::vector<ck::index_t> window_strides = {window_stride_d, window_stride_h, window_stride_w};
std::vector<ck::index_t> window_dilations{
window_dilation_d, window_dilation_h, window_dilation_w};
std::vector<ck::index_t> input_left_pads = {in_left_pad_d, in_left_pad_h, in_left_pad_w};
std::vector<ck::index_t> input_right_pads = {in_right_pad_d, in_right_pad_h, in_right_pad_w};
......@@ -90,6 +101,8 @@ int main(int argc, char* argv[])
InDataType,
OutDataType,
IndexDataType,
InLayout,
OutLayout,
ReduceOpId,
OutputIndex>;
......@@ -122,6 +135,7 @@ int main(int argc, char* argv[])
out_tensor_stride,
out_tensor_stride,
window_strides,
window_dilations,
input_left_pads,
input_right_pads,
{2, 3, 4});
......@@ -181,6 +195,7 @@ int main(int argc, char* argv[])
out_tensor_stride,
out_tensor_stride,
window_strides,
window_dilations,
input_left_pads,
input_right_pads,
{2, 3, 4});
......
......@@ -10,14 +10,18 @@
#include "ck/tensor_operation/gpu/device/device_pool_fwd.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/pool2d_fwd.hpp"
#include "ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp"
using InDataType = ck::half_t;
using OutDataType = ck::half_t;
using IndexDataType = int32_t;
// We use pool3d to implement pool2d in this example
using InLayout = ck::tensor_layout::convolution::NDHWC;
using OutLayout = ck::tensor_layout::convolution::NDHWC;
constexpr ck::index_t InOutRank = 5;
constexpr ck::index_t WindowRank = 3;
#if 1
constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
constexpr bool OutputIndex = true;
......@@ -42,31 +46,66 @@ struct SimpleDeviceMem
void* p_mem_;
};
void TransformPool2dparamToPool3d(std::vector<ck::index_t>& input_lengths,
std::vector<ck::index_t>& window_lengths,
std::vector<ck::index_t>& output_lengths,
std::vector<ck::index_t>& input_stride,
std::vector<ck::index_t>& output_stride,
std::vector<ck::index_t>& indices_stride,
std::vector<ck::index_t>& window_strides,
std::vector<ck::index_t>& window_dilations,
std::vector<ck::index_t>& input_left_pads,
std::vector<ck::index_t>& input_right_pads,
std::vector<ck::index_t>& pooling_dims)
{
// NCHW to NCDHW
input_lengths.insert(input_lengths.begin() + 2, 1);
output_lengths.insert(output_lengths.begin() + 2, 1);
input_stride.insert(input_stride.begin() + 2, 0);
output_stride.insert(output_stride.begin() + 2, 0);
indices_stride.insert(indices_stride.begin() + 2, 0);
// YX to ZYX
window_lengths.insert(window_lengths.begin(), 1);
window_strides.insert(window_strides.begin(), 0);
window_dilations.insert(window_dilations.begin(), 0);
input_left_pads.insert(input_left_pads.begin(), 0);
input_right_pads.insert(input_right_pads.begin(), 0);
pooling_dims = {2, 3, 4};
}
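// The transform gives the dummy depth dimension length 1 and stride 0, so a 3D pooling
// instance can process the original 2D problem unchanged.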
int main(int argc, char* argv[])
{
ck::index_t N = 2;
ck::index_t C = 32;
ck::index_t Y = 2;
ck::index_t X = 2;
ck::index_t Hi = 30;
ck::index_t Wi = 30;
ck::index_t window_stride_h = 2;
ck::index_t window_stride_w = 2;
ck::index_t window_dilation_h = 1;
ck::index_t window_dilation_w = 1;
ck::index_t in_left_pad_h = 1;
ck::index_t in_left_pad_w = 1;
ck::index_t in_right_pad_h = 1;
ck::index_t in_right_pad_w = 1;
const ck::index_t Ys = (Y - 1) * window_dilation_h + 1;
const ck::index_t Xs = (X - 1) * window_dilation_w + 1;
ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Ys) / window_stride_h + 1;
ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - Xs) / window_stride_w + 1;
// Pool API only supports the order of NCHW
std::vector<ck::index_t> in_length = {N, C, Hi, Wi};
std::vector<ck::index_t> out_length = {N, C, Ho, Wo};
std::vector<ck::index_t> window_spatial_lengths = {Y, X};
std::vector<ck::index_t> window_strides = {window_stride_h, window_stride_w};
std::vector<ck::index_t> window_dilations = {window_dilation_h, window_dilation_w};
std::vector<ck::index_t> input_left_pads = {in_left_pad_h, in_left_pad_w};
std::vector<ck::index_t> input_right_pads = {in_right_pad_h, in_right_pad_w};
std::vector<ck::index_t> pooling_dims = {2, 3};
std::size_t in_tensor_size = N * C * Hi * Wi;
std::size_t out_tensor_size = N * C * Ho * Wo;
......@@ -75,6 +114,18 @@ int main(int argc, char* argv[])
std::vector<ck::index_t> in_tensor_stride = {C * Hi * Wi, 1, Wi * C, C};
std::vector<ck::index_t> out_tensor_stride = {C * Ho * Wo, 1, Wo * C, C};
TransformPool2dparamToPool3d(in_length,
window_spatial_lengths,
out_length,
in_tensor_stride,
out_tensor_stride,
out_tensor_stride,
window_strides,
window_dilations,
input_left_pads,
input_right_pads,
pooling_dims);
SimpleDeviceMem in_device_buf(sizeof(InDataType) * in_tensor_size);
SimpleDeviceMem out_device_buf(sizeof(OutDataType) * out_tensor_size);
SimpleDeviceMem out_indices_device_buf(sizeof(IndexDataType) * out_tensor_size);
......@@ -84,6 +135,8 @@ int main(int argc, char* argv[])
InDataType,
OutDataType,
IndexDataType,
InLayout,
OutLayout,
ReduceOpId,
OutputIndex>;
......@@ -116,9 +169,10 @@ int main(int argc, char* argv[])
out_tensor_stride,
out_tensor_stride,
window_strides,
window_dilations,
input_left_pads,
input_right_pads,
pooling_dims);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
......@@ -175,9 +229,10 @@ int main(int argc, char* argv[])
out_tensor_stride,
out_tensor_stride,
window_strides,
window_dilations,
input_left_pads,
input_right_pads,
pooling_dims);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
......