Commit b93575ca authored by Jing Zhang

merge develop

parents 54df59bf c8a8385f
@@ -3,7 +3,7 @@ repos:
hooks:
- id: clang-format
name: clang-format
entry: clang-format-10 -i --style=file
entry: clang-format-12 -i --style=file
language: system
types_or: [c++, inc]
- id: copyright-year-checker
@@ -25,8 +25,8 @@ Full documentation for Composable Kernel is not yet available.
- Added multi-embeddings support (#542).
- Added Navi3x blockwise GEMM and real GEMM support (#541).
- Added Navi grouped ConvBwdWeight support (#505).
- Added pool3d forward (#697).
- Added maxpool backward (#750).
- Added MaxPool, AvgPool forward (#815).
- Added MaxPool backward (#750).
### Changed
- Changed ...
cmake_minimum_required(VERSION 3.14)
set(version 1.1.0)
# Check support for CUDA/HIP in CMake
project(composable_kernel)
project(composable_kernel VERSION ${version})
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
if (DTYPES)
add_definitions(-DDTYPES)
if (DTYPES MATCHES "int8")
add_definitions(-D__int8__)
endif()
if (DTYPES MATCHES "fp8")
add_definitions(-D__fp8__)
endif()
if (DTYPES MATCHES "fp16")
add_definitions(-D__fp16__)
endif()
if (DTYPES MATCHES "fp32")
add_definitions(-D__fp32__)
endif()
if (DTYPES MATCHES "fp64")
add_definitions(-D__fp64__)
endif()
if (DTYPES MATCHES "bf16")
add_definitions(-D__bf16__)
endif()
message("DTYPES macro set to ${DTYPES}")
add_definitions(-DDTYPES)
if (DTYPES MATCHES "int8")
add_definitions(-DCK_ENABLE_INT8)
set(CK_ENABLE_INT8 "ON")
endif()
if (DTYPES MATCHES "fp8")
add_definitions(-DCK_ENABLE_FP8)
set(CK_ENABLE_FP8 "ON")
endif()
if (DTYPES MATCHES "fp16")
add_definitions(-DCK_ENABLE_FP16)
set(CK_ENABLE_FP16 "ON")
endif()
if (DTYPES MATCHES "fp32")
add_definitions(-DCK_ENABLE_FP32)
set(CK_ENABLE_FP32 "ON")
endif()
if (DTYPES MATCHES "fp64")
add_definitions(-DCK_ENABLE_FP64)
set(CK_ENABLE_FP64 "ON")
endif()
if (DTYPES MATCHES "bf16")
add_definitions(-DCK_ENABLE_BF16)
set(CK_ENABLE_BF16 "ON")
endif()
message("DTYPES macro set to ${DTYPES}")
else()
add_definitions(-D__int8__ -D__fp8__ -D__fp16__ -D__fp32__ -D__fp64__ -D__bf16__)
add_definitions(-DCK_ENABLE_INT8 -DCK_ENABLE_FP8 -DCK_ENABLE_FP16 -DCK_ENABLE_FP32 -DCK_ENABLE_FP64 -DCK_ENABLE_BF16)
set(CK_ENABLE_ALL_DTYPES "ON")
endif()
if(DL_KERNELS)
add_definitions(-DDL_KERNELS)
set(CK_ENABLE_DL_KERNELS "ON")
endif()
if(INSTANCES_ONLY)
add_definitions(-DINSTANCES_ONLY)
set(CK_ENABLE_INSTANCES_ONLY "ON")
endif()
# CK config file to record supported datatypes, etc.
configure_file("${PROJECT_SOURCE_DIR}/include/ck/config.h.in" "${PROJECT_BINARY_DIR}/include/ck/config.h")
# CK version file to record release version as well as git commit hash
find_package(Git REQUIRED)
execute_process(COMMAND "${GIT_EXECUTABLE}" rev-parse HEAD OUTPUT_VARIABLE COMMIT_ID OUTPUT_STRIP_TRAILING_WHITESPACE)
configure_file("${PROJECT_SOURCE_DIR}/include/ck/version.h.in" "${PROJECT_BINARY_DIR}/include/ck/version.h")
enable_testing()
set(ROCM_SYMLINK_LIBS OFF)
@@ -50,8 +68,10 @@ include(ROCMInstallSymlinks)
include(ROCMCreatePackage)
include(CheckCXXCompilerFlag)
include(ROCMCheckTargetIds)
rocm_setup_version(VERSION 0.2.0)
include(TargetFlags)
rocm_setup_version(VERSION ${version})
list(APPEND CMAKE_PREFIX_PATH ${CMAKE_INSTALL_PREFIX} ${CMAKE_INSTALL_PREFIX}/llvm ${CMAKE_INSTALL_PREFIX}/hip /opt/rocm /opt/rocm/llvm /opt/rocm/hip)
message("GPU_TARGETS= ${GPU_TARGETS}")
@@ -60,12 +80,43 @@ message("checking which targets are supported")
#This is the list of targets to be used in case GPU_TARGETS is not set on command line
#These targets will be filtered and only supported ones will be used
#Setting GPU_TARGETS on command line will override this list
rocm_check_target_ids(DEFAULT_GPU_TARGETS
TARGETS "gfx900;gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102"
)
if(NOT PROFILER_ONLY)
rocm_check_target_ids(DEFAULT_GPU_TARGETS
TARGETS "gfx900;gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102")
else()
add_definitions(-DPROFILER_ONLY)
if(GPU_TARGETS)
message(FATAL_ERROR "For PROFILE_ONLY build, please do not set GPU_TARGETS, use GPU_ARCH = gfx9, gfx10, or gfx11")
endif()
if(GPU_ARCH MATCHES "gfx9")
rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx900;gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942")
elseif(GPU_ARCH MATCHES "gfx10")
rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx1030")
elseif(GPU_ARCH MATCHES "gfx11")
rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx1100;gfx1101;gfx1102")
else()
message(FATAL_ERROR "For PROFILE_ONLY build, please specify GPU_ARCH as gfx9, gfx10, or gfx11")
endif()
endif()
message("Supported GPU_TARGETS= ${DEFAULT_GPU_TARGETS}")
set(AMDGPU_TARGETS "${DEFAULT_GPU_TARGETS}" CACHE STRING " ")
if(GPU_TARGETS)
message("Building CK for the following targets: ${GPU_TARGETS}")
else()
message("Building CK for the following targets: ${AMDGPU_TARGETS}")
endif()
find_package(hip)
# For backward compatibility, do not assume that HIP kernels are launched with a uniform block size
# SWDEV-413293 and https://reviews.llvm.org/D155213
math(EXPR hip_VERSION_FLAT "(${hip_VERSION_MAJOR} * 1000 + ${hip_VERSION_MINOR}) * 100000 + ${hip_VERSION_PATCH}")
message("hip_version_flat=${hip_VERSION_FLAT}")
if(${hip_VERSION_FLAT} GREATER 500723302)
message("Adding the fno-offload-uniform-block compiler flag")
add_compile_options(-fno-offload-uniform-block)
endif()
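# For illustration: HIP 5.7.23302 flattens to (5 * 1000 + 7) * 100000 + 23302 = 500723302,
# so -fno-offload-uniform-block is added only for HIP versions strictly newer than 5.7.23302.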
option(USE_BITINT_EXTENSION_INT4 "Whether to enable clang's BitInt extension to provide int4 data type." OFF)
option(USE_OPT_NAVI3X "Whether to enable LDS cumode and Wavefront32 mode for NAVI3X silicon." OFF)
@@ -284,13 +335,14 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin)
# set CK project include directories
include_directories(BEFORE
${PROJECT_BINARY_DIR}/include
${PROJECT_SOURCE_DIR}/include
${PROJECT_SOURCE_DIR}/library/include
${HIP_INCLUDE_DIRS}
)
SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV")
if(BUILD_DEV)
add_compile_options(-Werror)
@@ -310,35 +362,35 @@ IF(IS_DIRECTORY "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu
file(READ "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/${subdir_path}/CMakeLists.txt" cmake_instance)
set(add_inst 0)
if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp8\" " AND DTYPES MATCHES "fp8")
#message("fp8 instance found!")
set(add_inst 1)
#message("fp8 instance found!")
set(add_inst 1)
endif()
if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp16\"" AND DTYPES MATCHES "fp16")
#message("fp16 instance found!")
set(add_inst 1)
#message("fp16 instance found!")
set(add_inst 1)
endif()
if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp32\"" AND DTYPES MATCHES "fp32")
#message("fp32 instance found!")
set(add_inst 1)
#message("fp32 instance found!")
set(add_inst 1)
endif()
if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp64\"" AND DTYPES MATCHES "fp64")
#message("fp64 instance found!")
set(add_inst 1)
#message("fp64 instance found!")
set(add_inst 1)
endif()
if("${cmake_instance}" MATCHES "DTYPES MATCHES \"bf16\"" AND DTYPES MATCHES "bf16")
#message("bf16 instance found!")
set(add_inst 1)
#message("bf16 instance found!")
set(add_inst 1)
endif()
if("${cmake_instance}" MATCHES "DTYPES MATCHES \"int8\"" AND DTYPES MATCHES "int8")
#message("int8 instance found!")
set(add_inst 1)
#message("int8 instance found!")
set(add_inst 1)
endif()
if(NOT "${cmake_instance}" MATCHES "DTYPES")
#message("instance should be built for all types!")
set(add_inst 1)
#message("instance should be built for all types!")
set(add_inst 1)
endif()
if(add_inst EQUAL 1 OR NOT DEFINED DTYPES)
list(APPEND CK_DEVICE_INSTANCES device_${subdir_path}_instance)
list(APPEND CK_DEVICE_INSTANCES device_${subdir_path}_instance)
endif()
ENDIF()
ENDFOREACH()
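# Summary of the scan above: each instance subdirectory's CMakeLists.txt is read, and the
# device_<subdir>_instance target is appended when a "DTYPES MATCHES" guard in that file
# matches a requested data type, when the file carries no DTYPES guard, or when DTYPES is unset.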
@@ -347,6 +399,7 @@ add_custom_target(instances DEPENDS utility;${CK_DEVICE_INSTANCES} SOURCES ${IN
add_subdirectory(library)
if(NOT DEFINED INSTANCES_ONLY)
if(NOT DEFINED PROFILER_ONLY)
rocm_package_setup_component(tests
LIBRARY_NAME composablekernel
PACKAGE_NAME tests # Prevent -static suffix on package name
@@ -356,21 +409,27 @@ if(NOT DEFINED INSTANCES_ONLY)
LIBRARY_NAME composablekernel
PACKAGE_NAME examples
)
add_subdirectory(example)
add_subdirectory(test)
rocm_package_setup_component(profiler
LIBRARY_NAME composablekernel
PACKAGE_NAME ckProfiler
)
add_subdirectory(example)
add_subdirectory(test)
add_subdirectory(profiler)
else()
#When building PROFILER_ONLY, label the package with GPU_ARCH
rocm_package_setup_component(profiler
LIBRARY_NAME composablekernel
PACKAGE_NAME ckProfiler_${GPU_ARCH}
)
add_subdirectory(profiler)
endif()
endif()
#Create an interface target for the include-only files and call it "composablekernels"
include(CMakePackageConfigHelpers)
set(version 1.0.0)
write_basic_package_version_file(
"${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfigVersion.cmake"
VERSION "${version}"
@@ -378,9 +437,9 @@ write_basic_package_version_file(
)
configure_package_config_file(${CMAKE_CURRENT_SOURCE_DIR}/Config.cmake.in
"${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfig.cmake"
INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
NO_CHECK_REQUIRED_COMPONENTS_MACRO
"${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfig.cmake"
INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
NO_CHECK_REQUIRED_COMPONENTS_MACRO
)
rocm_install(FILES
@@ -389,6 +448,13 @@ rocm_install(FILES
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
)
# Install CK version and configuration files
install(FILES
${PROJECT_BINARY_DIR}/include/ck/version.h
${PROJECT_BINARY_DIR}/include/ck/config.h
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck/
)
set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE")
set(CPACK_RPM_PACKAGE_LICENSE "MIT")
@@ -6,9 +6,11 @@ This is the list of developers and contributors to Composable Kernel library
## Developers
[Chao Liu](https://github.com/asroy), [Jing Zhang](https://github.com/zjing14), 2018-2023
[Letao Qin](https://github.com/ltqin), [Qianfeng Zhang](https://github.com/qianfengz), [Liang Huang](https://github.com/carlushuang), [Shaojie Wang](https://github.com/shaojiewang), 2019-2022
[Letao Qin](https://github.com/ltqin), [Qianfeng Zhang](https://github.com/qianfengz), [Liang Huang](https://github.com/carlushuang), [Shaojie Wang](https://github.com/shaojiewang), 2019-2023
[Anthony Chang](https://github.com/rosenrodt), [Chunyu Lai](https://github.com/rocking5566), [Illia Silin](https://github.com/illsilin), [Adam Osewski](https://github.com/aosewski), [Poyen Chen](https://github.com/poyenc), [Rosty Geyyer](https://github.com/geyyer), 2022
[Anthony Chang](https://github.com/rosenrodt), [Chunyu Lai](https://github.com/rocking5566), [Illia Silin](https://github.com/illsilin), [Adam Osewski](https://github.com/aosewski), [Poyen Chen](https://github.com/poyenc), [Rosty Geyyer](https://github.com/geyyer), [Astha Rai](https://github.com/arai713), [Shi YanXing](https://github.com/Yanxing-Shi), 2022-2023
[Hari Sadasivan](https://github.com/hsadasiv), [Bartlomiej Kocot](https://github.com/bartekxk), [Bartlomiej Wroblewski](https://github.com/bwroblew), 2023
Hanwen Chang, 2019-2021
@@ -26,9 +26,14 @@ RUN if [ "$ROCMVERSION" != "5.7" ]; then \
sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \
sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list'; \
elif [ "$ROCMVERSION" = "5.7" ] && [ "$compiler_version" = "" ] || [ "$compiler_version" = "amd-stg-open" ]; then \
sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_5.7-20.04-1_all.deb" && \
apt update && apt-get install -y ./amdgpu-install-internal_5.7-20.04-1_all.deb && \
amdgpu-repo --amdgpu-build=1609671 --rocm-build=compute-rocm-npi-mi300/1354; \
sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_5.7-20.04-1_all.deb" && \
apt update && apt-get install -y ./amdgpu-install-internal_5.7-20.04-1_all.deb && \
amdgpu-repo --amdgpu-build=1609671 --rocm-build=compute-rocm-npi-mi300/1354; \
elif [ "$ROCMVERSION" = "5.7" ] && [ "$compiler_version" = "rc1" ]; then \
sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_5.7-20.04-1_all.deb" && \
apt update && apt-get install -y ./amdgpu-install-internal_5.7-20.04-1_all.deb && \
sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 5.7 rel-19 > /etc/apt/sources.list.d/rocm-build.list' && \
amdgpu-repo --amdgpu-build=1637781; \
fi
RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
@@ -58,7 +63,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
nano \
zlib1g-dev \
openssh-server \
clang-format-10 \
clang-format-12 \
kmod && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
@@ -612,7 +612,7 @@ def process_results(Map conf=[:]){
}
//launch develop branch daily at 23:00 UTC in FULL_QA mode, at 21:00 UTC with ROCm 5.6, and at 19:00 UTC with the latest staging compiler version
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=5.7;COMPILER_VERSION=rc1
0 21 * * * % ROCMVERSION=5.6;COMPILER_VERSION=;COMPILER_COMMIT=
0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-stg-open;COMPILER_COMMIT=''' : ""
@@ -689,7 +689,7 @@ pipeline {
-o -iname \'*.cpp.in\' \
-o -iname \'*.cl\' \
| grep -v 'build/' \
| xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-10 -style=file {} | diff - {}\'"
| xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-12 -style=file {} | diff - {}\'"
}
steps{
buildHipClangJobAndReboot(setup_cmd: "", build_cmd: "", execute_cmd: execute_cmd, no_reboot:true)
@@ -710,8 +710,8 @@ pipeline {
}
agent{ label rocmnode("gfx908 || gfx90a") }
environment{
setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a;gfx940" """
execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a;gfx940" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941" """
execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
}
steps{
Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
@@ -32,63 +32,49 @@ struct SimpleDeviceMem
};
template <ck::index_t NumDimSpatial>
std::size_t GetFlops(ck::index_t G,
ck::index_t N,
ck::index_t K,
ck::index_t C,
const std::array<ck::index_t, NumDimSpatial>& output_spatial_lengths,
const std::array<ck::index_t, NumDimSpatial>& filter_spatial_lengths)
std::size_t GetFlops(const std::array<ck::index_t, NumDimSpatial>& output_lengths,
const std::array<ck::index_t, NumDimSpatial>& filter_lengths)
{
constexpr ck::index_t spatial_offset = 3;
const auto C = filter_lengths[2];
// 2 * G * N * K * C * <output spatial lengths product> * <filter spatial lengths product>
return static_cast<std::size_t>(2) * G * N * K * C *
std::accumulate(std::begin(output_spatial_lengths),
std::end(output_spatial_lengths),
return static_cast<std::size_t>(2) * C *
std::accumulate(std::begin(output_lengths),
std::end(output_lengths),
static_cast<std::size_t>(1),
std::multiplies<>()) *
std::accumulate(std::begin(filter_spatial_lengths),
std::end(filter_spatial_lengths),
std::accumulate(std::begin(filter_lengths) + spatial_offset,
std::end(filter_lengths),
static_cast<std::size_t>(1),
std::multiplies<>());
}
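// Note: output_lengths now carries G, N and K in its leading entries, so
// 2 * C * <product of output_lengths> * <filter spatial lengths product> equals the former
// 2 * G * N * K * C * <output spatial lengths product> * <filter spatial lengths product>.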
template <typename InDataType, ck::index_t NumDimSpatial>
std::size_t GetInputByte(ck::index_t G,
ck::index_t N,
ck::index_t C,
const std::array<ck::index_t, NumDimSpatial>& input_spatial_lengths)
std::size_t GetInputByte(const std::array<ck::index_t, NumDimSpatial>& input_lengths)
{
// sizeof(InDataType) * (product of input lengths, i.e. G * N * C * <input spatial lengths product>)
return sizeof(InDataType) * (G * N * C *
std::accumulate(std::begin(input_spatial_lengths),
std::end(input_spatial_lengths),
return sizeof(InDataType) * (std::accumulate(std::begin(input_lengths),
std::end(input_lengths),
static_cast<std::size_t>(1),
std::multiplies<>()));
}
template <typename WeiDataType, ck::index_t NumDimSpatial>
std::size_t GetWeightByte(ck::index_t G,
ck::index_t K,
ck::index_t C,
const std::array<ck::index_t, NumDimSpatial>& filter_spatial_lengths)
std::size_t GetWeightByte(const std::array<ck::index_t, NumDimSpatial>& filter_lengths)
{
// sizeof(WeiDataType) * (product of filter lengths, i.e. G * K * C * <filter spatial lengths product>)
return sizeof(WeiDataType) * (G * K * C *
std::accumulate(std::begin(filter_spatial_lengths),
std::end(filter_spatial_lengths),
return sizeof(WeiDataType) * (std::accumulate(std::begin(filter_lengths),
std::end(filter_lengths),
static_cast<std::size_t>(1),
std::multiplies<>()));
}
template <typename OutDataType, ck::index_t NumDimSpatial>
std::size_t GetOutputByte(ck::index_t G,
ck::index_t N,
ck::index_t K,
const std::array<ck::index_t, NumDimSpatial>& output_spatial_lengths)
std::size_t GetOutputByte(const std::array<ck::index_t, NumDimSpatial>& output_lengths)
{
// sizeof(OutDataType) * (product of output lengths, i.e. G * N * K * <output spatial lengths product>)
return sizeof(OutDataType) * (G * N * K *
std::accumulate(std::begin(output_spatial_lengths),
std::end(output_spatial_lengths),
return sizeof(OutDataType) * (std::accumulate(std::begin(output_lengths),
std::end(output_lengths),
static_cast<std::size_t>(1),
std::multiplies<std::size_t>()));
}
@@ -101,14 +87,11 @@ template <ck::index_t NumDimSpatial,
typename WeiLayout,
typename OutLayout>
bool run_grouped_conv_bwd_weight(
const ck::index_t G,
const ck::index_t N,
const ck::index_t K,
const ck::index_t C,
const std::array<ck::index_t, NumDimSpatial>& input_spatial_lengths,
const std::array<ck::index_t, NumDimSpatial>& filter_spatial_lengths,
const std::array<ck::index_t, NumDimSpatial>& output_spatial_lengths,
const std::array<ck::index_t, NumDimSpatial + 3>& input_lengths,
const std::array<ck::index_t, NumDimSpatial + 3>& input_strides,
const std::array<ck::index_t, NumDimSpatial + 3>& filter_lengths,
const std::array<ck::index_t, NumDimSpatial + 3>& weights_strides,
const std::array<ck::index_t, NumDimSpatial + 3>& output_lengths,
const std::array<ck::index_t, NumDimSpatial + 3>& output_strides,
const std::array<ck::index_t, NumDimSpatial>& conv_filter_strides,
const std::array<ck::index_t, NumDimSpatial>& conv_filter_dilations,
@@ -117,9 +100,9 @@ bool run_grouped_conv_bwd_weight(
{
ck::index_t split_k = 2;
SimpleDeviceMem in(GetInputByte<InDataType, NumDimSpatial>(G, N, C, input_spatial_lengths));
SimpleDeviceMem wei(GetWeightByte<WeiDataType, NumDimSpatial>(G, K, C, filter_spatial_lengths));
SimpleDeviceMem out(GetOutputByte<OutDataType, NumDimSpatial>(G, N, K, output_spatial_lengths));
SimpleDeviceMem in(GetInputByte<InDataType, NumDimSpatial + 3>(input_lengths));
SimpleDeviceMem wei(GetWeightByte<WeiDataType, NumDimSpatial + 3>(filter_lengths));
SimpleDeviceMem out(GetOutputByte<OutDataType, NumDimSpatial + 3>(output_lengths));
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvBwdWeight<NumDimSpatial,
InLayout,
@@ -143,6 +126,10 @@ bool run_grouped_conv_bwd_weight(
float best_gb_per_sec = 0;
float best_tflops = 0;
std::array<ck::index_t, NumDimSpatial + 3> a_g_n_c_wis_lengths{};
std::array<ck::index_t, NumDimSpatial + 3> a_g_n_c_wis_strides{};
std::array<ck::index_t, NumDimSpatial + 3> b_g_k_c_xs_lengths{};
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
@@ -152,14 +139,11 @@ bool run_grouped_conv_bwd_weight(
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
wei.GetDeviceBuffer(),
out.GetDeviceBuffer(),
G,
N,
K,
C,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
input_lengths,
input_strides,
filter_lengths,
weights_strides,
output_lengths,
output_strides,
conv_filter_strides,
conv_filter_dilations,
@@ -176,12 +160,10 @@ bool run_grouped_conv_bwd_weight(
{
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t flop =
GetFlops<NumDimSpatial>(G, N, K, C, output_spatial_lengths, filter_spatial_lengths);
std::size_t num_bytes =
GetInputByte<InDataType, NumDimSpatial>(G, N, C, input_spatial_lengths) +
GetWeightByte<WeiDataType, NumDimSpatial>(G, K, C, filter_spatial_lengths) +
GetOutputByte<OutDataType, NumDimSpatial>(G, N, K, output_spatial_lengths);
std::size_t flop = GetFlops<NumDimSpatial + 3>(output_lengths, filter_lengths);
std::size_t num_bytes = GetInputByte<InDataType, NumDimSpatial + 3>(input_lengths) +
GetWeightByte<WeiDataType, NumDimSpatial + 3>(filter_lengths) +
GetOutputByte<OutDataType, NumDimSpatial + 3>(output_lengths);
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_bytes / 1.E6 / avg_time;
@@ -221,14 +203,11 @@ bool run_grouped_conv_bwd_weight(
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
wei.GetDeviceBuffer(),
out.GetDeviceBuffer(),
G,
N,
K,
C,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
input_lengths,
input_strides,
filter_lengths,
weights_strides,
output_lengths,
output_strides,
conv_filter_strides,
conv_filter_dilations,
@@ -22,11 +22,12 @@ static constexpr ck::index_t C = 192;
static constexpr ck::index_t X = 3;
static constexpr ck::index_t Wi = 28;
static constexpr ck::index_t Wo = 28;
static constexpr std::array<ck::index_t, NumDimSpatial> input_spatial_lengths{Wi};
static constexpr std::array<ck::index_t, NumDimSpatial> filter_spatial_lengths{X};
static constexpr std::array<ck::index_t, NumDimSpatial> output_spatial_lengths{Wo};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_strides{N * Wi * C, Wi* C, C, 1};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_strides{N * Wo * K, Wo* K, K, 1};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_lengths{G, N, C, Wi};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> filter_lengths{G, K, C, X};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_lengths{G, N, K, Wo};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_strides{N * Wi * C, Wi* C, 1, C};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> weights_strides{K * X * C, X* C, 1, C};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_strides{N * Wo * K, Wo* K, 1, K};
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_strides{1};
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_dilations{1};
static constexpr std::array<ck::index_t, NumDimSpatial> input_left_pads{1};
@@ -40,14 +41,11 @@ int main()
OutDataType,
InLayout,
WeiLayout,
OutLayout>(G,
N,
K,
C,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
OutLayout>(input_lengths,
input_strides,
filter_lengths,
weights_strides,
output_lengths,
output_strides,
conv_filter_strides,
conv_filter_dilations,
@@ -25,13 +25,15 @@ static constexpr ck::index_t Hi = 28;
static constexpr ck::index_t Wi = 28;
static constexpr ck::index_t Ho = 28;
static constexpr ck::index_t Wo = 28;
static constexpr std::array<ck::index_t, NumDimSpatial> input_spatial_lengths{Hi, Wi};
static constexpr std::array<ck::index_t, NumDimSpatial> filter_spatial_lengths{Y, X};
static constexpr std::array<ck::index_t, NumDimSpatial> output_spatial_lengths{Ho, Wo};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_lengths{G, N, C, Hi, Wi};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> filter_lengths{G, K, C, Y, X};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_lengths{G, N, K, Ho, Wo};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_strides{
N * Hi * Wi * C, Hi* Wi* C, Wi* C, C, 1};
N * Hi * Wi * C, Hi* Wi* C, 1, Wi* C, C};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> weights_strides{
K * Y * X * C, Y* X* C, 1, X* C, C};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_strides{
N * Ho * Wo * K, Ho* Wo* K, Wo* K, K, 1};
N * Ho * Wo * K, Ho* Wo* K, 1, Wo* K, K};
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_strides{1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_dilations{1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
@@ -45,14 +47,11 @@ int main()
OutDataType,
InLayout,
WeiLayout,
OutLayout>(G,
N,
K,
C,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
OutLayout>(input_lengths,
input_strides,
filter_lengths,
weights_strides,
output_lengths,
output_strides,
conv_filter_strides,
conv_filter_dilations,
@@ -28,13 +28,15 @@ static constexpr ck::index_t Wi = 3;
static constexpr ck::index_t Do = 28;
static constexpr ck::index_t Ho = 28;
static constexpr ck::index_t Wo = 3;
static constexpr std::array<ck::index_t, NumDimSpatial> input_spatial_lengths{Di, Hi, Wi};
static constexpr std::array<ck::index_t, NumDimSpatial> filter_spatial_lengths{Z, Y, X};
static constexpr std::array<ck::index_t, NumDimSpatial> output_spatial_lengths{Do, Ho, Wo};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_lengths{G, N, C, Di, Hi, Wi};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> filter_lengths{G, K, C, Z, Y, X};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_lengths{G, N, K, Do, Ho, Wo};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_strides{
N * Di * Hi * Wi * C, Di* Hi* Wi* C, Hi* Wi* C, Wi* C, C, 1};
N * Di * Hi * Wi * C, Di* Hi* Wi* C, 1, Hi* Wi* C, Wi* C, C};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> weights_strides{
K * Z * Y * X * C, Z* Y* X* C, 1, Y* X* C, X* C, C};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_strides{
N * Do * Ho * Wo * K, Do* Ho* Wo* K, Ho* Wo* K, Wo* K, K, 1};
N * Do * Ho * Wo * K, Do* Ho* Wo* K, 1, Ho* Wo* K, Wo* K, K};
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_strides{1, 1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_dilations{1, 1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
@@ -48,14 +50,11 @@ int main()
OutDataType,
InLayout,
WeiLayout,
OutLayout>(G,
N,
K,
C,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
OutLayout>(input_lengths,
input_strides,
filter_lengths,
weights_strides,
output_lengths,
output_strides,
conv_filter_strides,
conv_filter_dilations,
@@ -28,13 +28,15 @@ static constexpr ck::index_t Wi = 3;
static constexpr ck::index_t Do = 28;
static constexpr ck::index_t Ho = 28;
static constexpr ck::index_t Wo = 3;
static constexpr std::array<ck::index_t, NumDimSpatial> input_spatial_lengths{Di, Hi, Wi};
static constexpr std::array<ck::index_t, NumDimSpatial> filter_spatial_lengths{Z, Y, X};
static constexpr std::array<ck::index_t, NumDimSpatial> output_spatial_lengths{Do, Ho, Wo};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_lengths{G, N, C, Di, Hi, Wi};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> filter_lengths{G, K, C, Z, Y, X};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_lengths{G, N, K, Do, Ho, Wo};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_strides{
N * Di * Hi * Wi * C, Di* Hi* Wi* C, Hi* Wi* C, Wi* C, C, 1};
N * Di * Hi * Wi * C, Di* Hi* Wi* C, 1, Hi* Wi* C, Wi* C, C};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> weights_strides{
K * Z * Y * X * C, Z* Y* X* C, 1, Y* X* C, X* C, C};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_strides{
N * Do * Ho * Wo * K, Do* Ho* Wo* K, Ho* Wo* K, Wo* K, K, 1};
N * Do * Ho * Wo * K, Do* Ho* Wo* K, 1, Ho* Wo* K, Wo* K, K};
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_strides{1, 1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_dilations{1, 1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
@@ -48,20 +50,16 @@ int main()
OutDataType,
InLayout,
WeiLayout,
OutLayout>(
G,
N,
K,
C,
{Di, Hi, Wi},
{Z, Y, X},
{Do, Ho, Wo},
{N * Di * Hi * Wi * C, Di * Hi * Wi * C, Hi * Wi * C, Wi * C, C, 1},
{N * Do * Ho * Wo * K, Do * Ho * Wo * K, Ho * Wo * K, Wo * K, K, 1},
{1, 1, 1},
{1, 1, 1},
{1, 1, 1},
{1, 1, 1})
OutLayout>(input_lengths,
input_strides,
filter_lengths,
weights_strides,
output_lengths,
output_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads)
? EXIT_SUCCESS
: EXIT_FAILURE;
}
@@ -191,6 +191,12 @@ int main(int argc, char* argv[])
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
SimpleDeviceMem workspace(workspace_sz);
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
}
@@ -187,6 +187,12 @@ int main(int argc, char* argv[])
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
SimpleDeviceMem workspace(workspace_sz);
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
}
@@ -16,6 +16,9 @@ using InDataType = ck::half_t;
using OutDataType = ck::half_t;
using IndexDataType = int32_t;
using InLayout = ck::tensor_layout::convolution::NDHWC;
using OutLayout = ck::tensor_layout::convolution::NDHWC;
constexpr ck::index_t InOutRank = 5;
constexpr ck::index_t WindowRank = 3;
#if 0
@@ -44,33 +47,41 @@ struct SimpleDeviceMem
int main(int argc, char* argv[])
{
ck::index_t N = 2;
ck::index_t C = 32;
ck::index_t Z = 2;
ck::index_t Y = 2;
ck::index_t X = 2;
ck::index_t Di = 30;
ck::index_t Hi = 30;
ck::index_t Wi = 30;
ck::index_t window_stride_d = 2;
ck::index_t window_stride_h = 2;
ck::index_t window_stride_w = 2;
ck::index_t in_left_pad_d = 1;
ck::index_t in_left_pad_h = 1;
ck::index_t in_left_pad_w = 1;
ck::index_t in_right_pad_d = 1;
ck::index_t in_right_pad_h = 1;
ck::index_t in_right_pad_w = 1;
ck::index_t Do = (Di + in_left_pad_d + in_right_pad_d - Z) / window_stride_d + 1;
ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Y) / window_stride_h + 1;
ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - X) / window_stride_w + 1;
ck::index_t N = 2;
ck::index_t C = 32;
ck::index_t Z = 2;
ck::index_t Y = 2;
ck::index_t X = 2;
ck::index_t Di = 30;
ck::index_t Hi = 30;
ck::index_t Wi = 30;
ck::index_t window_stride_d = 2;
ck::index_t window_stride_h = 2;
ck::index_t window_stride_w = 2;
ck::index_t window_dilation_d = 1;
ck::index_t window_dilation_h = 1;
ck::index_t window_dilation_w = 1;
ck::index_t in_left_pad_d = 1;
ck::index_t in_left_pad_h = 1;
ck::index_t in_left_pad_w = 1;
ck::index_t in_right_pad_d = 1;
ck::index_t in_right_pad_h = 1;
ck::index_t in_right_pad_w = 1;
const ck::index_t Zs = (Z - 1) * window_dilation_d + 1;
const ck::index_t Ys = (Y - 1) * window_dilation_h + 1;
const ck::index_t Xs = (X - 1) * window_dilation_w + 1;
ck::index_t Do = (Di + in_left_pad_d + in_right_pad_d - Zs) / window_stride_d + 1;
ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Ys) / window_stride_h + 1;
ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - Xs) / window_stride_w + 1;
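// With the values above: effective window extents Zs = Ys = Xs = (2 - 1) * 1 + 1 = 2, so
// Do = Ho = Wo = (30 + 1 + 1 - 2) / 2 + 1 = 16.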
// The pooling API only supports the NCDHW dimension order
std::vector<ck::index_t> in_length = {N, C, Di, Hi, Wi};
std::vector<ck::index_t> out_length = {N, C, Do, Ho, Wo};
std::vector<ck::index_t> window_spatial_lengths = {Z, Y, X};
std::vector<ck::index_t> window_strides = {window_stride_d, window_stride_h, window_stride_w};
std::vector<ck::index_t> window_strides = {window_stride_d, window_stride_h, window_stride_w};
std::vector<ck::index_t> window_dilations{
window_dilation_d, window_dilation_h, window_dilation_w};
std::vector<ck::index_t> input_left_pads = {in_left_pad_d, in_left_pad_h, in_left_pad_w};
std::vector<ck::index_t> input_right_pads = {in_right_pad_d, in_right_pad_h, in_right_pad_w};
@@ -90,6 +101,8 @@ int main(int argc, char* argv[])
InDataType,
OutDataType,
IndexDataType,
InLayout,
OutLayout,
ReduceOpId,
OutputIndex>;
@@ -122,6 +135,7 @@ int main(int argc, char* argv[])
out_tensor_stride,
out_tensor_stride,
window_strides,
window_dilations,
input_left_pads,
input_right_pads,
{2, 3, 4});
@@ -181,6 +195,7 @@ int main(int argc, char* argv[])
out_tensor_stride,
out_tensor_stride,
window_strides,
window_dilations,
input_left_pads,
input_right_pads,
{2, 3, 4});
@@ -10,14 +10,18 @@
#include "ck/tensor_operation/gpu/device/device_pool_fwd.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/pool2d_fwd.hpp"
#include "ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp"
using InDataType = ck::half_t;
using OutDataType = ck::half_t;
using IndexDataType = int32_t;
constexpr ck::index_t InOutRank = 4;
constexpr ck::index_t WindowRank = 2;
// We use pool3d to implement pool2d in this example
using InLayout = ck::tensor_layout::convolution::NDHWC;
using OutLayout = ck::tensor_layout::convolution::NDHWC;
constexpr ck::index_t InOutRank = 5;
constexpr ck::index_t WindowRank = 3;
#if 1
constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
constexpr bool OutputIndex = true;
@@ -42,31 +46,66 @@ struct SimpleDeviceMem
void* p_mem_;
};
void TransformPool2dparamToPool3d(std::vector<ck::index_t>& input_lengths,
std::vector<ck::index_t>& window_lengths,
std::vector<ck::index_t>& output_lengths,
std::vector<ck::index_t>& input_stride,
std::vector<ck::index_t>& output_stride,
std::vector<ck::index_t>& indices_stride,
std::vector<ck::index_t>& window_strides,
std::vector<ck::index_t>& window_dilations,
std::vector<ck::index_t>& input_left_pads,
std::vector<ck::index_t>& input_right_pads,
std::vector<ck::index_t>& pooling_dims)
{
// NCHW to NCDHW
input_lengths.insert(input_lengths.begin() + 2, 1);
output_lengths.insert(output_lengths.begin() + 2, 1);
input_stride.insert(input_stride.begin() + 2, 0);
output_stride.insert(output_stride.begin() + 2, 0);
indices_stride.insert(indices_stride.begin() + 2, 0);
// YX to ZYX
window_lengths.insert(window_lengths.begin(), 1);
window_strides.insert(window_strides.begin(), 0);
window_dilations.insert(window_dilations.begin(), 0);
input_left_pads.insert(input_left_pads.begin(), 0);
input_right_pads.insert(input_right_pads.begin(), 0);
pooling_dims = {2, 3, 4};
}
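// For example, with the values used in main() below, in_length {2, 32, 30, 30} becomes
// {2, 32, 1, 30, 30}, window_spatial_lengths {2, 2} becomes {1, 2, 2}, and
// pooling_dims {2, 3} becomes {2, 3, 4}.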
int main(int argc, char* argv[])
{
ck::index_t N = 2;
ck::index_t C = 32;
ck::index_t Y = 2;
ck::index_t X = 2;
ck::index_t Hi = 30;
ck::index_t Wi = 30;
ck::index_t window_stride_h = 2;
ck::index_t window_stride_w = 2;
ck::index_t in_left_pad_h = 1;
ck::index_t in_left_pad_w = 1;
ck::index_t in_right_pad_h = 1;
ck::index_t in_right_pad_w = 1;
ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Y) / window_stride_h + 1;
ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - X) / window_stride_w + 1;
ck::index_t N = 2;
ck::index_t C = 32;
ck::index_t Y = 2;
ck::index_t X = 2;
ck::index_t Hi = 30;
ck::index_t Wi = 30;
ck::index_t window_stride_h = 2;
ck::index_t window_stride_w = 2;
ck::index_t window_dilation_h = 1;
ck::index_t window_dilation_w = 1;
ck::index_t in_left_pad_h = 1;
ck::index_t in_left_pad_w = 1;
ck::index_t in_right_pad_h = 1;
ck::index_t in_right_pad_w = 1;
const ck::index_t Ys = (Y - 1) * window_dilation_h + 1;
const ck::index_t Xs = (X - 1) * window_dilation_w + 1;
ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Ys) / window_stride_h + 1;
ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - Xs) / window_stride_w + 1;
// The pooling API only supports the NCHW dimension order
std::vector<ck::index_t> in_length = {N, C, Hi, Wi};
std::vector<ck::index_t> out_length = {N, C, Ho, Wo};
std::vector<ck::index_t> window_spatial_lengths = {Y, X};
std::vector<ck::index_t> window_strides = {window_stride_h, window_stride_w};
std::vector<ck::index_t> window_dilations = {window_dilation_h, window_dilation_w};
std::vector<ck::index_t> input_left_pads = {in_left_pad_h, in_left_pad_w};
std::vector<ck::index_t> input_right_pads = {in_right_pad_h, in_right_pad_w};
std::vector<ck::index_t> pooling_dims = {2, 3};
std::size_t in_tensor_size = N * C * Hi * Wi;
std::size_t out_tensor_size = N * C * Ho * Wo;
@@ -75,6 +114,18 @@ int main(int argc, char* argv[])
std::vector<ck::index_t> in_tensor_stride = {C * Hi * Wi, 1, Wi * C, C};
std::vector<ck::index_t> out_tensor_stride = {C * Ho * Wo, 1, Wo * C, C};
TransformPool2dparamToPool3d(in_length,
window_spatial_lengths,
out_length,
in_tensor_stride,
out_tensor_stride,
out_tensor_stride,
window_strides,
window_dilations,
input_left_pads,
input_right_pads,
pooling_dims);
SimpleDeviceMem in_device_buf(sizeof(InDataType) * in_tensor_size);
SimpleDeviceMem out_device_buf(sizeof(OutDataType) * out_tensor_size);
SimpleDeviceMem out_indices_device_buf(sizeof(IndexDataType) * out_tensor_size);
@@ -84,6 +135,8 @@ int main(int argc, char* argv[])
InDataType,
OutDataType,
IndexDataType,
InLayout,
OutLayout,
ReduceOpId,
OutputIndex>;
@@ -116,9 +169,10 @@ int main(int argc, char* argv[])
out_tensor_stride,
out_tensor_stride,
window_strides,
window_dilations,
input_left_pads,
input_right_pads,
{2, 3});
pooling_dims);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
@@ -175,9 +229,10 @@ int main(int argc, char* argv[])
out_tensor_stride,
out_tensor_stride,
window_strides,
window_dilations,
input_left_pads,
input_right_pads,
{2, 3});
pooling_dims);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
add_executable(client_splitK_gemm splitK_gemm_fp16_f8.cpp)
target_link_libraries(client_splitK_gemm PRIVATE composable_kernel::device_operations)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdio>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <string>
#include <vector>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_splitk.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp"
using F8 = ck::f8_t;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
using ADataType = F8;
using BDataType = F16;
using CDataType = F16;
using ALayout = Row;
using BLayout = Col;
using CLayout = Row;
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
{
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
}
void* GetDeviceBuffer() { return p_mem_; }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
};
int main(int argc, char* argv[])
{
// GEMM shape
ck::index_t M = 3840;
ck::index_t N = 4096;
ck::index_t K = 4096;
ck::index_t StrideA = 4096;
ck::index_t StrideB = 4096;
ck::index_t StrideC = 4096;
ck::index_t KBatch = 1;
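// KBatch is the split-K factor: the K dimension is partitioned into KBatch pieces whose
// partial products are accumulated, which can improve utilization when K is large
// relative to M and N.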
if(argc == 1)
{
// use default case
}
else if(argc == 8)
{
M = std::stoi(argv[1]);
N = std::stoi(argv[2]);
K = std::stoi(argv[3]);
StrideA = std::stoi(argv[4]);
StrideB = std::stoi(argv[5]);
StrideC = std::stoi(argv[6]);
KBatch = std::stoi(argv[7]);
}
else
{
printf("arg1 to 7: M, N, K, StrideA, StrideB, StrideC, KBatch\n");
exit(0);
}
auto f_matrix_space_size =
[](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
using Layout = decltype(layout);
if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
{
return (nRow - 1) * stride + nCol;
}
else
{
return (nCol - 1) * stride + nRow;
}
};
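// For example, a row-major 3840 x 4096 matrix with stride 4096 occupies
// (3840 - 1) * 4096 + 4096 = 3840 * 4096 elements.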
SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
SimpleDeviceMem c_device_buf(sizeof(CDataType) * f_matrix_space_size(M, N, StrideC, CLayout{}));
using DeviceOp = ck::tensor_operation::device::DeviceGemmSplitK<
ALayout,
BLayout,
CLayout,
ADataType,
BDataType,
CDataType,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
const auto a_element_op = AElementOp{};
const auto b_element_op = BElementOp{};
const auto c_element_op = CElementOp{};
std::string best_op_name;
bool found = false;
int best_op_id = -1;
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
c_device_buf.GetDeviceBuffer(),
M,
N,
K,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op,
KBatch);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
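// ave_time is in ms, so flop / 1e9 / ms yields TFLOP/s and bytes / 1e6 / ms yields GB/s.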
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << op_name << std::endl;
if(tflops > best_tflops)
{
found = true;
best_op_id = i;
best_op_name = op_name;
best_tflops = tflops;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
}
else
{
std::cout << op_name << " does not support this problem" << std::endl;
}
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
// run the best instance (only when a supported instance was found)
if(found)
{
auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
c_device_buf.GetDeviceBuffer(),
M,
N,
K,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op,
KBatch);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
}
std::cout << "Done" << std::endl;
}
return 0;
}
@@ -3,31 +3,52 @@ project(ck_app)
add_compile_options(-std=c++17)
if (DTYPES)
add_definitions(-DDTYPES)
if (DTYPES MATCHES "int8")
add_definitions(-D__int8__)
add_definitions(-DDTYPES)
if (DTYPES MATCHES "int8")
add_definitions(-DCK_ENABLE_INT8)
if(NOT DEFINED CK_ENABLE_INT8)
set(CK_ENABLE_INT8 "ON")
endif()
if (DTYPES MATCHES "fp8")
add_definitions(-D__fp8__)
endif()
if (DTYPES MATCHES "fp8")
add_definitions(-DCK_ENABLE_FP8)
if(NOT DEFINED CK_ENABLE_FP8)
set(CK_ENABLE_FP8 "ON")
endif()
if (DTYPES MATCHES "fp16")
add_definitions(-D__fp16__)
endif()
if (DTYPES MATCHES "fp16")
add_definitions(-DCK_ENABLE_FP16)
if(NOT DEFINED CK_ENABLE_FP16)
set(CK_ENABLE_FP16 "ON")
endif()
if (DTYPES MATCHES "fp32")
add_definitions(-D__fp32__)
endif()
if (DTYPES MATCHES "fp32")
add_definitions(-DCK_ENABLE_FP32)
if(NOT DEFINED CK_ENABLE_FP32)
set(CK_ENABLE_FP32 "ON")
endif()
if (DTYPES MATCHES "fp64")
add_definitions(-D__fp64__)
endif()
if (DTYPES MATCHES "fp64")
add_definitions(-DCK_ENABLE_FP64)
if(NOT DEFINED CK_ENABLE_FP64)
set(CK_ENABLE_FP64 "ON")
endif()
if (DTYPES MATCHES "bf16")
add_definitions(-D__bf16__)
endif()
if (DTYPES MATCHES "bf16")
add_definitions(-DCK_ENABLE_BF16)
if(NOT DEFINED CK_ENABLE_BF16)
set(CK_ENABLE_BF16 "ON")
endif()
message("DTYPES macro set to ${DTYPES}")
endif()
message("DTYPES macro set to ${DTYPES}")
else()
add_definitions(-D__int8__ -D__fp8__ -D__fp16__ -D__fp32__ -D__fp64__ -D__bf16__)
add_definitions(-DCK_ENABLE_INT8 -DCK_ENABLE_FP8 -DCK_ENABLE_FP16 -DCK_ENABLE_FP32 -DCK_ENABLE_FP64 -DCK_ENABLE_BF16)
if(NOT DEFINED CK_ENABLE_ALL_DTYPES)
set(CK_ENABLE_ALL_DTYPES "ON")
endif()
endif()
find_package(composable_kernel 1.0.0 COMPONENTS device_operations)
find_package(composable_kernel COMPONENTS device_operations)
find_package(hip REQUIRED PATHS /opt/rocm)
message(STATUS "Build with HIP ${hip_VERSION}")
@@ -7,8 +7,8 @@ API Reference Guide
Introduction
=================
This document contains details of the APIs for the Composable Kernel (CK) library and introduces some of the key design
principles that are used to write new classes that extend CK functionality.
This document contains details of the APIs for the Composable Kernel (CK) library and introduces
some of the key design principles that are used to write new classes that extend CK functionality.
=================
Using CK API
@@ -30,8 +30,8 @@ DeviceMem
Kernels For Flashattention
---------------------------
The Flashattention algorithm is defined in :cite:t:`dao2022flashattention`. This section lists the classes that are
used in the CK GPU implementation of Flashattention.
The Flashattention algorithm is defined in :cite:t:`dao2022flashattention`. This section lists
the classes that are used in the CK GPU implementation of Flashattention.
**Gridwise classes**
@@ -2,15 +2,16 @@
Supported Primitives Guide
==========================
This document contains details of supported primitives in Composable Kernel (CK). In contrast to the API Reference
Guide, the Supported Primitives Guide is an introduction to the math which underpins the algorithms implemented in CK.
This document contains details of supported primitives in Composable Kernel (CK). In contrast to the
API Reference Guide, the Supported Primitives Guide is an introduction to the math which underpins
the algorithms implemented in CK.
------------
Softmax
------------
For vectors :math:`x^{(1)}, x^{(2)}, \ldots, x^{(T)}` of size :math:`B` we can decompose the softmax of concatenated
:math:`x = [ x^{(1)}\ | \ \ldots \ | \ x^{(T)} ]` as,
For vectors :math:`x^{(1)}, x^{(2)}, \ldots, x^{(T)}` of size :math:`B` we can decompose the
softmax of concatenated :math:`x = [ x^{(1)}\ | \ \ldots \ | \ x^{(T)} ]` as,
.. math::
:nowrap:
@@ -25,8 +26,8 @@ For vectors :math:`x^{(1)}, x^{(2)}, \ldots, x^{(T)}` of size :math:`B` we can d
where :math:`f(x^{(j)}) = \exp( x^{(j)} - m(x^{(j)}) )` is of size :math:`B` and
:math:`z(x^{(j)}) = f(x_1^{(j)})+ \ldots+ f(x_B^{(j)})` is a scalar.
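As a worked instance of this decomposition for :math:`T = 2`, with
:math:`m(x) = \max\big(m(x^{(1)}), m(x^{(2)})\big)` the two blocks combine as

.. math::
   :nowrap:

   \begin{equation}
   \mathrm{softmax}(x) = \frac{\left[\, e^{m(x^{(1)}) - m(x)} f(x^{(1)}) \ \middle| \ e^{m(x^{(2)}) - m(x)} f(x^{(2)}) \,\right]}
   {e^{m(x^{(1)}) - m(x)} z(x^{(1)}) + e^{m(x^{(2)}) - m(x)} z(x^{(2)})}.
   \end{equation}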
For a matrix :math:`X` composed of :math:`T_r \times T_c` tiles, :math:`X_{ij}`, of size :math:`B_r \times B_c` we can
compute the row-wise softmax as follows.
For a matrix :math:`X` composed of :math:`T_r \times T_c` tiles, :math:`X_{ij}`, of size
:math:`B_r \times B_c` we can compute the row-wise softmax as follows.
For :math:`j` from :math:`1` to :math:`T_c`, and :math:`i` from :math:`1` to :math:`T_r` calculate,