sync from public repo

a4522ae3 · illsilin · 1f127242 · e0594d08 · a4522ae3 · a4522ae3
Commit a4522ae3 authored Nov 06, 2024 by illsilin
20 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -137,7 +137,7 @@ if(GPU_TARGETS)
 else()
    set(USER_GPU_TARGETS 0)
 endif()
-find_package(hip)
+find_package(hip REQUIRED)
 # No assumption that HIP kernels are launched with uniform block size for backward compatibility
 # SWDEV-413293 and https://reviews.llvm.org/D155213
 math(EXPR hip_VERSION_FLAT "(${hip_VERSION_MAJOR} * 1000 + ${hip_VERSION_MINOR}) * 100000 + ${hip_VERSION_PATCH}")
@@ -145,20 +145,20 @@ message("hip_version_flat=${hip_VERSION_FLAT}")

 message("checking which targets are supported")
 #In order to build just the CK library (without tests and examples) for all supported GPU targets
-#use -D GPU_ARCHS="gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" 
+#use -D GPU_ARCHS="gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
 #the GPU_TARGETS flag will be reset in this case in order to avoid conflicts.
 #
 #In order to build CK along with all tests and examples it should be OK to set GPU_TARGETS to just 1 or 2 similar architectures.
 if(NOT ENABLE_ASAN_PACKAGING)
    if(NOT WIN32 AND ${hip_VERSION_FLAT} LESS 600300000)
        # WORKAROUND: compiler does not yet fully support gfx12 targets, need to fix version above
-        set(CK_GPU_TARGETS "gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx950")
+        set(CK_GPU_TARGETS "gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102")
    else()
        set(CK_GPU_TARGETS "gfx950")
    endif()
 else()
    #build CK only for xnack-supported targets when using ASAN
-    set(CK_GPU_TARGETS "gfx908:xnack+;gfx90a:xnack+;gfx940:xnack+;gfx941:xnack+;gfx942:xnack+")
+    set(CK_GPU_TARGETS "gfx908:xnack+;gfx90a:xnack+;gfx942:xnack+")
 endif()

 #if user set GPU_ARCHS on the cmake command line, overwrite default target list with user's list
@@ -170,7 +170,10 @@ else()
        set(CK_GPU_TARGETS ${GPU_TARGETS})
    endif()
 endif()
-
+#if the user did not set GPU_TARGETS, delete whatever was set by HIP package
+if(NOT USER_GPU_TARGETS)
+    set(GPU_TARGETS "")
+endif()
 #make sure all the targets on the list are actually supported by the current compiler
 rocm_check_target_ids(SUPPORTED_GPU_TARGETS
        TARGETS ${CK_GPU_TARGETS})
@@ -187,6 +190,10 @@ if (SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx1
    add_definitions(-DCK_USE_WMMA)
    set(CK_USE_WMMA "ON")
 endif()
+# Opt-in switch for FP8 GEMM instances on targets without native FP8 support.
+option(CK_USE_FP8_ON_UNSUPPORTED_ARCH "Enable FP8 GEMM instances on older architectures" OFF)
+if(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
+    # The definition is only emitted when gfx908 or gfx90a is among the supported targets.
+    if(SUPPORTED_GPU_TARGETS MATCHES "gfx908|gfx90a")
+        add_definitions(-DCK_USE_FP8_ON_UNSUPPORTED_ARCH)
+    endif()
+endif()

 # CK config file to record supported datatypes, etc.
 configure_file(include/ck/config.h.in ${CMAKE_CURRENT_BINARY_DIR}/include/ck/config.h)
@@ -198,6 +205,13 @@ if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 500723302)
    add_compile_options(-fno-offload-uniform-block)
  endif()
 endif()
+if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 500500000) # non-Windows only; 500500000 is HIP 5.5.0 in the hip_VERSION_FLAT encoding
+  check_cxx_compiler_flag("-mllvm --lsr-drop-solution=1" HAS_LSR_DROP_SOLUTION) # probe the compiler before using the flag
+  if(HAS_LSR_DROP_SOLUTION)
+    message("Adding the lsr-drop-solution=1 compiler flag")
+    add_compile_options("SHELL: -mllvm --lsr-drop-solution=1") # SHELL: keeps "-mllvm <arg>" together as one option group
+  endif()
+endif()
 if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600140090)
  check_cxx_compiler_flag("-mllvm -enable-post-misched=0" HAS_ENABLE_POST_MISCHED)
  if(HAS_ENABLE_POST_MISCHED)
@@ -307,7 +321,6 @@ link_libraries(${OpenMP_gomp_LIBRARY})
 link_libraries(${OpenMP_pthread_LIBRARY})

 ## HIP
-find_package(HIP REQUIRED)
 # Override HIP version in config.h, if necessary.
 # The variables set by find_package() can't be overwritten,
 # therefore let's use intermediate variables.

--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -795,6 +795,10 @@ pipeline {
            name: "RUN_GROUPED_CONV_LARGE_CASES_TESTS",
            defaultValue: false,
            description: "Run the grouped conv large cases tests (default: OFF)")
+        booleanParam(
+            name: "RUN_CODEGEN_TESTS",
+            defaultValue: false,
+            description: "Run codegen tests (default: OFF)")
        booleanParam(
            name: "RUN_CK_TILE_FMHA_TESTS",
            defaultValue: false,
@@ -923,6 +927,29 @@ pipeline {
                }
            }
        }
+        stage("Run Codegen Tests")
+        {
+            parallel
+            {
+                stage("Run Codegen Tests on gfx90a")
+                {
+                    when {
+                        beforeAgent true
+                        expression { params.RUN_CODEGEN_TESTS.toBoolean() }
+                    }
+                    agent{ label rocmnode("gfx90a")}
+                    environment{
+                        setup_args = "NO_CK_BUILD"
+                        execute_args = """ CXX=/opt/rocm/llvm/bin/clang++ cmake ../codegen && \
+                                           make -j64 check"""
+                    }
+                    steps{
+                        buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
+                        cleanWs()
+                    }
+                }
+            }
+        }
        stage("Run CK_TILE_FMHA Tests")
        {
            parallel
@@ -1063,11 +1090,11 @@ pipeline {
                    agent{ label rocmnode("gfx90a") }
                    environment{
                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install \
-                                         -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" \
+                                         -DGPU_TARGETS="gfx908;gfx90a;gfx942" \
                                         -DCMAKE_CXX_FLAGS=" -O3 " """
                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
                                           cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
-                                           -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" \
+                                           -DGPU_TARGETS="gfx908;gfx90a;gfx942" \
                                           -DCMAKE_CXX_COMPILER="${build_compiler()}" \
                                           -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
                    }
@@ -1127,7 +1154,7 @@ pipeline {
                        execute_args = """ cmake -D CMAKE_PREFIX_PATH=/opt/rocm \
                                           -D CMAKE_CXX_COMPILER="${build_compiler()}" \
                                           -D CMAKE_BUILD_TYPE=Release \
-                                           -D GPU_ARCHS="gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102"  \
+                                           -D GPU_ARCHS="gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102"  \
                                           -D CMAKE_CXX_FLAGS=" -O3 " .. && make -j64 """
                    }
                    steps{

--- a/README.md
+++ b/README.md
 # Composable Kernel

+> [!NOTE]
+> The published documentation is available at [Composable Kernel](https://rocm.docs.amd.com/projects/composable_kernel/en/latest/) in an organized, easy-to-read format, with search and a table of contents. The documentation source files reside in the `docs` folder of this repository. As with all ROCm projects, the documentation is open source. For more information on contributing to the documentation, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html).
+
 The Composable Kernel (CK) library provides a programming model for writing performance-critical
 kernels for machine learning workloads across multiple architectures (GPUs, CPUs, etc.). The CK library
 uses general purpose kernel languages, such as HIP C++.
@@ -134,12 +137,11 @@ Docker images are available on [DockerHub](https://hub.docker.com/r/rocm/composa

    You can find instructions for running ckProfiler in [profiler](/profiler).

-Note the `-j` option for building with multiple threads in parallel. This speeds up the build significantly.
+Note the `-j` option for building with multiple threads in parallel, which speeds up the build significantly.
+However, `-j` launches an unlimited number of threads, which can cause the build to run out of memory and
+crash. On average, you should expect each thread to use ~2 GB of RAM.
 Depending on the number of CPU cores and the amount of RAM on your system, you may want to
-limit the number of threads. For example, if you have a 128-core CPU and 64 Gb of RAM.
-
-By default, `-j` launches one thread per CPU core, which can cause the build to run out of memory and
-crash. In such cases, you can reduce the number of threads to 32 by using `-j32`.
+limit the number of threads. For example, if you have a 128-core CPU and 128 GB of RAM, it's advisable to use `-j32`.

 Additional cmake flags can be used to significantly speed-up the build:

@@ -151,6 +153,11 @@ Additional cmake flags can be used to significantly speed-up the build:
  `batched_gemm_multi_d_dl`. These instances are useful on architectures like the NAVI2x, as most
  other platforms have faster instances, such as `xdl` or `wmma`, available.

+* `CK_USE_FP8_ON_UNSUPPORTED_ARCH` (default is OFF) must be set to ON in order to build fp8 instances,
+  such as `gemm_universal` and `gemm_multiply_multiply`, for GPU targets that do not have native
+  support for the fp8 data type, such as gfx908 or gfx90a. On architectures like the MI100/MI200,
+  these instances provide functional support only.
+
 ## Using sccache for building

 The default CK Docker images come with a pre-installed version of sccache, which supports clang

--- a/codegen/CMakeLists.txt
+++ b/codegen/CMakeLists.txt
+cmake_minimum_required(VERSION 3.16)
+project(composable_kernel_host)
+
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

 set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
@@ -5,30 +8,24 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
 set(CK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/..)

-add_compile_options(-std=c++17)
-find_package(hip)
-add_custom_target(codegen)
+find_package(ROCM)
+include(ROCMInstallTargets)
+include(ROCMTest)

-# add include directories
-include_directories(BEFORE
-            ${PROJECT_BINARY_DIR}/include
-            ${PROJECT_SOURCE_DIR}/include
-            ${PROJECT_SOURCE_DIR}/library/include
-            ${HIP_INCLUDE_DIRS}
-            )
+rocm_setup_version(VERSION 1.0)

 list(APPEND CMAKE_MODULE_PATH ${CK_ROOT}/cmake)
 include(Embed)
 file(GLOB_RECURSE KERNEL_FILES CONFIGURE_DEPENDS
    ${CK_ROOT}/include/ck/*.hpp)
-#printouts fot debug purposes
-#message(STATUS "KERNEL_FILES: ${KERNEL_FILES}")
-#message(STATUS "RELATIVE: ${CK_ROOT}/include")
+# printouts for debug purposes
+# message(STATUS "KERNEL_FILES: ${KERNEL_FILES}")
+# message(STATUS "RELATIVE: ${CK_ROOT}/include")
 add_embed_library(ck_headers ${KERNEL_FILES} RELATIVE ${CK_ROOT}/include)

-file(GLOB SOURCES CONFIGURE_DEPENDS src/*.cpp)
+add_compile_options(-std=c++17)

-##message(STATUS "SOURCE_FILES: ${SOURCES}")
+file(GLOB SOURCES CONFIGURE_DEPENDS src/*.cpp)
 # TODO: Use object library
 add_library(ck_host STATIC ${SOURCES})
 target_link_libraries(ck_host PRIVATE ck_headers)
@@ -37,24 +34,25 @@ set_target_properties(ck_host PROPERTIES
    LINKER_LANGUAGE CXX
    POSITION_INDEPENDENT_CODE ON)

-target_include_directories(ck_host PUBLIC
-    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-    $<INSTALL_INTERFACE:include>
-)
+# target_include_directories(ck_host PUBLIC
+#     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+# )

 add_executable(ck-template-driver driver/main.cpp)
 target_link_libraries(ck-template-driver ck_host)

-rocm_install(
+rocm_install_targets(
    TARGETS ck_host ck_headers
-    EXPORT ck_hostTargets
+    EXPORT ck_host_targets
+    INCLUDE include
+    PRIVATE
 )
-rocm_install(EXPORT ck_hostTargets 
-	FILE composable_kernelck_hostTargets.cmake 
+rocm_export_targets(
+    EXPORT ck_host_targets
    NAMESPACE composable_kernel::
-	DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel)
-rocm_install(DIRECTORY include/ck DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+)

 if(BUILD_TESTING)
    add_subdirectory(test)
 endif()
+
--- a/codegen/test/CMakeLists.txt
+++ b/codegen/test/CMakeLists.txt
 list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
 add_subdirectory(rtc)
 file(GLOB TEST_SRCS CONFIGURE_DEPENDS *.cpp)
-# do not build the tests when we build the library for various targets
-if(NOT GPU_ARCHS)
-  foreach(TEST_SRC ${TEST_SRCS})
-    set_source_files_properties(${TEST_SRC} PROPERTIES LANGUAGE HIP)
+
+# TODO: These tests need to be refactored to remove dependency on main ck
+# headers and device compilation.
+# Base names (source file name without extension) of the tests that get
+# hip::device and the main CK include directories in the loop below.
+set(TESTS_REQUIRE_DEVICE_COMPILE
+  grouped_conv_fwd_multiple_d_v1
+  grouped_conv_fwd_multiple_d_v2
+  grouped_conv_fwd_multiple_d_v3
+  grouped_conv_fwd_multiple_d_v4
+)
+# hip::device is linked to the tests listed above, so a missing HIP package
+# must stop configuration here instead of surfacing later as an unknown target.
+find_package(hip REQUIRED)
+
+foreach(TEST_SRC ${TEST_SRCS})
  get_filename_component(BASE_NAME ${TEST_SRC} NAME_WE)
-    add_executable(codegen_test_${BASE_NAME} ${TEST_SRC})
-    if(CK_USE_ALTERNATIVE_PYTHON)
-      target_link_options(codegen_test_${BASE_NAME} PRIVATE -lstdc++fs)
-    endif()
-    add_dependencies(codegen codegen_test_${BASE_NAME})
-    add_dependencies(tests codegen_test_${BASE_NAME})
-    add_dependencies(check codegen_test_${BASE_NAME})
-    add_test(NAME codegen_test_${BASE_NAME} COMMAND codegen_test_${BASE_NAME})
-    message("adding test codegen_test_${BASE_NAME}")
+  rocm_add_test_executable(codegen_test_${BASE_NAME} ${TEST_SRC})
  target_link_libraries(codegen_test_${BASE_NAME} ck_rtc ck_host)
-    target_include_directories(codegen_test_${BASE_NAME} PUBLIC ${CK_ROOT}/codegen/test/include)
+  target_include_directories(codegen_test_${BASE_NAME} PUBLIC include)
+  if(BASE_NAME IN_LIST TESTS_REQUIRE_DEVICE_COMPILE)
+    target_link_libraries(codegen_test_${BASE_NAME} hip::device)
    target_include_directories(codegen_test_${BASE_NAME} PUBLIC ${CK_ROOT}/include)
    target_include_directories(codegen_test_${BASE_NAME} PUBLIC ${CK_ROOT}/library/include)
-  endforeach()
-endif()
+  endif()
+endforeach()
--- a/codegen/test/common.hpp
+++ b/codegen/test/common.hpp
--- a/codegen/test/rtc/CMakeLists.txt
+++ b/codegen/test/rtc/CMakeLists.txt
+# ck_rtc links hip::host unconditionally, so fail at configure time when the
+# HIP package cannot be located.
+find_package(hip REQUIRED)
 file(GLOB RTC_SOURCES CONFIGURE_DEPENDS src/*.cpp)
 add_library(ck_rtc ${RTC_SOURCES})
 target_include_directories(ck_rtc PUBLIC include)
 target_link_libraries(ck_rtc PUBLIC hip::host)
+# NOTE(review): presumably required for the <experimental/filesystem> fallback
+# in rtc/filesystem.hpp on older standard libraries — confirm.
+target_link_libraries(ck_rtc PUBLIC -lstdc++fs)
--- a/codegen/test/rtc/include/rtc/compile_kernel.hpp
+++ b/codegen/test/rtc/include/rtc/compile_kernel.hpp
@@ -2,14 +2,14 @@
 #define GUARD_HOST_TEST_RTC_INCLUDE_RTC_COMPILE_KERNEL

 #include <rtc/kernel.hpp>
-#include <ck/filesystem.hpp>
+#include <rtc/filesystem.hpp>
 #include <string>

 namespace rtc {

 struct src_file
 {
-    CK::fs::path path;
+    fs::path path;
    std::string_view content;
 };


--- a/codegen/test/rtc/include/rtc/filesystem.hpp
+++ b/codegen/test/rtc/include/rtc/filesystem.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#ifndef GUARD_TEST_HOST_RTC_FILESYSTEM_HPP
+#define GUARD_TEST_HOST_RTC_FILESYSTEM_HPP
+
+#include <string>
+#include <string_view>
+
+// clang-format off
+#if defined(CPPCHECK) // static-analysis run: report both headers as available
+  #define RTC_HAS_FILESYSTEM 1
+  #define RTC_HAS_FILESYSTEM_TS 1
+#elif defined(_WIN32) // Windows: decide purely from the _MSC_VER value
+  #if _MSC_VER >= 1920 // <filesystem> only
+    #define RTC_HAS_FILESYSTEM 1
+    #define RTC_HAS_FILESYSTEM_TS 0
+  #elif _MSC_VER >= 1900 // <experimental/filesystem> only
+    #define RTC_HAS_FILESYSTEM 0
+    #define RTC_HAS_FILESYSTEM_TS 1
+  #else // NOTE(review): also taken when _MSC_VER is undefined (non-MSVC Windows compilers) — confirm this is intended
+    #define RTC_HAS_FILESYSTEM 0
+    #define RTC_HAS_FILESYSTEM_TS 0
+  #endif
+#elif defined(__has_include) // other platforms: probe each header independently
+  #if __has_include(<filesystem>) && __cplusplus >= 201703L // header present and compiling as C++17 or later
+    #define RTC_HAS_FILESYSTEM 1
+  #else
+    #define RTC_HAS_FILESYSTEM 0
+  #endif
+  #if __has_include(<experimental/filesystem>) && __cplusplus >= 201103L // header present and compiling as C++11 or later
+    #define RTC_HAS_FILESYSTEM_TS 1
+  #else
+    #define RTC_HAS_FILESYSTEM_TS 0
+  #endif
+#else // no way to probe for headers: report neither as available
+  #define RTC_HAS_FILESYSTEM 0
+  #define RTC_HAS_FILESYSTEM_TS 0
+#endif
+// clang-format on
+
+#if RTC_HAS_FILESYSTEM
+#include <filesystem>
+#elif RTC_HAS_FILESYSTEM_TS
+#include <experimental/filesystem>
+#else
+#error "No filesystem include available"
+#endif
+
+namespace rtc {
+
+#if RTC_HAS_FILESYSTEM
+namespace fs = ::std::filesystem;
+#elif RTC_HAS_FILESYSTEM_TS
+namespace fs = ::std::experimental::filesystem;
+#endif
+
+} // namespace rtc
+
+#endif // GUARD_TEST_HOST_RTC_FILESYSTEM_HPP
--- a/codegen/test/rtc/include/rtc/tmp_dir.hpp
+++ b/codegen/test/rtc/include/rtc/tmp_dir.hpp
@@ -2,13 +2,13 @@
 #define GUARD_HOST_TEST_RTC_INCLUDE_RTC_TMP_DIR

 #include <string>
-#include <ck/filesystem.hpp>
+#include <rtc/filesystem.hpp>

 namespace rtc {

 struct tmp_dir
 {
-    CK::fs::path path;
+    fs::path path;
    tmp_dir(const std::string& prefix = "");

    void execute(const std::string& cmd) const;

--- a/codegen/test/rtc/src/compile_kernel.cpp
+++ b/codegen/test/rtc/src/compile_kernel.cpp
-#include "rtc/hip.hpp"
+#include <rtc/hip.hpp>
 #include <rtc/compile_kernel.hpp>
 #include <rtc/tmp_dir.hpp>
 #include <stdexcept>
@@ -70,9 +70,9 @@ kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options

    for(const auto& src : srcs)
    {
-        CK::fs::path full_path   = td.path / src.path;
-        CK::fs::path parent_path = full_path.parent_path();
-        CK::fs::create_directories(parent_path);
+        fs::path full_path   = td.path / src.path;
+        fs::path parent_path = full_path.parent_path();
+        fs::create_directories(parent_path);
        write_string(full_path.string(), src.content);
        if(src.path.extension().string() == ".cpp")
        {
@@ -86,7 +86,7 @@ kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options
    td.execute(compiler() + options.flags);

    auto out_path = td.path / out;
-    if(not CK::fs::exists(out_path))
+    if(not fs::exists(out_path))
        throw std::runtime_error("Output file missing: " + out);

    auto obj = read_buffer(out_path.string());

--- a/codegen/test/rtc/src/tmp_dir.cpp
+++ b/codegen/test/rtc/src/tmp_dir.cpp
@@ -31,10 +31,10 @@ std::string unique_string(const std::string& prefix)
 }

 tmp_dir::tmp_dir(const std::string& prefix)
-    : path(CK::fs::temp_directory_path() /
+    : path(fs::temp_directory_path() /
           unique_string(prefix.empty() ? "ck-rtc" : "ck-rtc-" + prefix))
 {
-    CK::fs::create_directories(this->path);
+    fs::create_directories(this->path);
 }

 void tmp_dir::execute(const std::string& cmd) const
@@ -43,6 +43,6 @@ void tmp_dir::execute(const std::string& cmd) const
    std::system(s.c_str());
 }

-tmp_dir::~tmp_dir() { CK::fs::remove_all(this->path); }
+tmp_dir::~tmp_dir() { fs::remove_all(this->path); }

 } // namespace rtc
--- a/docs/sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
-rocm-docs-core==1.8.2
+rocm-docs-core==1.8.3
 sphinxcontrib-bibtex==2.6.3
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -103,7 +103,7 @@ requests==2.32.3
    # via
    #   pygithub
    #   sphinx
-rocm-docs-core==1.8.2
+rocm-docs-core==1.8.3
    # via -r requirements.in
 six==1.16.0
    # via pybtex

--- a/example/01_gemm/common.hpp
+++ b/example/01_gemm/common.hpp
@@ -29,9 +29,9 @@ struct ProblemSize final
    ck::index_t N = 4096;
    ck::index_t K = 4096;

-    ck::index_t StrideA = 0;
-    ck::index_t StrideB = 0;
-    ck::index_t StrideC = 0;
+    ck::index_t StrideA = -1;
+    ck::index_t StrideB = -1;
+    ck::index_t StrideC = -1;
 };

 struct ProblemSizeStreamK final
@@ -40,9 +40,9 @@ struct ProblemSizeStreamK final
    ck::index_t N = 4096;
    ck::index_t K = 4096;

-    ck::index_t StrideA = 0;
-    ck::index_t StrideB = 0;
-    ck::index_t StrideC = 0;
+    ck::index_t StrideA = -1;
+    ck::index_t StrideB = -1;
+    ck::index_t StrideC = -1;

    ck::index_t NumSKBlocks = -1;
 };
@@ -52,9 +52,9 @@ struct ProblemSizeStreamK_universal final
    ck::index_t N = 4096;
    ck::index_t K = 4096;

-    ck::index_t StrideA = 0;
-    ck::index_t StrideB = 0;
-    ck::index_t StrideC = 0;
+    ck::index_t StrideA = -1;
+    ck::index_t StrideB = -1;
+    ck::index_t StrideC = -1;

    ck::index_t Grid_size   = -1; // defaults to max occupancy
    ck::index_t Streamk_sel = 1;  // defaults to 1-tile SK
@@ -66,16 +66,17 @@ struct ProblemSizeSplitK final
    ck::index_t N = 4096;
    ck::index_t K = 4096;

-    ck::index_t StrideA = 0;
-    ck::index_t StrideB = 0;
-    ck::index_t StrideC = 0;
+    ck::index_t StrideA = -1;
+    ck::index_t StrideB = -1;
+    ck::index_t StrideC = -1;

    ck::index_t KBatch = 1;
 };

 struct ExecutionConfig final
 {
-    bool do_verification = true;
+    // 0 - no verification, 1 - CPU, 2 - GPU, 3 - CPU + GPU
+    int do_verification = 3;
    int init_method     = 2;
    bool time_kernel    = false;
 };
@@ -126,7 +127,7 @@ bool parse_cmd_args<ProblemSize>(int argc,
    }
    else
    {
-        std::cerr << "arg1: verification (0=no, 1=CPU and GPU)" << std::endl
+        std::cerr << "arg1: verification (0=no, 1=CPU, 2=GPU, 3=CPU and GPU)" << std::endl
                  << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)"
                  << std::endl
                  << "arg3: time kernel (0=no, 1=yes)" << std::endl
@@ -176,7 +177,7 @@ bool parse_cmd_args<ProblemSizeStreamK_universal>(int argc,
    else
    {
        std::cerr
-            << "arg1: verification (0=no, 1=CPU and GPU)" << std::endl
+            << "arg1: verification (0=no, 1=CPU, 2=GPU, 3=CPU and GPU)" << std::endl
            << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)" << std::endl
            << "arg3: time kernel (0=no, 1=yes)" << std::endl
            << "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC" << std::endl
@@ -225,7 +226,7 @@ bool parse_cmd_args<ProblemSizeStreamK>(int argc,
    }
    else
    {
-        std::cerr << "arg1: verification (0=no, 1=CPU and GPU)" << std::endl
+        std::cerr << "arg1: verification (0=no, 1=CPU, 2=GPU, 3=CPU and GPU)" << std::endl
                  << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)"
                  << std::endl
                  << "arg3: time kernel (0=no, 1=yes)" << std::endl
@@ -275,7 +276,7 @@ bool parse_cmd_args<ProblemSizeSplitK>(int argc,
    }
    else
    {
-        std::cerr << "arg1: verification (0=no, 1=CPU and GPU)" << std::endl
+        std::cerr << "arg1: verification (0=no, 1=CPU, 2=GPU, 3=CPU and GPU)" << std::endl
                  << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)"
                  << std::endl
                  << "arg3: time kernel (0=no, 1=yes)" << std::endl

--- a/example/01_gemm/run_gemm_example.inc
+++ b/example/01_gemm/run_gemm_example.inc
@@ -116,21 +116,21 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
        };

    auto f_get_default_stride =
-        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-            if(stride == 0)
+        [](std::size_t row, std::size_t col, ck::index_t stride, auto layout) {
+            if(stride == -1)
            {
-                // give a chance if stride is zero, return a default packed stride
+                // give a chance if stride is -1, return a default packed stride
                if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
                {
-                    return col;
+                    return static_cast<std::size_t>(col);
                }
                else
                {
-                    return row;
+                    return static_cast<std::size_t>(row);
                }
            }
            else
-                return stride;
+                return static_cast<std::size_t>(stride);
        };

    StrideA = f_get_default_stride(M, K, StrideA, ALayout{});
@@ -330,7 +330,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)

    bool pass = true;

-    if(config.do_verification)
+    if((config.do_verification == 1) || (config.do_verification == 3))
    {
        // CPU verification
        auto ref_gemm    = ReferenceGemmInstance{};
@@ -353,13 +353,16 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
 #else
        c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());

-        pass &= !ck::utils::check_err(c_m_n_device_result,
+        pass &= ck::utils::check_err(c_m_n_device_result,
                                     c_m_n_host_result,
                                     "Error: Incorrect results!",
                                     get_rtol<CDataType>(),
                                     get_atol<CDataType>());
 #endif
+    }

+    if((config.do_verification == 2) || (config.do_verification == 3))
+    {
        // GPU verification
        auto ref_gemm_gpu    = ReferenceGemmInstanceGPU{};
        auto ref_invoker_gpu = ref_gemm_gpu.MakeInvoker();
@@ -381,14 +384,14 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
        c_m_n_device_ref_buf.FromDevice(c_m_n_device_ref_result.mData.data());
        c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());

-        pass &= !ck::utils::check_err(c_m_n_device_result,
+        pass &= ck::utils::check_err(c_m_n_device_result,
                                     c_m_n_device_ref_result,
                                     "Error: Incorrect results!",
                                     get_rtol<CDataType>(),
                                     get_atol<CDataType>());
    }

-    return !pass;
+    return pass == true;
 }

 bool run_gemm_example(int argc, char* argv[])

--- a/example/01_gemm/run_gemm_example_streamk_v2.inc
+++ b/example/01_gemm/run_gemm_example_streamk_v2.inc
@@ -117,9 +117,9 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)

    auto f_get_default_stride =
        [](std::size_t row, std::size_t col, ck::index_t stride, auto layout) {
-            if(stride == 0)
+            if(stride == -1)
            {
-                // give a chance if stride is 0, return a default packed stride
+                // give a chance if stride is -1, return a default packed stride
                if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
                {
                    return static_cast<std::size_t>(col);
@@ -241,7 +241,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
    }

    bool pass = true;
-    if(config.do_verification)
+    if((config.do_verification == 1) || (config.do_verification == 3))
    {
        auto ref_gemm    = ReferenceGemmInstance{};
        auto ref_invoker = ref_gemm.MakeInvoker();

--- a/example/01_gemm/run_gemm_example_v2.inc
+++ b/example/01_gemm/run_gemm_example_v2.inc
@@ -115,21 +115,21 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
        };

    auto f_get_default_stride =
-        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-            if(stride == 0)
+        [](std::size_t row, std::size_t col, ck::index_t stride, auto layout) {
+            if(stride == -1)
            {
-                // give a chance if stride is zero, return a default packed stride
+                // give a chance if stride is -1, return a default packed stride
                if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
                {
-                    return col;
+                    return static_cast<std::size_t>(col);
                }
                else
                {
-                    return row;
+                    return static_cast<std::size_t>(row);
                }
            }
            else
-                return stride;
+                return static_cast<std::size_t>(stride);
        };

    StrideA = f_get_default_stride(M, K, StrideA, ALayout{});
@@ -228,7 +228,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
    }

    bool pass = true;
-    if(config.do_verification)
+    if((config.do_verification == 1) || (config.do_verification == 3))
    {
        auto ref_gemm    = ReferenceGemmInstance{};
        auto ref_invoker = ref_gemm.MakeInvoker();

--- a/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp
+++ b/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 /*
 Computes C_m_o = Relu(A0[m, k] * B0[n, k] + D00[m, n] + D01[mn]) * B1[n, o] + D1[m, o]
@@ -60,14 +60,14 @@ struct AddAddRelu
    {
        const ck::half_t x = c + d0 + d1;

-        ck::tensor_operation::element_wise::Relu{}.template operator()<ck::half_t>(e, x);
+        ck::tensor_operation::element_wise::Relu{}.operator()(e, x);
    }
    __host__ __device__ void
    operator()(float& e, const float& c, const ck::half_t& d0, const ck::half_t& d1) const
    {
        const float x = c + (d0 + d1);

-        ck::tensor_operation::element_wise::Relu{}.template operator()<float>(e, x);
+        ck::tensor_operation::element_wise::Relu{}.operator()(e, x);
    }
 };


--- a/example/44_elementwise_permute/CMakeLists.txt
+++ b/example/44_elementwise_permute/CMakeLists.txt
@@ -5,3 +5,4 @@ add_example_executable(example_elementwise_permute_4D_fp32_col elementwise_permu
 add_example_executable(example_elementwise_permute_4D_fp16_col elementwise_permute_4D_fp16_col.cpp)
 add_example_executable(example_elementwise_binary_4D_fp16 elementwise_binary_4D_fp16.cpp)
 add_example_executable(example_elementwise_trinary_4D_fp16 elementwise_trinary_4D_fp16.cpp)
+# Target name carries the example_ prefix, matching the other entries in this list.
+add_example_executable(example_elementwise_scale_permute_amax_2D_fp16_fp8 elementwise_scale_permute_amax_2D_fp16_fp8.cpp)