Unverified Commit 965021d2 authored by M.Emin Ozturk, committed by GitHub

Merge branch 'develop' into gemm_bf16_sk_muozturk

parents 8a34c640 7d8ea5f0
......@@ -98,11 +98,6 @@ if(DL_KERNELS)
set(CK_ENABLE_DL_KERNELS "ON")
endif()
if(INSTANCES_ONLY)
add_definitions(-DINSTANCES_ONLY)
set(CK_ENABLE_INSTANCES_ONLY "ON")
endif()
include(getopt)
# CK version file to record release version as well as git commit hash
......@@ -127,6 +122,12 @@ rocm_setup_version(VERSION ${version})
list(APPEND CMAKE_PREFIX_PATH ${CMAKE_INSTALL_PREFIX} ${CMAKE_INSTALL_PREFIX}/llvm ${CMAKE_INSTALL_PREFIX}/hip /opt/rocm /opt/rocm/llvm /opt/rocm/hip "$ENV{ROCM_PATH}" "$ENV{HIP_PATH}")
message("GPU_TARGETS= ${GPU_TARGETS}")
message("GPU_ARCHS= ${GPU_ARCHS}")
if(GPU_ARCHS)
#disable GPU_TARGETS to avoid conflicts, this needs to happen before we call hip package
unset(GPU_TARGETS CACHE)
unset(AMDGPU_TARGETS CACHE)
endif()
find_package(hip)
# No assumption that HIP kernels are launched with uniform block size for backward compatibility
......@@ -135,55 +136,38 @@ math(EXPR hip_VERSION_FLAT "(${hip_VERSION_MAJOR} * 1000 + ${hip_VERSION_MINOR})
message("hip_version_flat=${hip_VERSION_FLAT}")
message("checking which targets are supported")
#This is the list of targets to be used in case GPU_TARGETS is not set on command line
#These targets will be filtered and only supported ones will be used
#Setting GPU_TARGETS on command line will override this list
if(NOT PROFILER_ONLY)
if(NOT ENABLE_ASAN_PACKAGING)
#build CK for all supported targets
if(NOT WIN32 AND ${hip_VERSION_FLAT} LESS 600300000)
# WORKAROUND: compiler does not yet fully support gfx12 targets, need to fix version above
rocm_check_target_ids(DEFAULT_GPU_TARGETS
TARGETS "gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102")
else()
rocm_check_target_ids(DEFAULT_GPU_TARGETS
TARGETS "gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201")
endif()
#In order to build just the CK library (without tests and examples) for all supported GPU targets
#use -D GPU_ARCHS="gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
#the GPU_TARGETS flag will be reset in this case in order to avoid conflicts.
#
#In order to build CK along with all tests and examples it should be OK to set GPU_TARGETS to just 1 or 2 similar architectures.
if(NOT ENABLE_ASAN_PACKAGING)
if(NOT WIN32 AND ${hip_VERSION_FLAT} LESS 600300000)
# WORKAROUND: compiler does not yet fully support gfx12 targets, need to fix version above
set(CK_GPU_TARGETS "gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102")
else()
#build CK only for xnack-supported targets
rocm_check_target_ids(DEFAULT_GPU_TARGETS
TARGETS "gfx908:xnack+;gfx90a:xnack+;gfx940:xnack+;gfx941:xnack+;gfx942:xnack+")
set(GPU_TARGETS "${DEFAULT_GPU_TARGETS}" CACHE STRING " " FORCE)
set(CK_GPU_TARGETS "gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201")
endif()
else()
add_definitions(-DPROFILER_ONLY)
set(GPU_TARGETS "" CACHE STRING "" FORCE)
#build CK only for xnack-supported targets when using ASAN
set(CK_GPU_TARGETS "gfx908:xnack+;gfx90a:xnack+;gfx940:xnack+;gfx941:xnack+;gfx942:xnack+")
endif()
#if user set GPU_ARCHS on the cmake command line, overwrite default target list with user's list
#otherwise, if user set GPU_TARGETS, use that set of targets
if(GPU_ARCHS)
set(CK_GPU_TARGETS ${GPU_ARCHS})
else()
if(GPU_TARGETS)
message(FATAL_ERROR "For PROFILE_ONLY build, please do not set GPU_TARGETS, use GPU_ARCH = gfx90, gfx94, gfx10, gfx11 or gfx12")
endif()
if(GPU_ARCH MATCHES "gfx90")
rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx908;gfx90a")
elseif(GPU_ARCH MATCHES "gfx94")
rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx940;gfx941;gfx942")
elseif(GPU_ARCH MATCHES "gfx10")
rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx1030")
elseif(GPU_ARCH MATCHES "gfx11")
rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx1100;gfx1101;gfx1102")
elseif(GPU_ARCH MATCHES "gfx12")
rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx1200;gfx1201")
else()
message(FATAL_ERROR "For PROFILE_ONLY build, please specify GPU_ARCH as gfx90, gfx94, gfx10, gfx11 or gfx12")
set(CK_GPU_TARGETS ${GPU_TARGETS})
endif()
set(GPU_TARGETS "${DEFAULT_GPU_TARGETS}" CACHE STRING " " FORCE)
endif()
message("Supported GPU_TARGETS= ${DEFAULT_GPU_TARGETS}")
#make sure all the targets on the list are actually supported by the current compiler
rocm_check_target_ids(SUPPORTED_GPU_TARGETS
TARGETS ${CK_GPU_TARGETS})
if(GPU_TARGETS)
message("Building CK for the following targets: ${GPU_TARGETS}")
else()
message("Building CK for the default targets: ${DEFAULT_GPU_TARGETS}")
endif()
message("Building CK for the following targets: ${SUPPORTED_GPU_TARGETS}")
if (GPU_TARGETS)
if (GPU_TARGETS MATCHES "gfx9")
......@@ -557,8 +541,7 @@ ENDFOREACH()
add_custom_target(instances DEPENDS utility;${CK_DEVICE_INSTANCES} SOURCES ${INSTANCE_FILES})
add_subdirectory(library)
if(NOT DEFINED INSTANCES_ONLY)
if(NOT DEFINED PROFILER_ONLY)
if(NOT GPU_ARCHS)
rocm_package_setup_component(tests
LIBRARY_NAME composablekernel
PACKAGE_NAME tests # Prevent -static suffix on package name
......@@ -569,24 +552,18 @@ if(NOT DEFINED INSTANCES_ONLY)
PACKAGE_NAME examples
)
add_subdirectory(example)
add_subdirectory(test)
rocm_package_setup_component(profiler
LIBRARY_NAME composablekernel
PACKAGE_NAME ckprofiler
)
add_subdirectory(profiler)
else()
#When building PROFILER_ONLY, label the package with GPU_ARCH
rocm_package_setup_component(profiler
LIBRARY_NAME composablekernel
PACKAGE_NAME ckprofiler_${GPU_ARCH}
)
add_subdirectory(profiler)
endif()
if(BUILD_TESTING)
add_subdirectory(test)
endif()
endif()
if(NOT DEFINED PROFILER_ONLY AND (GPU_TARGETS MATCHES "gfx9" OR DEFINED INSTANCES_ONLY))
rocm_package_setup_component(profiler
LIBRARY_NAME composablekernel
PACKAGE_NAME ckprofiler
)
add_subdirectory(profiler)
if(GPU_TARGETS MATCHES "gfx9" OR GPU_ARCHS)
add_subdirectory(codegen)
endif()
......
......@@ -1138,8 +1138,8 @@ pipeline {
execute_args = """ cmake -D CMAKE_PREFIX_PATH=/opt/rocm \
-D CMAKE_CXX_COMPILER="${build_compiler()}" \
-D CMAKE_BUILD_TYPE=Release \
-D INSTANCES_ONLY=ON \
-DCMAKE_CXX_FLAGS=" -O3 " .. && make -j64 """
-D GPU_ARCHS="gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" \
-D CMAKE_CXX_FLAGS=" -O3 " .. && make -j64 """
}
steps{
buildHipClangJobAndReboot(setup_cmd: "", build_cmd: "", no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
......
......@@ -90,7 +90,12 @@ Docker images are available on [DockerHub](https://hub.docker.com/r/rocm/composa
```
If you don't set `GPU_TARGETS` on the cmake command line, CK is built for all GPU targets
supported by the current compiler (this may take a long time).
NOTE: If you set `GPU_TARGETS` to a list of architectures, the build only works when the
architectures are similar, e.g., `gfx908;gfx90a` or `gfx1100;gfx1101;gfx1102`. To build the
library for a list of dissimilar architectures, use the `GPU_ARCHS` build argument instead,
for example `GPU_ARCHS="gfx908;gfx1030;gfx1100;gfx942"`.
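For instance, a library-only configuration for a mixed set of architectures might look like
this (a minimal sketch; the compiler path and target list are illustrative, not prescriptive):
```
mkdir build && cd build
# Mixed architectures: pass GPU_ARCHS; the build resets GPU_TARGETS internally
# to avoid conflicts between the two flags.
cmake -D CMAKE_PREFIX_PATH=/opt/rocm \
      -D CMAKE_CXX_COMPILER=/opt/rocm/bin/amdclang++ \
      -D GPU_ARCHS="gfx908;gfx1030;gfx1100;gfx942" ..
```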
4. Build the entire CK library:
......@@ -137,10 +142,6 @@ crash. In such cases, you can reduce the number of threads to 32 by using `-j32`
Additional cmake flags can be used to significantly speed up the build:
* `INSTANCES_ONLY` (default is OFF) must be set to ON in order to build only the instances and library
while skipping all tests, examples, and profiler. This is useful in cases when you plan to use CK as a
dependency and don't plan to run any examples or tests.
* `DTYPES` (default is not set) can be set to any subset of "fp64;fp32;fp16;fp8;bf16;int8" to build
instances of select data types only. The main default data types are fp32 and fp16; you can safely skip
other data types.
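As an illustration (the value shown is an example, not a recommendation), restricting the
build to the two main default data types looks like:
```
cmake -D DTYPES="fp32;fp16" -D CMAKE_PREFIX_PATH=/opt/rocm ..
```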
......
......@@ -233,6 +233,8 @@ function(add_embed_library EMBED_NAME)
else()
target_sources(${EMBED_NAME} INTERFACE $<TARGET_OBJECTS:${INTERNAL_EMBED_LIB}>)
endif()
target_include_directories(${EMBED_NAME} INTERFACE "${EMBED_DIR}/include")
target_include_directories(${EMBED_NAME} INTERFACE
$<BUILD_INTERFACE:${EMBED_DIR}/include>
$<INSTALL_INTERFACE:include/ck>)
endfunction()
......@@ -39,6 +39,7 @@ set_target_properties(ck_host PROPERTIES
target_include_directories(ck_host PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
$<INSTALL_INTERFACE:include>
)
add_executable(ck-template-driver driver/main.cpp)
......@@ -48,6 +49,12 @@ rocm_install(
TARGETS ck_host ck_headers
EXPORT ck_hostTargets
)
rocm_install(EXPORT ck_hostTargets
FILE composable_kernelck_hostTargets.cmake
NAMESPACE composable_kernel::
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel)
rocm_install(DIRECTORY include/ck DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
add_subdirectory(test)
if(BUILD_TESTING)
add_subdirectory(test)
endif()
list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
add_subdirectory(rtc)
file(GLOB TEST_SRCS CONFIGURE_DEPENDS *.cpp)
if(NOT INSTANCES_ONLY)
# do not build the tests when we build the library for various targets
if(NOT GPU_ARCHS)
foreach(TEST_SRC ${TEST_SRCS})
set_source_files_properties(${TEST_SRC} PROPERTIES LANGUAGE HIP)
get_filename_component(BASE_NAME ${TEST_SRC} NAME_WE)
......
......@@ -45,11 +45,7 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME)
endforeach()
endif()
if(INSTANCES_ONLY)
set(EX_TARGETS ${DEFAULT_GPU_TARGETS})
else()
set(EX_TARGETS ${GPU_TARGETS})
endif()
set(EX_TARGETS ${SUPPORTED_GPU_TARGETS})
#Do not build any DL examples if DL_KERNELS not set
foreach(source IN LISTS FILE_NAME)
......@@ -147,11 +143,8 @@ function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME)
endforeach()
endif()
if(INSTANCES_ONLY)
set(EX_TARGETS ${DEFAULT_GPU_TARGETS})
else()
set(EX_TARGETS ${GPU_TARGETS})
endif()
set(EX_TARGETS ${SUPPORTED_GPU_TARGETS})
#Do not build any DL examples if DL_KERNELS not set
foreach(source IN LISTS FILE_NAME)
if(NOT DEFINED DL_KERNELS AND source MATCHES "_dl")
......
......@@ -70,8 +70,13 @@ args:
-seed random seed used for initializing input tensors. 0 for non-deterministic seed (default:11939)
-warmup number of iterations before benchmark the kernel (default:5)
-repeat number of iterations to benchmark the kernel (default:20)
-drop_seed seed for the dropout layer's random number generator (default:1)
-drop_offset offset used by the dropout layer during random number generation (default:0)
-drop_prefs whether the `drop_seed` and `drop_offset` values reside on the GPU; 0 - host, 1 - GPU (default:0)
```
Example 1: `./bin/tile_example_fmha_fwd -b=1 -h=16 -s=16384 -d=128` runs an fp16 fmha case with batch=1, nhead=16, sequence length=16384, hdim=128.
Example 2: `./bin/tile_example_fmha_fwd -b=1 -h=8 -s=16384 -d=64 -drop_prefs=1 -drop_seed=10 -drop_offset=1234` runs an fp16 fmha case with
batch=1, nhead=8, sequence length=16384, hdim=64, drop_seed=10 (read from GPU memory), drop_offset=1234 (read from GPU memory).
## support features
We are still in a rapid development stage, so more features and optimizations are coming soon.
......
......@@ -85,6 +85,9 @@ auto create_args(int argc, char* argv[])
.insert("p_drop", "0", "0~1 probability of dropout")
.insert("drop_seed", "1", "seed for random number generator")
.insert("drop_offset", "0", "offset for random number generator")
.insert("drop_prefs",
"0",
"seed and offset values are present on GPU; 0 - host, 1 - device/GPU")
.insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
.insert("warmup", "5", "number of iterations before benchmark the kernel")
.insert("repeat", "20", "number of iterations to benchmark the kernel")
......@@ -158,6 +161,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
float p_drop = arg_parser.get_float("p_drop");
uint64_t drop_seed = arg_parser.get_uint64("drop_seed");
uint64_t drop_offset = arg_parser.get_uint64("drop_offset");
bool drop_prefs = arg_parser.get_bool("drop_prefs");
if(use_dbias && bias.type != bias_enum::elementwise_bias)
{
std::cerr << "dbias only exists when bias type is elementwise" << std::endl;
......@@ -381,6 +386,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
ck_tile::DeviceMem dbias_buf(dbias_host.get_element_space_size_in_bytes());
ck_tile::DeviceMem seqstart_q(seqstart_q_host.size() * sizeof(int32_t));
ck_tile::DeviceMem seqstart_k(seqstart_k_host.size() * sizeof(int32_t));
ck_tile::DeviceMem drop_seed_buf(drop_prefs ? sizeof(uint64_t) : 0);
ck_tile::DeviceMem drop_offset_buf(drop_prefs ? sizeof(uint64_t) : 0);
ck_tile::DeviceMem alibi_slope_buf(alibi_slope_host.get_element_space_size_in_bytes());
ck_tile::DeviceMem dq_acc_buf(dq_acc_host.get_element_space_size_in_bytes());
......@@ -391,6 +398,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
do_buf.ToDevice(do_host.data());
seqstart_q.ToDevice(seqstart_q_host.data());
seqstart_k.ToDevice(seqstart_k_host.data());
drop_seed_buf.ToDevice(drop_prefs ? &drop_seed : nullptr);
drop_offset_buf.ToDevice(drop_prefs ? &drop_offset : nullptr);
alibi_slope_buf.ToDevice(alibi_slope_host.data());
// clang-format off
......@@ -472,6 +481,18 @@ bool run(const ck_tile::ArgParser& arg_parser)
const ck_tile::index_t split_stride_dq_acc =
(shape_batch * nhead * shape_seqlen_q * hdim_q);
const auto drop_seed_offset = [&]() -> decltype(fmha_bwd_args::drop_seed_offset) {
if(drop_prefs)
{
return std::make_pair(drop_seed_buf.GetDeviceBuffer(),
drop_offset_buf.GetDeviceBuffer());
}
else
{
return std::make_pair(drop_seed, drop_offset);
}
}();
return fmha_bwd_args{q_buf.GetDeviceBuffer(),
k_buf.GetDeviceBuffer(),
v_buf.GetDeviceBuffer(),
......@@ -545,7 +566,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
static_cast<ck_tile::index_t>(mask.type),
p_drop,
p_undrop,
{drop_seed, drop_offset}};
drop_seed_offset};
}();
float ave_time = fmha_bwd(fmha_traits, fmha_args, stream_config);
......
......@@ -9,7 +9,10 @@
#include "ck_tile/ops/epilogue.hpp"
#include "mask.hpp"
#include "bias.hpp"
#include <type_traits>
#include <utility>
#include <variant>
template <typename DataType>
struct FmhaBwdTypeConfig;
......@@ -135,7 +138,8 @@ struct fmha_bwd_args
ck_tile::index_t mask_type;
float p_drop;
float p_undrop;
std::tuple<uint64_t, uint64_t> drop_seed_offset;
std::variant<std::pair<uint64_t, uint64_t>, std::pair<const void*, const void*>>
drop_seed_offset;
};
template <typename FmhaBwdDQDKDVKernel>
......
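The `drop_seed_offset` member of `fmha_bwd_args` (and, below, `fmha_fwd_args`) is now a
`std::variant` over two `std::pair` alternatives: literal seed/offset values held on the host,
or `const void*` pointers into GPU memory when `-drop_prefs=1`. A minimal sketch of how a
dispatcher could branch on the active alternative (the `dispatch_dropout_rng` function is
hypothetical, not CK API):
```cpp
#include <cstdint>
#include <cstdio>
#include <utility>
#include <variant>

// Same shape as the new fmha_{bwd,fwd}_args::drop_seed_offset field.
using DropSeedOffset =
    std::variant<std::pair<uint64_t, uint64_t>,        // values resident on the host
                 std::pair<const void*, const void*>>; // pointers into GPU memory

// Hypothetical dispatcher: pick a launch path based on where the RNG state lives.
void dispatch_dropout_rng(const DropSeedOffset& dso)
{
    if(const auto* host = std::get_if<std::pair<uint64_t, uint64_t>>(&dso))
    {
        // Seed/offset known on the host: bake them into the kernel arguments.
        std::printf("host seed=%llu offset=%llu\n",
                    static_cast<unsigned long long>(host->first),
                    static_cast<unsigned long long>(host->second));
    }
    else
    {
        // Seed/offset live in device memory: forward the raw pointers so the
        // kernel can load the values at run time.
        const auto& dev = std::get<std::pair<const void*, const void*>>(dso);
        std::printf("device seed ptr=%p offset ptr=%p\n", dev.first, dev.second);
    }
}

int main() { dispatch_dropout_rng(std::make_pair(uint64_t{10}, uint64_t{1234})); }
```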
......@@ -122,6 +122,9 @@ auto create_args(int argc, char* argv[])
.insert("p_drop", "0", "0~1 probability of dropout")
.insert("drop_seed", "1", "seed for random number generator")
.insert("drop_offset", "0", "offset for random number generator")
.insert("drop_prefs",
"0",
"seed and offset values are present on GPU; 0 - host, 1 - device/GPU")
.insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
.insert(
"rotary_dim", "0", "RoPE rotary dimension. rotary_dim <= 0 means not apply RoPE at all")
......@@ -442,6 +445,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
float p_drop = arg_parser.get_float("p_drop");
uint64_t drop_seed = arg_parser.get_uint64("drop_seed");
uint64_t drop_offset = arg_parser.get_uint64("drop_offset");
bool drop_prefs = arg_parser.get_bool("drop_prefs");
if(p_drop < 0.0f || p_drop > 1.0f)
{
std::cerr << "The value of p_drop should be 0~1" << std::endl;
......@@ -756,6 +761,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
need_append_kvcache ? cache_seqlen_ks.size() * sizeof(int32_t) : 0);
ck_tile::DeviceMem rotary_cos_buf(rotary_cos_host.get_element_space_size_in_bytes());
ck_tile::DeviceMem rotary_sin_buf(rotary_sin_host.get_element_space_size_in_bytes());
ck_tile::DeviceMem drop_seed_buf(drop_prefs ? sizeof(uint64_t) : 0);
ck_tile::DeviceMem drop_offset_buf(drop_prefs ? sizeof(uint64_t) : 0);
ck_tile::DeviceMem randval_buf(randval_host.get_element_space_size_in_bytes());
ck_tile::DeviceMem alibi_slope_buf(alibi_slope_host.get_element_space_size_in_bytes());
ck_tile::DeviceMem block_table_buf(block_table_host.get_element_space_size_in_bytes());
......@@ -774,6 +781,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
cache_seqlen_k_buf.ToDevice(need_append_kvcache ? cache_seqlen_ks.data() : nullptr);
rotary_cos_buf.ToDevice(rotary_cos_host.data());
rotary_sin_buf.ToDevice(rotary_sin_host.data());
drop_seed_buf.ToDevice(drop_prefs ? &drop_seed : nullptr);
drop_offset_buf.ToDevice(drop_prefs ? &drop_offset : nullptr);
alibi_slope_buf.ToDevice(alibi_slope_host.data());
block_table_buf.ToDevice(block_table_host.data());
cache_batch_idx_buf.ToDevice(cache_batch_idx_host.data());
......@@ -1013,9 +1022,17 @@ bool run(const ck_tile::ArgParser& arg_parser)
args.nhead_stride_randval = nhead_stride_randval;
args.batch_stride_randval = batch_stride_randval;
args.p_drop = p_drop;
args.s_randval = s_randval;
args.drop_seed_offset = std::tie(drop_seed, drop_offset);
args.p_drop = p_drop;
args.s_randval = s_randval;
if(drop_prefs)
{
args.drop_seed_offset = std::make_pair(drop_seed_buf.GetDeviceBuffer(),
drop_offset_buf.GetDeviceBuffer());
}
else
{
args.drop_seed_offset = std::make_pair(drop_seed, drop_offset);
}
}
else if constexpr(std::is_same_v<fmha_fwd_splitkv_args, std::decay_t<decltype(args)>>)
{
......
......@@ -13,6 +13,8 @@
#include "rotary.hpp"
#include <type_traits>
#include <utility>
#include <variant>
template <typename DataType>
struct FmhaFwdTypeConfig;
......@@ -144,7 +146,9 @@ struct fmha_fwd_args
float p_drop;
bool s_randval;
std::tuple<uint64_t, uint64_t> drop_seed_offset;
std::variant<std::pair<uint64_t, uint64_t>, std::pair<const void*, const void*>>
drop_seed_offset;
};
struct fmha_fwd_splitkv_args
......
......@@ -35,7 +35,9 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
YDataType,
MeanDataType,
InvStdDataType,
Shape>;
Shape,
true,
true>;
using Kernel = ck_tile::Layernorm2dFwd<PipelineProblem>;
......
......@@ -97,13 +97,6 @@
#cmakedefine CK_ENABLE_DL_KERNELS @CK_ENABLE_DL_KERNELS@
#endif
//
// Instances supports in the current CK build
//
#ifndef CK_ENABLE_INSTANCES_ONLY
#cmakedefine CK_ENABLE_INSTANCES_ONLY @CK_ENABLE_INSTANCES_ONLY@
#endif
//
// CK kernels which support XDL (MI series)
//
......
......@@ -308,7 +308,7 @@ struct BlockwiseGemmXdlops_pipeline_v1_ab_scale<BlockGemmPipelineScheduler::Intr
typename vector_type<ComputeDataType,
xdlops_gemm.K1PerXdlops>::type;
xdlops_gemm.template Run(
xdlops_gemm.template Run<>(
a_thread_vec.template AsType<mfma_input_type>(),
b_thread_vec.template AsType<mfma_input_type>(),
c_thread_buf_per_scale.GetVectorTypeReference(I0));
......@@ -390,9 +390,10 @@ struct BlockwiseGemmXdlops_pipeline_v1_ab_scale<BlockGemmPipelineScheduler::Intr
using mfma_input_type =
typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
xdlops_gemm.template Run(a_thread_vec.template AsType<mfma_input_type>(),
b_thread_vec.template AsType<mfma_input_type>(),
c_thread_buf_per_scale.GetVectorTypeReference(I0));
xdlops_gemm.template Run<>(
a_thread_vec.template AsType<mfma_input_type>(),
b_thread_vec.template AsType<mfma_input_type>(),
c_thread_buf_per_scale.GetVectorTypeReference(I0));
});
static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) {
constexpr index_t c_offset =
......
......@@ -350,7 +350,7 @@ struct BlockwiseGemmXdlops_pipeline_v2_ab_scale<BlockGemmPipelineScheduler::Intr
typename vector_type<ComputeDataType,
xdlops_gemm.K1PerXdlops>::type;
xdlops_gemm.template Run(
xdlops_gemm.template Run<>(
a_thread_vec.template AsType<mfma_input_type>(),
b_thread_vec.template AsType<mfma_input_type>(),
c_thread_buf_per_scale.GetVectorTypeReference(I0));
......@@ -443,7 +443,7 @@ struct BlockwiseGemmXdlops_pipeline_v2_ab_scale<BlockGemmPipelineScheduler::Intr
typename vector_type<ComputeDataType,
xdlops_gemm.K1PerXdlops>::type;
xdlops_gemm.template Run(
xdlops_gemm.template Run<>(
a_thread_vec.template AsType<mfma_input_type>(),
b_thread_vec.template AsType<mfma_input_type>(),
c_thread_buf_per_scale.GetVectorTypeReference(I0));
......@@ -518,9 +518,10 @@ struct BlockwiseGemmXdlops_pipeline_v2_ab_scale<BlockGemmPipelineScheduler::Intr
using mfma_input_type =
typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
xdlops_gemm.template Run(a_thread_vec.template AsType<mfma_input_type>(),
b_thread_vec.template AsType<mfma_input_type>(),
c_thread_buf_per_scale.GetVectorTypeReference(I0));
xdlops_gemm.template Run<>(
a_thread_vec.template AsType<mfma_input_type>(),
b_thread_vec.template AsType<mfma_input_type>(),
c_thread_buf_per_scale.GetVectorTypeReference(I0));
});
static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) {
constexpr index_t c_offset =
......@@ -575,9 +576,10 @@ struct BlockwiseGemmXdlops_pipeline_v2_ab_scale<BlockGemmPipelineScheduler::Intr
using mfma_input_type =
typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
xdlops_gemm.template Run(a_thread_vec.template AsType<mfma_input_type>(),
b_thread_vec.template AsType<mfma_input_type>(),
c_thread_buf_per_scale.GetVectorTypeReference(I0));
xdlops_gemm.template Run<>(
a_thread_vec.template AsType<mfma_input_type>(),
b_thread_vec.template AsType<mfma_input_type>(),
c_thread_buf_per_scale.GetVectorTypeReference(I0));
});
static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) {
constexpr index_t c_offset =
......
......@@ -427,7 +427,7 @@ struct BlockwiseGemmXdlops_pipeline_v3_ab_scale<BlockGemmPipelineScheduler::Intr
typename vector_type<ComputeDataType,
xdlops_gemm.K1PerXdlops>::type;
xdlops_gemm.template Run(
xdlops_gemm.template Run<>(
a_thread_vec.template AsType<mfma_input_type>(),
b_thread_vec.template AsType<mfma_input_type>(),
c_thread_buf_per_scale.GetVectorTypeReference(I0));
......@@ -504,9 +504,10 @@ struct BlockwiseGemmXdlops_pipeline_v3_ab_scale<BlockGemmPipelineScheduler::Intr
using mfma_input_type =
typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
xdlops_gemm.template Run(a_thread_vec.template AsType<mfma_input_type>(),
b_thread_vec.template AsType<mfma_input_type>(),
c_thread_buf_per_scale.GetVectorTypeReference(I0));
xdlops_gemm.template Run<>(
a_thread_vec.template AsType<mfma_input_type>(),
b_thread_vec.template AsType<mfma_input_type>(),
c_thread_buf_per_scale.GetVectorTypeReference(I0));
});
static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) {
constexpr index_t c_offset =
......
......@@ -64,7 +64,7 @@ __global__ void
const index_t N = gemm_desc_ptr[group_id].N;
const index_t K = gemm_desc_ptr[group_id].K;
if(M * N * K == 0)
if(M == 0 || N == 0 || K == 0)
return;
const auto StrideAs = gemm_desc_ptr[group_id].StrideAs;
......
......@@ -345,7 +345,7 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
const index_t N = gemm_descs[i].N_;
const index_t K = gemm_descs[i].K_;
if(M * N * K == 0)
if(M == 0 || N == 0 || K == 0)
{
skipped_group_count_++;
continue;
......
......@@ -109,7 +109,7 @@ __global__ void
N = gemm_desc_ptr[group_id].N;
K = gemm_desc_ptr[group_id].K;
if(M * N * K == 0)
if(M == 0 || N == 0 || K == 0)
{
grid_size_grp = 0;
continue;
......
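These three hunks replace the product test `M * N * K == 0` with per-dimension checks. The
likely motivation (inferred; the commit carries no message for it): with a 32-bit `index_t`,
the product can overflow and wrap to zero for perfectly valid shapes, wrongly skipping real
work. A standalone illustration of the failure mode, assuming a 32-bit signed index type:
```cpp
#include <cstdint>
#include <cstdio>

using index_t = int32_t; // stand-in for a 32-bit index type

int main()
{
    // A valid, non-empty problem size...
    index_t M = 65536, N = 65536, K = 1;

    // ...whose product is 2^32 and wraps to 0 in 32-bit arithmetic (signed
    // overflow is UB in C++; shown only to illustrate the trap), so the old
    // `M * N * K == 0` guard could wrongly treat the group as empty.
    std::printf("M*N*K == %d\n", static_cast<int>(M * N * K));

    // The per-dimension check cannot be fooled by overflow.
    const bool empty = (M == 0 || N == 0 || K == 0);
    std::printf("empty? %s\n", empty ? "yes" : "no");
}
```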