Merge branch 'develop' into gfx950

ef5e60f6 · illsilin · 2cc0fa26 · 5e93fa9e · ef5e60f6 · ef5e60f6
Commit ef5e60f6 authored Dec 11, 2024 by illsilin
20 changed files
--- a/example/62_convnd_activ/binary/CMakeLists.txt
+++ b/example/62_convnd_activ/binary/CMakeLists.txt
-list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)

--- a/example/62_convnd_activ/convinvscale/CMakeLists.txt
+++ b/example/62_convnd_activ/convinvscale/CMakeLists.txt
-list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)

--- a/example/62_convnd_activ/convscale/CMakeLists.txt
+++ b/example/62_convnd_activ/convscale/CMakeLists.txt
-list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)

--- a/example/62_convnd_activ/convscale/convnd_fwd_convscale_common.hpp
+++ b/example/62_convnd_activ/convscale/convnd_fwd_convscale_common.hpp
@@ -172,12 +172,13 @@ bool run_grouped_conv_fwd(bool do_verification,
    {
    case 0: break;
    case 1:
-        in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
+        // values generated: -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5
-        wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
+        in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 6});
+        wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-1.0, 1.0});
        break;
    default:
-        in.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
+        in.GenerateTensorValue(GeneratorTensor_3<InDataType>{-5.0, 5.0});
-        wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
+        wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-1.0, 1.0});
    }
    DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());

--- a/example/62_convnd_activ/convscale_add/CMakeLists.txt
+++ b/example/62_convnd_activ/convscale_add/CMakeLists.txt
-list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)

--- a/example/62_convnd_activ/convscale_reduce/CMakeLists.txt
+++ b/example/62_convnd_activ/convscale_reduce/CMakeLists.txt
-list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)

--- a/example/62_convnd_activ/convscale_relu/CMakeLists.txt
+++ b/example/62_convnd_activ/convscale_relu/CMakeLists.txt
-list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)

--- a/example/62_convnd_activ/dynamic_unary/CMakeLists.txt
+++ b/example/62_convnd_activ/dynamic_unary/CMakeLists.txt
-list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)

--- a/example/62_convnd_activ/multi_AB/CMakeLists.txt
+++ b/example/62_convnd_activ/multi_AB/CMakeLists.txt
-list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)

--- a/example/62_convnd_activ/unary/CMakeLists.txt
+++ b/example/62_convnd_activ/unary/CMakeLists.txt
-list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)

--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -94,7 +94,7 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME)
        if(FILE_NAME MATCHES "_xdl")
            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
        elseif(FILE_NAME MATCHES "_wmma")
-            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx950)
+            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
        endif()
        set_source_files_properties(${FILE_NAME} PROPERTIES LANGUAGE HIP)
        add_executable(${EXAMPLE_NAME} ${FILE_NAME})
@@ -178,7 +178,7 @@ function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME)
        if(FILE_NAME MATCHES "_xdl")
            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
        elseif(FILE_NAME MATCHES "_wmma")
-            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx950)
+            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
        endif()
        set_source_files_properties(${FILE_NAME} PROPERTIES LANGUAGE HIP)
        add_executable(${EXAMPLE_NAME} ${FILE_NAME})

--- a/example/README.md
+++ b/example/README.md
+[Back to the main page](../README.md)
+# Composable Kernel examples
\ No newline at end of file
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
@@ -247,13 +247,23 @@ float fmha_fwd_splitkv(fmha_fwd_splitkv_traits t, fmha_fwd_splitkv_args a, const
 }}
 """
-FMHA_FWD_SPLITKV_API_INNER_DISPATCH="""            {F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.has_lse == {F_lse}) && (t.do_fp8_static_quant == {F_squant}) &&
+FMHA_FWD_SPLITKV_API_INNER_DISPATCH="""            {F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.do_fp8_static_quant == {F_squant}) &&
                        ((a.block_table_ptr != nullptr) == {F_pagedkv}) && ({F_scheck}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck})) {{
-                using traits_ = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_mask}, {F_bias}, {F_lse}, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>;
+                using traits_ = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_mask}, {F_bias}, true, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>;
-                using traits2_ = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}/2, {F_bn1}/2, {F_lse}, {F_squant}, {F_spad}, {F_dvpad}>;
+                if (t.has_lse) {{
+                    if constexpr (std::is_same_v<{F_dtype}, ck_tile::fp8_t>) {{
+                        return -1;
+                    }} else {{
+                        using traits2_ = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}/2, {F_bn1}/2, true, {F_squant}, {F_spad}, {F_dvpad}>;
                        return fmha_fwd_splitkv_<traits_, traits2_>(s, a);
                    }}
+                }} else {{
+                    using traits2_ = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}/2, {F_bn1}/2, false, {F_squant}, {F_spad}, {F_dvpad}>;
+                    return fmha_fwd_splitkv_<traits_, traits2_>(s, a);
+                }}
+            }}
 """
 @dataclass
@@ -614,27 +624,26 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
        squant = 't' if dtype == 'fp8' else 'f'
        pipelines = []
        if dtype in ['fp16', 'bf16']:
-            for mask, bias, lse, pagedkv in itertools.product(get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"], ["t", "f"]):
+            for mask, bias, pagedkv in itertools.product(get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"]):
                # TODO: use async pipeline when compiler is more stable 
                if hdim == 256 or hdim in [32, 64, 128]:         ### [32, 64, 96, 128]:
                # if True:
-                    pipelines.append(Pipeline('qr', 'row', 'f', 't', 'f', 'f', bias, lse, squant, pagedkv, mask))
+                    pipelines.append(Pipeline('qr', 'row', 'f', 't', 'f', 'f', bias, 't', squant, pagedkv, mask))
-                    pipelines.append(Pipeline('qr', 'col', 'f', 't', 'f', 'f', bias, lse, squant, pagedkv, mask))
+                    pipelines.append(Pipeline('qr', 'col', 'f', 't', 'f', 'f', bias, 't', squant, pagedkv, mask))
-                    pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', bias, lse, squant, pagedkv, mask))
+                    pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', bias, 't', squant, pagedkv, mask))
-                    pipelines.append(Pipeline('qr', 'col', 't', 't', 't', 't', bias, lse, squant, pagedkv, mask))
+                    pipelines.append(Pipeline('qr', 'col', 't', 't', 't', 't', bias, 't', squant, pagedkv, mask))
                else:
-                    pipelines.append(Pipeline('qr_async', 'row', 't', 'f', 't', 't', bias, lse, squant, pagedkv, mask))
+                    pipelines.append(Pipeline('qr_async', 'row', 't', 'f', 't', 't', bias, 't', squant, pagedkv, mask))
-                    pipelines.append(Pipeline('qr_async', 'row', 't', 't', 't', 't', bias, lse, squant, pagedkv, mask))
+                    pipelines.append(Pipeline('qr_async', 'row', 't', 't', 't', 't', bias, 't', squant, pagedkv, mask))
-                    pipelines.append(Pipeline('qr_async', 'col', 't', 'f', 't', 't', bias, lse, squant, pagedkv, mask))
+                    pipelines.append(Pipeline('qr_async', 'col', 't', 'f', 't', 't', bias, 't', squant, pagedkv, mask))
-                    pipelines.append(Pipeline('qr_async', 'col', 't', 't', 't', 't', bias, lse, squant, pagedkv, mask))
+                    pipelines.append(Pipeline('qr_async', 'col', 't', 't', 't', 't', bias, 't', squant, pagedkv, mask))
                    if receipt == 1:
-                        pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', bias, lse, squant, pagedkv, mask)) # TODO: cover arbitraty hdim
+                        pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', bias, 't', squant, pagedkv, mask)) # TODO: cover arbitrary hdim
-                        pipelines.append(Pipeline('qr', 'col', 't', 'f', 't', 't', bias, lse, squant, pagedkv, mask)) # TODO: cover arbitraty hdim
+                        pipelines.append(Pipeline('qr', 'col', 't', 'f', 't', 't', bias, 't', squant, pagedkv, mask)) # TODO: cover arbitrary hdim
        elif dtype in ['fp8', 'bf8']:
-            # no need lse/paged-kv kernels
            for mask, bias in itertools.product(get_mask_map(mask_impl).keys(), BIAS_MAP.keys()):
-                pipelines.append(Pipeline('qr', 'col', 'f', 'f', 'f', 'f', bias, 'f', squant, 'f', mask))
+                pipelines.append(Pipeline('qr', 'col', 'f', 'f', 'f', 'f', bias, 't', squant, 'f', mask))
        else:
            assert False
        return pipelines
@@ -655,9 +664,6 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
                    if pipeline.F_spad != 't' or pipeline.F_skpad != 't':
                        # in group mode, spad/skpad must be true, since we can't predict if seqlen of current batch need pad or not
                        continue
-                    if pipeline.F_pagedkv == 't':
-                        # we only use batch mode kernels to handle (paged-) kvcache problems
-                        continue
                k = Kernel(F_idx=0,
                           F_hdim=hdim,
                           F_dtype=dtype,

--- a/example/ck_tile/01_fmha/fmha_bwd.hpp
+++ b/example/ck_tile/01_fmha/fmha_bwd.hpp
@@ -150,7 +150,7 @@ auto fmha_bwd_dq_dk_dv_create_kargs_and_grids(fmha_bwd_args args)
        // create group mode kernel arguments
        if constexpr(FmhaBwdDQDKDVKernel::kIsGroupMode)
        {
-            return FmhaBwdDQDKDVKernel::MakeKargs(args.q_ptr,
+            return FmhaBwdDQDKDVKernel::MakeKargsImpl(args.q_ptr,
                                                      args.k_ptr,
                                                      args.v_ptr,
                                                      args.bias_ptr,
@@ -200,7 +200,7 @@ auto fmha_bwd_dq_dk_dv_create_kargs_and_grids(fmha_bwd_args args)
        }
        else
        { // create batch mode kernel arguments
-            return FmhaBwdDQDKDVKernel::MakeKargs(args.q_ptr,
+            return FmhaBwdDQDKDVKernel::MakeKargsImpl(args.q_ptr,
                                                      args.k_ptr,
                                                      args.v_ptr,
                                                      args.bias_ptr,

--- a/example/ck_tile/01_fmha/fmha_fwd.cpp
+++ b/example/ck_tile/01_fmha/fmha_fwd.cpp
@@ -62,7 +62,7 @@ auto create_args(int argc, char* argv[])
                "-1 to choose s_knew in [1, s] randomly.")
        .insert("s_kpad",
                "-1",
-                "seqlen_k stride between 2 tokens, currently used in group-mode only\n"
+                "seqlen_k stride between 2 batches, currently used in group-mode only\n"
                "for kv-cache case, each batch [1,s,h,d]/[1,h,s,d] can have a stride\n"
                "along seqlen, instead of packed. same as xformer kv_padding")
        .insert("d", "128", "head dim for q, k")
@@ -294,7 +294,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
 #if !CK_TILE_FMHA_FWD_APPENDKV_API
    if(seqlen_knew != 0)
    {
-        std::cerr << "kvcache is not supported. ignoring the 's_knew' option" << std::endl;
+        std::cerr << "fmha_fwd_appendkv() is not enabled. ignoring the 's_knew' option"
+                  << std::endl;
        seqlen_knew = 0;
    }
 #endif
@@ -321,6 +322,13 @@ bool run(const ck_tile::ArgParser& arg_parser)
        rotary_dim = 0;
    }
 #endif
+    // to use fmha_fwd_appendkv(), make sure it's in batch mode
+    const bool need_append_kvcache = (0 < seqlen_knew || 0 < rotary_dim);
+    if(need_append_kvcache && mode == mode_enum::group)
+    {
+        std::cerr << "fmha_fwd_appendkv() will be invoked. ignoring the 'mode' option" << std::endl;
+        mode = mode_enum::batch;
+    }
    if(!(rotary_dim <= hdim_q))
    {
        std::cerr << "rotary_dim should be less than or equal to head dim for q" << std::endl;
@@ -356,22 +364,26 @@ bool run(const ck_tile::ArgParser& arg_parser)
                  << std::endl;
        use_cache_batch_idx = false;
    }
-#endif
+#else
-    if(0 < page_block_size && use_cache_batch_idx)
+    if(use_cache_batch_idx)
+    {
+        if(0 < page_block_size)
        {
            std::cerr << "paged-kvcache does not support cache_batch_idx. ignoring the "
                         "'cache_batch_idx' option"
                      << std::endl;
            use_cache_batch_idx = false;
        }
-    // the input tensor layout for kvcache is same as batch mode
+        else if(mode == mode_enum::group)
-    const bool need_append_kvcache = (0 < seqlen_knew || 0 < rotary_dim);
-    const bool use_kvcache = (need_append_kvcache || use_cache_batch_idx || 0 < page_block_size);
-    if(use_kvcache && mode != mode_enum::batch)
        {
-        std::cerr << "kvcache enabled. ignoring the 'mode' option" << std::endl;
+            std::cerr << "group mode will not use cache_batch_idx. ignoring the "
-        mode = mode_enum::batch;
+                         "'cache_batch_idx' option"
+                      << std::endl;
+            use_cache_batch_idx = false;
        }
+    }
+#endif
+    const bool use_kvcache = (need_append_kvcache || use_cache_batch_idx || 0 < page_block_size);
    auto [seqlen_qs, seqlen_ks, seqlen_kpads] =
        decode_seqlen(mode,
@@ -380,7 +392,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
                      arg_parser.get_str("s_k"),
                      arg_parser.get_str("s_kpad"),
                      /*seqlen_k_min=*/0 < seqlen_knew ? seqlen_knew : 0,
-                      use_kvcache);
+                      need_append_kvcache);
    // compute kvcache seqlen_k (before appending knew/vnew)
    auto cache_seqlen_ks = seqlen_ks;
    std::transform(cache_seqlen_ks.begin(),
@@ -741,8 +753,10 @@ bool run(const ck_tile::ArgParser& arg_parser)
    ck_tile::DeviceMem o_buf(o_host.get_element_space_size_in_bytes());
    ck_tile::DeviceMem seqstart_q(seqstart_q_host.size() * sizeof(int32_t));
    ck_tile::DeviceMem seqstart_k(seqstart_k_host.size() * sizeof(int32_t));
-    ck_tile::DeviceMem seqlen_k_buf(
+    ck_tile::DeviceMem seqlen_k_buf((mode == mode_enum::batch && use_kvcache) ||
-        use_kvcache || 0 <= seqlen_kpads[0] ? seqlen_ks.size() * sizeof(int32_t) : 0);
+                                            0 <= seqlen_kpads[0]
+                                        ? seqlen_ks.size() * sizeof(int32_t)
+                                        : 0);
    ck_tile::DeviceMem cache_seqlen_k_buf(
        need_append_kvcache ? cache_seqlen_ks.size() * sizeof(int32_t) : 0);
    ck_tile::DeviceMem rotary_cos_buf(rotary_cos_host.get_element_space_size_in_bytes());
@@ -763,7 +777,9 @@ bool run(const ck_tile::ArgParser& arg_parser)
    seqstart_q.ToDevice(seqstart_q_host.data());
    seqstart_k.ToDevice(seqlen_kpads[0] < 0 ? seqstart_k_host.data()
                                            : seqstart_k_with_padding_host.data());
-    seqlen_k_buf.ToDevice(use_kvcache || 0 <= seqlen_kpads[0] ? seqlen_ks.data() : nullptr);
+    seqlen_k_buf.ToDevice((mode == mode_enum::batch && use_kvcache) || 0 <= seqlen_kpads[0]
+                              ? seqlen_ks.data()
+                              : nullptr);
    cache_seqlen_k_buf.ToDevice(need_append_kvcache ? cache_seqlen_ks.data() : nullptr);
    rotary_cos_buf.ToDevice(rotary_cos_host.data());
    rotary_sin_buf.ToDevice(rotary_sin_host.data());
@@ -976,8 +992,9 @@ bool run(const ck_tile::ArgParser& arg_parser)
                (mode == mode_enum::group ? seqstart_q.GetDeviceBuffer() : nullptr);
            args.seqstart_k_ptr =
                (mode == mode_enum::group ? seqstart_k.GetDeviceBuffer() : nullptr);
-            args.seqlen_k_ptr =
+            args.seqlen_k_ptr = ((mode == mode_enum::batch && use_kvcache) || 0 <= k_paddings_[0]
-                (use_kvcache || 0 <= k_paddings_[0] ? seqlen_k_buf.GetDeviceBuffer() : nullptr);
+                                     ? seqlen_k_buf.GetDeviceBuffer()
+                                     : nullptr);
            args.seqlen_k     = shape_seqlen_k; // unused in group mode (or kvcache enabled)
            args.max_seqlen_q = max_seqlen_q;
@@ -1029,6 +1046,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
                    (0 < page_block_size ? block_table_buf.GetDeviceBuffer() : nullptr);
                args.batch_stride_block_table = batch_stride_block_table;
                args.page_block_size          = page_block_size;
+                args.is_gappy = false; // use 'false' for flash-attention integration
                args.cache_batch_idx =
                    (use_cache_batch_idx ? cache_batch_idx_buf.GetDeviceBuffer() : nullptr);

--- a/example/ck_tile/01_fmha/fmha_fwd.hpp
+++ b/example/ck_tile/01_fmha/fmha_fwd.hpp
@@ -165,6 +165,8 @@ struct fmha_fwd_splitkv_args
    void* block_table_ptr;
    ck_tile::index_t batch_stride_block_table; // only used if 'block_table_ptr' is not nullptr
    ck_tile::index_t page_block_size;          // only used if 'block_table_ptr' is not nullptr
+    bool is_gappy; // differentiate seqstart_k_ptr usage. only used if 'block_table_ptr' is not
+                   // nullptr.
    const void* cache_batch_idx;
@@ -173,9 +175,21 @@ struct fmha_fwd_splitkv_args
    //             seqlen_k = kargs.seqlen_k
    // group mode: seqlen_q = kargs.seqstart_q_ptr[b + 1] - kargs.seqstart_q_ptr[b]
    //             seqlen_k = kargs.seqstart_k_ptr[b + 1] - kargs.seqstart_k_ptr[b]
-    // kvcache mode (use same kernel as batch mode):
+    //                      or kargs.seqlen_k_ptr[b]
+    //
+    // batch mode (kvcache):
    //             seqlen_q = kargs.seqlen_q
+    //             seqlen_k = kargs.seqlen_k_ptr[b]
+    // group mode (kvcache):
+    //             seqlen_q = kargs.seqstart_q_ptr[b + 1] - kargs.seqstart_q_ptr[b]
+    //
+    //     when is_gappy=true:
+    //             seqlen_k = kargs.seqlen_k_ptr[b]
+    //             seqstart_k_ptr[b] now stores the local offset of each batch
+    //
+    //     when is_gappy=false:
    //             seqlen_k = kargs.seqstart_k_ptr[b + 1] - kargs.seqstart_k_ptr[b]
+    //                      or kargs.seqlen_k_ptr[b]
    const void* seqstart_q_ptr;
    const void* seqstart_k_ptr;
    const void* seqlen_k_ptr;
@@ -251,7 +265,7 @@ struct fmha_fwd_appendkv_args
    ck_tile::index_t batch_stride_block_table; // only used if 'block_table_ptr' is not nullptr
    ck_tile::index_t page_block_size;          // only used if 'block_table_ptr' is not nullptr
-    const void* cache_batch_idx;
+    const void* cache_batch_idx; // only used if block_table_ptr is nullptr -> batch mode (kvcache)
    ck_tile::index_t stride_q;
    ck_tile::index_t stride_k;
@@ -278,7 +292,7 @@ auto fmha_fwd_create_kargs_and_grids(fmha_fwd_args args)
        // create group mode kernel arguments
        if constexpr(FmhaKernel::kIsGroupMode)
        {
-            return FmhaKernel::MakeKargs(args.q_ptr,
+            return FmhaKernel::MakeKargsImpl(args.q_ptr,
                                             args.k_ptr,
                                             args.v_ptr,
                                             args.bias_ptr,
@@ -317,7 +331,7 @@ auto fmha_fwd_create_kargs_and_grids(fmha_fwd_args args)
        }
        else
        { // create batch mode kernel arguments
-            return FmhaKernel::MakeKargs(args.q_ptr,
+            return FmhaKernel::MakeKargsImpl(args.q_ptr,
                                             args.k_ptr,
                                             args.v_ptr,
                                             args.bias_ptr,
@@ -389,6 +403,10 @@ auto fmha_fwd_splitkv_create_kargs_and_grids(fmha_fwd_splitkv_args args)
                                     args.nhead_q,
                                     args.nhead_q / args.nhead_k,
                                     args.num_splits,
+                                     args.block_table_ptr,
+                                     args.batch_stride_block_table,
+                                     args.page_block_size,
+                                     args.is_gappy,
                                     args.scale_s,
                                     args.scale_p,
                                     args.stride_q,

--- a/example/ck_tile/01_fmha/utils.hpp
+++ b/example/ck_tile/01_fmha/utils.hpp
@@ -145,7 +145,7 @@ decode_seqlen(mode_enum mode,
              std::string k_val,
              std::string k_pad_val,
              ck_tile::index_t seqlen_k_min = 0,
-              bool use_kvcache              = false,
+              bool need_append_kvcache      = false,
              std::optional<unsigned> seed  = std::nullopt)
 {
 #define _S2I_(str_) static_cast<ck_tile::index_t>(std::atoi((str_).c_str()))
@@ -159,7 +159,7 @@ decode_seqlen(mode_enum mode,
            const ck_tile::index_t seqlen_k_max = (k < 0 ? q : k);
            std::vector<ck_tile::index_t> seqlen_ks(batch, seqlen_k_max);
-            if(1 < batch && use_kvcache)
+            if(1 < batch && need_append_kvcache)
            {
                // to keep the original s_k value, we always use seqlen_k_max in first batch
                randints(std::next(seqlen_ks.begin()),

--- a/example/ck_tile/03_gemm/CMakeLists.txt
+++ b/example/ck_tile/03_gemm/CMakeLists.txt
 add_executable(tile_example_gemm_basic EXCLUDE_FROM_ALL gemm_basic.cpp)
-add_executable(tile_example_gemm_mem_pipeline EXCLUDE_FROM_ALL gemm_mem_pipeline.cpp)
+add_executable(tile_example_universal_gemm EXCLUDE_FROM_ALL universal_gemm.cpp)
--- a/example/ck_tile/03_gemm/gemm_basic.cpp
+++ b/example/ck_tile/03_gemm/gemm_basic.cpp
@@ -92,6 +92,11 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
    const dim3 grids      = Kernel::GridSize(args.M, args.N, args.kbatch);
    constexpr dim3 blocks = Kernel::BlockSize();
+    if(!Kernel::IsSupportedArgument(kargs))
+    {
+        throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
+    }
    if(s.log_level_ > 0)
    {
        std::cout << "Launching kernel with args:"

--- a/example/ck_tile/03_gemm/run_gemm_example.inc
+++ b/example/ck_tile/03_gemm/run_gemm_example.inc
@@ -31,15 +31,13 @@ float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
    float ave_time = gemm_calc<ALayout, BLayout, CLayout>(
        args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat});
-    std::string op_name{"Gemm{MemBoundPipeline}"};
    std::size_t flop = std::size_t(2) * M * N * K;
    std::size_t num_byte =
        sizeof(ADataType) * M * K + sizeof(BDataType) * N * K + sizeof(CDataType) * M * N;
    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
    float gb_per_sec = num_byte / 1.E6 / ave_time;
-    std::cout << "Run " << op_name << "kernel with M =" << M << " N =" << N << " K =" << K
+    std::cout << "Run Gemm kernel with M =" << M << " N =" << N << " K =" << K
              << " StrideA =" << stride_A << " StrideB =" << stride_B << " StrideC =" << stride_C
              << " : " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
              << std::endl;
@@ -114,7 +112,6 @@ int run_gemm_example_with_layouts(int argc,
        f_host_tensor_descriptor(M, N, stride_C, CLayout{}));
    // TODO: add different init types
    ck_tile::FillUniformDistribution<ADataType>{-5.f, 5.f}(a_m_k);
    ck_tile::FillUniformDistribution<BDataType>{-5.f, 5.f}(b_k_n);
@@ -202,14 +199,16 @@ int run_gemm_example(int argc, char* argv[])
    {
        return run_gemm_example_with_layouts(argc, argv, Row{}, Col{}, Row{});
    }
-    else if(a_layout == "C" && b_layout == "C")
+    // TODO: Fixme: the layout combinations below do not work with the latest changes to
-    {
+    // GemmPipelineAGmemBGmemCRegV1DefaultPolicy.
-        return run_gemm_example_with_layouts(argc, argv, Col{}, Col{}, Row{});
+    // else if(a_layout == "C" && b_layout == "C")
-    }
+    // {
-    else if(a_layout == "C" && b_layout == "R")
+    //     return run_gemm_example_with_layouts(argc, argv, Col{}, Col{}, Row{});
-    {
+    // }
-        return run_gemm_example_with_layouts(argc, argv, Col{}, Row{}, Row{});
+    // else if(a_layout == "C" && b_layout == "R")
-    }
+    // {
+    //     return run_gemm_example_with_layouts(argc, argv, Col{}, Row{}, Row{});
+    // }
    else
    {
        throw std::runtime_error("Unsupported data layout configuration for A,B and C tensors!");