copy over fmha example

remove bwd related commands from cmakelists remove unused ops in the example; select only bf16/nodropout/nolse/batched pass validation in the example driver fork pipeline add a hardcoded score_mod fork the kernel abstract score_mod from a pipeline unhardcode score_mod and pass it as a cpp expression from codegen modify host attention impl accounting for score_mod use custom score for testing reorder score mod and scale in host verification use cmakelists as the single source of truth for score_mod function definition fix numeric mismatches run clang-format remove bwd related scripts edit test and benchmark scripts for the new example remove readme remove unused cases from smoke test re-add group-mode kernels Add pre_softmax fnctor (#1852) * Add pre_softmax fnctor * remove stray define:wq * Move op out of pipeline, adds it to refnc --------- Co-authored-by: root <root@splinter-126-wr-d1.aus.dcgpu> Co-authored-by: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> added flex_attention in Jenkins file fixing clang fixing clang space added fixed copyright errors fixed even more clangformat formatting modified jenkins fixed typo added flex attention test for gfx90a and gfx942 fixed typo fixed example name fixed example script name added perf logs for both gpu arch pipeline fixes for accuracy issues; disable pre-softmax function until its accuracy is fixed added stash and unstash for perf logs fixed typo in perf name print error message print success message hardcoded perf files names flex attention jenkins switch off flex attention jenkins switch off from settings fixed typo add context to score-mod signature

copy over fmha example
remove bwd related commands from cmakelists remove unused ops in the example; select only bf16/nodropout/nolse/batched pass validation in the example driver fork pipeline add a hardcoded score_mod fork the kernel abstract score_mod from a pipeline unhardcode score_mod and pass it as a cpp expression from codegen modify host attention impl accounting for score_mod use custom score for testing reorder score mod and scale in host verification use cmakelists as the single source of truth for score_mod function definition fix numeric mismatches run clang-format remove bwd related scripts edit test and benchmark scripts for the new example remove readme remove unused cases from smoke test re-add group-mode kernels Add pre_softmax fnctor (#1852) * Add pre_softmax fnctor * remove stray define:wq * Move op out of pipeline, adds it to refnc --------- Co-authored-by: root <root@splinter-126-wr-d1.aus.dcgpu> Co-authored-by: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> added flex_attention in Jenkins file fixing clang fixing clang space added fixed copyright errors fixed even more clangformat formatting modified jenkins fixed typo added flex attention test for gfx90a and gfx942 fixed typo fixed example name fixed example script name added perf logs for both gpu arch pipeline fixes for accuracy issues; disable pre-softmax function until its accuracy is fixed added stash and unstash for perf logs fixed typo in perf name print error message print success message hardcoded perf files names flex attention jenkins switch off flex attention jenkins switch off from settings fixed typo add context to score-mod signature
4007289a · Max Podkorytov · 8086bbe3 · 4007289a · 4007289a · 4007289a
Unverified Commit 4007289a authored Jan 22, 2025 by Max Podkorytov
20 changed files
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -349,6 +349,20 @@ def cmake_build(Map conf=[:]){
            echo "could not locate the requested artifacts: ${err.getMessage()}. will skip the stashing."
        }
    }
+    if (params.RUN_CK_TILE_FLEX_ATTENTION_TESTS){
+        try{
+            archiveArtifacts "perf_tile_flex_attn_*.log"
+            if (arch_type == 1){
+                stash includes: "perf_tile_flex_attn_gfx90a.log", name: "perf_tile_flex_attn_log_gfx90a"
+            }
+            else if (arch_type == 2){
+                stash includes: "perf_tile_flex_attn_gfx942.log", name: "perf_tile_flex_attn_log_gfx942"
+            }
+        }
+        catch(Exception err){
+            echo "could not locate the requested artifacts: ${err.getMessage()}. will skip the stashing."
+        }
+    }
    if (params.RUN_CK_TILE_GEMM_TESTS){
        try{
            archiveArtifacts "perf_tile_gemm_*.log"
@@ -663,6 +677,15 @@ def process_results(Map conf=[:]){
                            echo "could not locate the FMHA performance logs: ${err.getMessage()}."
                        }
                    }
+                    if (params.RUN_CK_TILE_FLEX_ATTENTION_TESTS){
+                        try{
+                            unstash "perf_tile_flex_attn_log_gfx90a"
+                            unstash "perf_tile_flex_attn_log_gfx942"
+                        }
+                        catch(Exception err){
+                            echo "could not locate the Flex Attention performance logs: ${err.getMessage()}."
+                        }
+                    }
                    if (params.RUN_CK_TILE_GEMM_TESTS){
                        try{
                            unstash "perf_tile_gemm_log_gfx942"
@@ -713,7 +736,7 @@ def process_results(Map conf=[:]){
 }

 //launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
-CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.3;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true
+CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.3;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_FLEX_ATTENTION_TESTS=false;RUN_CK_TILE_GEMM_TESTS=true
                                              0 21 * * * % ROCMVERSION=6.3;hipTensor_test=true;RUN_CODEGEN_TESTS=true
                                              0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
                                              0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
@@ -793,6 +816,10 @@ pipeline {
            name: "RUN_CK_TILE_FMHA_TESTS",
            defaultValue: false,
            description: "Run the ck_tile FMHA tests (default: OFF)")
+        booleanParam(
+            name: "RUN_CK_TILE_FLEX_ATTENTION_TESTS",
+            defaultValue: false,
+            description: "Run the ck_tile FLEX ATTENTION tests (default: ON)")
        booleanParam(
            name: "RUN_CK_TILE_GEMM_TESTS",
            defaultValue: true,
@@ -984,6 +1011,51 @@ pipeline {
                }
            }
        }
+        stage("Run RUN_CK_TILE_FLEX_ATTENTION_TESTS Test")
+        {
+
+            parallel
+            {   
+                stage("Run RUN_CK_TILE_FLEX_ATTENTION_TESTS Tests on gfx90a")
+                {
+                    when {
+                        beforeAgent true
+                        expression { params.RUN_CK_TILE_FLEX_ATTENTION_TESTS.toBoolean() } 
+                    }
+                    agent{ label rocmnode("gfx90a") }
+                    environment{
+                        setup_args = "NO_CK_BUILD"
+                        execute_args = """ ../script/cmake-ck-dev.sh  ../ gfx90a && \
+                                           make -j64 tile_example_flexattn_fwd && \
+                                           cd ../ &&
+                                           example/ck_tile/18_flexattn/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """
+                    }
+                    steps{
+                        buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
+                        cleanWs()
+                    }
+                }
+                stage("Run RUN_CK_TILE_FLEX_ATTENTION_TESTS Tests on gfx942")
+                {
+                    when {
+                        beforeAgent true
+                        expression { params.RUN_CK_TILE_FLEX_ATTENTION_TESTS.toBoolean() }
+                    }
+                    agent{ label rocmnode("gfx942") }
+                    environment{
+                        setup_args = "NO_CK_BUILD"
+                        execute_args = """ ../script/cmake-ck-dev.sh  ../ gfx942 && \
+                                           make -j64 tile_example_flexattn_fwd && \
+                                           cd ../ &&
+                                           example/ck_tile/18_flexattn/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """
+                    }
+                    steps{
+                        buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
+                        cleanWs()
+                    }
+                }
+            }
+        }
        stage("Run CK_TILE_GEMM Tests")
        {
            parallel

--- a/example/ck_tile/18_flexattn/CMakeLists.txt
+++ b/example/ck_tile/18_flexattn/CMakeLists.txt
+# validate user-specified fmha_fwd API list
+set(FMHA_FWD_KNOWN_APIS "fwd;fwd_splitkv;fwd_appendkv")
+set(FMHA_FWD_ENABLE_APIS "fwd" CACHE STRING
+    "semicolon-separated list of APIs to generate (${FMHA_FWD_KNOWN_APIS}) & link, or \"all\".")
+if(FMHA_FWD_ENABLE_APIS STREQUAL "all")
+  set(FMHA_FWD_ENABLE_APIS ${FMHA_FWD_KNOWN_APIS})
+endif()
+
+variable_watch(FMHA_SCORE_MOD_F)
+set(FMHA_SCORE_MOD_F [[s + static_cast<decltype(s)>((q_idx - v_idx) % 8)]])
+# set(FMHA_SCORE_MOD_F [[s]])
+
+variable_watch(FMHA_PRE_SOFTMAX_F)
+# set(FMHA_PRE_SOFTMAX_F [[static_cast<decltype(s)>(tanh(s*1.0)/1.0)]])
+set(FMHA_PRE_SOFTMAX_F [[s]])
+
+foreach(api ${FMHA_FWD_ENABLE_APIS})
+  if(NOT "${api}" IN_LIST FMHA_FWD_KNOWN_APIS)
+    message(FATAL_ERROR "${api} isn't a known api: ${FMHA_FWD_KNOWN_APIS}.")
+  endif()
+endforeach()
+
+# "fwd" is a must-have api for the fmha_fwd example, add it if not specified
+if(NOT "fwd" IN_LIST FMHA_FWD_ENABLE_APIS)
+  list(APPEND FMHA_FWD_ENABLE_APIS "fwd")
+endif()
+
+string(REPLACE ";" "," FMHA_FWD_APIS "${FMHA_FWD_ENABLE_APIS}")
+# generate a list of kernels, but not actually emit files at config sta
+execute_process(
+  COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py
+  --api ${FMHA_FWD_APIS} 
+  --list_blobs ${CMAKE_CURRENT_BINARY_DIR}/fwd_blob_list.txt
+  RESULT_VARIABLE ret
+)
+if(ret AND NOT ret EQUAL 0)
+  message( FATAL_ERROR "CK Tile FMHA FAILED to generate a list of FWD kernels via Python.")
+endif()
+
+# NOTE: for cmake, the FMHA_FWD_GEN_BLOBS files must be in the same directory
+#       as current cmake list, otherwise will not figure out the dependency properly
+file(STRINGS ${CMAKE_CURRENT_BINARY_DIR}/fwd_blob_list.txt FMHA_FWD_GEN_BLOBS)
+
+add_custom_command(
+  OUTPUT ${FMHA_FWD_GEN_BLOBS}
+  COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py
+  --api ${FMHA_FWD_APIS} 
+  --output_dir ${CMAKE_CURRENT_BINARY_DIR}
+  "--score_mod_expr=${FMHA_SCORE_MOD_F}"
+  "--pre_softmax_expr=${FMHA_PRE_SOFTMAX_F}"
+  VERBATIM
+)
+
+set(EXAMPLE_FMHA_FWD "tile_example_flexattn_fwd")
+# not using add_example_executable() to add this target, since we don't want this to have
+# to be included in "make all/install/check"
+message("adding example ${EXAMPLE_FMHA_FWD}")
+add_executable(${EXAMPLE_FMHA_FWD} EXCLUDE_FROM_ALL fmha_fwd.cpp)
+target_include_directories(${EXAMPLE_FMHA_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
+target_sources(${EXAMPLE_FMHA_FWD} PRIVATE ${FMHA_FWD_GEN_BLOBS})
+
+
+# NOTE: this is dangerous since will change the whole kernel to flush denormals
+#       WIP with compiler team for an exp2 intrinsic..., then remove this
+if(NOT DEFINED FMHA_FWD_FAST_EXP2)
+    set(FMHA_FWD_FAST_EXP2 true)
+endif()
+
+set(EXAMPLE_FMHA_FWD_COMPILE_OPTIONS)
+
+# NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations
+#       ... because they are auto-generated
+if(FMHA_FWD_FAST_EXP2)
+	list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=1 -fgpu-flush-denormals-to-zero)
+else()
+	list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=0)
+endif()
+
+# conditionally enable call to the fwd_splitkv API in fmha_fwd example
+if("fwd_splitkv" IN_LIST FMHA_FWD_ENABLE_APIS)
+  list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -DCK_TILE_FMHA_FWD_SPLITKV_API=1)
+else()
+  list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -DCK_TILE_FMHA_FWD_SPLITKV_API=0)
+endif()
+
+# conditionally enable call to the fwd_appendkv API in fmha_fwd example
+if("fwd_appendkv" IN_LIST FMHA_FWD_ENABLE_APIS)
+  list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -DCK_TILE_FMHA_FWD_APPENDKV_API=1)
+else()
+  list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -DCK_TILE_FMHA_FWD_APPENDKV_API=0)
+endif()
+
+# Allow comparing floating points directly in order to check sentinel values
+list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -Wno-float-equal)
+
+list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS "-DCK_TILE_SCORE_MOD_F=${FMHA_SCORE_MOD_F}")
+list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS "-DCK_PRE_SOFTMAX_F=${FMHA_PRE_SOFTMAX_F}")
+
+target_compile_options(${EXAMPLE_FMHA_FWD} PRIVATE ${EXAMPLE_FMHA_FWD_COMPILE_OPTIONS})
+
+# TODO: we have to turn off this global prop, otherwise the progress bar generated
+# by cmake will print too many files, execvp: /bin/sh: Argument list too long
+# however, this property may affect global
+# TODO: consider codegen a makefile by us
+set_property(GLOBAL PROPERTY RULE_MESSAGES OFF)
--- a/example/ck_tile/18_flexattn/bias.hpp
+++ b/example/ck_tile/18_flexattn/bias.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <ostream>
+#include <string>
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/flex_fmha.hpp"
+
+// keep sync with BlockAttentionBiasEnum
+enum class bias_enum
+{
+    no_bias          = 0,
+    elementwise_bias = 1,
+    alibi            = 2,
+};
+
+struct bias_info
+{
+    bias_enum type;
+    /*
+     * simple dispatch logic
+     *
+     * if type == elementwise_bias:
+     *      if rank_info == 0:
+     *           bias is 1*1*s*s
+     *      elif rank_info == 1:
+     *           bias is 1*h*s*s
+     *      elif rank_info == 2:
+     *           bias is b*h*s*s
+     *
+     * elif type == alibi:
+     *       if rank_info == 0:
+     *           alibi in 1*h
+     *       elif rank_info == 1:
+     *           alibi in b*h
+     */
+    int rank_info;
+
+    void serialize(std::ostream& os) const
+    {
+        if(type == bias_enum::no_bias)
+            os << "n";
+        else if(type == bias_enum::elementwise_bias)
+        {
+            os << "e";
+            if(rank_info != 0)
+            {
+                os << "[" << rank_info << "]";
+            }
+        }
+        else if(type == bias_enum::alibi)
+        {
+            os << "alibi";
+            if(rank_info != 0)
+            {
+                os << "[" << rank_info << "]";
+            }
+        }
+    }
+
+    static bias_info decode(std::string str)
+    {
+        bias_info info{bias_enum::no_bias, 0};
+        if(str == "0" || str == "n")
+        {
+            info.type = bias_enum::no_bias;
+        }
+        else if(str.compare(0, 1, "1") == 0 || str.compare(0, 1, "e") == 0 ||
+                str.compare(0, 11, "elementwise") == 0)
+        {
+            info.type    = bias_enum::elementwise_bias;
+            auto found_0 = str.find(':');
+            if(found_0 != std::string::npos)
+            {
+                std::string e  = str.substr(found_0 + 1);
+                info.rank_info = atoi(e.c_str());
+            }
+        }
+        else if(str.compare(0, 1, "2") == 0 || str.compare(0, 1, "a") == 0 ||
+                str.compare(0, 5, "alibi") == 0)
+        {
+            info.type    = bias_enum::alibi;
+            auto found_0 = str.find(':');
+            if(found_0 != std::string::npos)
+            {
+                std::string e  = str.substr(found_0 + 1);
+                info.rank_info = atoi(e.c_str());
+            }
+        }
+        return info;
+    }
+
+    friend std::ostream& operator<<(std::ostream& os, const bias_info& bi)
+    {
+        bi.serialize(os);
+        return os;
+    }
+};
--- a/example/ck_tile/18_flexattn/codegen/__init__.py
+++ b/example/ck_tile/18_flexattn/codegen/__init__.py
--- a/example/ck_tile/18_flexattn/codegen/cmake_config.py
+++ b/example/ck_tile/18_flexattn/codegen/cmake_config.py
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+# generate kernel instances to speed up compilation
+
+GEN_DIR = ""    # in Cmake, have to generate files in same folder
\ No newline at end of file
--- a/example/ck_tile/18_flexattn/codegen/cpp_symbol_map.py
+++ b/example/ck_tile/18_flexattn/codegen/cpp_symbol_map.py
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+# generate kernel instances to speed up compilation
+
+FWD_DTYPE_MAP = {
+    "fp16"   : "FmhaFwdFp16",
+    "bf16"   : "FmhaFwdBf16",
+    "fp8"    : "FmhaFwdFp8",
+    "fp8fp16": "FmhaFwdFp8Fp16",
+    "fp8bf16": "FmhaFwdFp8Bf16"
+}
+
+BWD_DTYPE_MAP = {
+    "fp16": "FmhaBwdFp16",
+    "bf16": "FmhaBwdBf16"
+}
+
+MASK_IMPL = {
+    "generic" : "ck_tile::GenericAttentionMask",
+    "simplified"  : "ck_tile::SimplifiedGenericAttentionMask"
+}
+
+_MASK_SIMPLIFIED_MAP = {
+    "s_no" : "ck_tile::SimplifiedGenericAttentionMask<false>",
+    "s_mask" : "ck_tile::SimplifiedGenericAttentionMask<true>",
+}
+
+_MASK_MAP = {
+    "no" : "FmhaMasks::NoMask",
+    "causal" : "FmhaMasks::CausalMask",
+    "generic" : "FmhaMasks::GenericMask"
+}
+
+def get_mask_map(mask : str):
+    if mask == "generic":
+        return _MASK_MAP
+    elif mask == "simplified":
+        return _MASK_SIMPLIFIED_MAP
+    else:
+        assert False
+        return None
+
+_MASK_CHECK_MAP = {
+    "no" : "t.mask_type == mask_enum::no_mask",
+    "causal" : "t.mask_type == mask_enum::mask_top_left || t.mask_type == mask_enum::mask_bottom_right",
+    "generic" : "t.mask_type == mask_enum::window_generic",
+}
+
+_MASK_SIMPLIFIED_CHECK_MAP = {
+    "s_no" : "t.mask_type == mask_enum::no_mask",
+    "s_mask" : "t.mask_type != mask_enum::no_mask",
+}
+
+def get_mask_check_map(mask : str):
+    if mask == "generic":
+        return _MASK_CHECK_MAP
+    elif mask == "simplified":
+        return _MASK_SIMPLIFIED_CHECK_MAP
+    else:
+        assert False
+        return None
+
+BIAS_MAP = {
+    "no" : "ck_tile::BlockAttentionBiasEnum::NO_BIAS",
+    "bias"  : "ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS",
+    "alibi" : "ck_tile::BlockAttentionBiasEnum::ALIBI"
+}
+
+# TODO: this is ugly
+BIAS_CHECK_MAP = {
+    "no" : "bias_enum::no_bias",
+    "bias"  : "bias_enum::elementwise_bias",
+    "alibi" : "bias_enum::alibi"
+}
+
+DROPOUT_MAP = {
+    "no"                        : "ck_tile::BlockDropoutBwd<false, true,  false>",
+    "dropout_wg32"              : "ck_tile::BlockDropoutBwd<true,  true,  false>",
+    "dropout_wg32_storerandval" : "ck_tile::BlockDropoutBwd<true,  true,  true >",
+    "dropout_wg16"              : "ck_tile::BlockDropoutBwd<true,  false, false>",
+    "dropout_wg16_storerandval" : "ck_tile::BlockDropoutBwd<true,  false, true >"
+}
+
+DROPOUT_CHECK_MAP = {
+    "no"                        : "t.has_dropout == false",
+    "dropout_wg32"              : "t.has_dropout == true && t.is_store_randval == false",
+    "dropout_wg32_storerandval" : "t.has_dropout == true && t.is_store_randval == true",
+    "dropout_wg16"              : "t.has_dropout == true && t.is_store_randval == false",
+    "dropout_wg16_storerandval" : "t.has_dropout == true && t.is_store_randval == true",
+}
+
+ROPE_MAP = {
+    "no" : "ck_tile::RotaryEmbeddingEnum::NONE",
+    "inter"  : "ck_tile::RotaryEmbeddingEnum::INTERLEAVED",
+    "half" : "ck_tile::RotaryEmbeddingEnum::HALF_ROTATED"
+}
+
+ROPE_CHECK_MAP = {
+    "no"    : "rope_enum::none",
+    "inter" : "rope_enum::interleaved",
+    "half"  : "rope_enum::half_rotated"
+}
+
+MODE_MAP = {
+    "batch" : "false",
+    "group" : "true"
+}
+
+LAYOUT_MAP = {
+    "row" : "true",
+    "col" : "false"
+}
+
+PIPELINE_MAP = {
+    "qr" : "ck_tile::BlockFmhaPipelineQRKSVS",
+    "qr_async" : "ck_tile::BlockFmhaPipelineQRKSVSAsync",
+}
+
+PIPELINE_ENUM_MAP = {
+    "qr" : "ck_tile::BlockFmhaPipelineEnum::QRKSVS",
+    "qr_async" : "ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC",
+    "qr_nwarp_sshuffle" : "ck_tile::BlockFmhaPipelineEnum::QRKSVS",
+}
+
+BOOL_MAP = {
+    "t" : "true",
+    "f" : "false"
+}
--- a/example/ck_tile/18_flexattn/codegen/ops/__init__.py
+++ b/example/ck_tile/18_flexattn/codegen/ops/__init__.py
--- a/example/ck_tile/18_flexattn/codegen/ops/fmha_fwd.py
+++ b/example/ck_tile/18_flexattn/codegen/ops/fmha_fwd.py
--- a/example/ck_tile/18_flexattn/fmha_fwd.cpp
+++ b/example/ck_tile/18_flexattn/fmha_fwd.cpp
--- a/example/ck_tile/18_flexattn/fmha_fwd.hpp
+++ b/example/ck_tile/18_flexattn/fmha_fwd.hpp
--- a/example/ck_tile/18_flexattn/generate.py
+++ b/example/ck_tile/18_flexattn/generate.py
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+# generate kernel instances to speed up compilation
+
+import argparse
+from enum import IntEnum
+from pathlib import Path
+import pkgutil
+import sys
+from typing import List, Optional
+
+import codegen.ops
+from codegen.cmake_config import *
+
+
+class HandlerId(IntEnum):
+    LIST_BLOBS = 0
+    WRITE_BLOBS = 1
+
+# inspect all modules under 'codegen.ops' and register API handlers 
+ops = []
+for importer, module_name, _ in pkgutil.iter_modules(codegen.ops.__path__):
+    full_module_name = '%s.%s' % (codegen.ops.__name__, module_name)
+    if full_module_name not in sys.modules:
+        ops.append(importer.find_spec(module_name).loader.load_module(module_name))
+unwanted_prefix = 'fmha_'
+handlers = dict(
+    [(op.__name__[len(unwanted_prefix):] if op.__name__.startswith(unwanted_prefix) else op.__name__,
+        (op.list_blobs, op.write_blobs)) for op in ops]
+)
+assert 0 < len(handlers)
+
+def write_blobs(output_dir: Optional[str], api_list : List[str], kernel_filter : Optional[str], receipt, mask_impl, score_mod_expr, pre_softmax_expr) -> None:
+    if output_dir is None:
+        output_dir = Path(__file__).parent
+    else:
+        output_dir = Path(output_dir) / GEN_DIR
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    for api in api_list:
+        handler = handlers[api][HandlerId.WRITE_BLOBS]
+        handler(output_dir, kernel_filter, receipt, mask_impl, score_mod_expr, pre_softmax_expr)
+
+# list all the files that will be generated
+def list_blobs(output_file : Optional[str], api_list : List[str], kernel_filter : Optional[str], receipt, mask_impl, score_mod_expr, pre_softmax_expr) -> None:
+    assert output_file is not None
+    file_path = Path(output_file)
+
+    # create an empty file / drop its contents if it exists
+    open(file_path, "w").close()
+
+    for api in api_list:
+        handler = handlers[api][HandlerId.LIST_BLOBS]
+        handler(file_path, kernel_filter, receipt, mask_impl, score_mod_expr, pre_softmax_expr)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        prog="generate",
+        description="gen API for CK fmha kernel",
+    )
+    parser.add_argument(
+        "-d",
+        "--direction", # we keep 'direction' option for backward compatibility
+        "-a",
+        "--api",
+        default='fwd',
+        required=False,
+        help="supply API(s) to generate (default: fwd). separated by comma."
+    )
+    parser.add_argument(
+        "-o",
+        "--output_dir",
+        required=False,
+        help="write all the blobs into a directory"
+    )
+    parser.add_argument(
+        "-l",
+        "--list_blobs",
+        required=False,
+        help="list all the kernels to a file"
+    )
+    # TODO: if using filter, must apply same value to output_dir and list_blobs
+    parser.add_argument(
+        "-f",
+        "--filter",
+        required=False,
+        help="filter out kernels that need to generate, using fnmatch module"
+    )
+
+    parser.add_argument(
+        "-m",
+        "--mask",
+        default="simplified",
+        required=False,
+        help="mask implementation, simplified/generic"
+    )
+
+    parser.add_argument(
+        "-r",
+        "--receipt",
+        default=0,
+        required=False,
+        help="codegen receipt. 0: generate only 8xhdim coverage\n"  + \
+             "  1: generate more instance to cover all hdim\n"  + \
+             "  2: Only generate instance for Flash attention integration"
+    )
+
+    parser.add_argument(
+        "--score_mod_expr",
+        default="s",
+        required=False,
+        help="flex attention's score mod function, a cpp expression with `s`, `b`, `h`, `q_idx`, and `v_idx` variables"
+    )
+
+    parser.add_argument(
+        "--pre_softmax_expr",
+        default="s",
+        required=False,
+        help="flex attention's pre_softmax function, a cpp expression with `s` variable"
+    )
+
+    args = parser.parse_args()
+    api_list = args.direction.split(',')
+    if args.list_blobs is not None:
+        list_blobs(args.list_blobs, api_list, args.filter, int(args.receipt), mask_impl=args.mask, score_mod_expr=args.score_mod_expr, pre_softmax_expr=args.pre_softmax_expr)
+    else:
+        write_blobs(args.output_dir, api_list, args.filter, int(args.receipt), mask_impl=args.mask, score_mod_expr=args.score_mod_expr, pre_softmax_expr=args.pre_softmax_expr)
--- a/example/ck_tile/18_flexattn/mask.hpp
+++ b/example/ck_tile/18_flexattn/mask.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <ostream>
+#include <string>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/flex_fmha.hpp"
+
+// keep this in sync with ck_tile::GenericAttentionMaskEnum
+enum class mask_enum
+{
+    no_mask = 0,
+    mask_top_left,
+    mask_bottom_right,
+    window_generic,
+};
+
+struct mask_info
+{
+    mask_enum type;
+    ck_tile::index_t y, x;
+    ck_tile::index_t left, right; // FA style SWA left/right
+
+    void serialize(std::ostream& os) const
+    {
+        if(type == mask_enum::no_mask)
+            os << "n";
+        else if(type == mask_enum::mask_top_left)
+            os << "t(" << left << ":" << right << ")";
+        else if(type == mask_enum::mask_bottom_right)
+            os << "b(" << left << ":" << right << ")";
+        else
+        {
+            os << "g(" << y << ":" << x << ")";
+        }
+    }
+    static mask_info decode(std::string str, ck_tile::index_t seqlen_q, ck_tile::index_t seqlen_k)
+    {
+        ck_tile::index_t x_total = seqlen_k;
+        ck_tile::index_t y_total = seqlen_q;
+        mask_info tmp;
+        auto found_0 = str.find(':');
+        if(found_0 != std::string::npos)
+        {
+            std::string t = str.substr(0, found_0);
+            std::string v = str.substr(found_0 + 1);
+            if(t == "xt" || t == "xb")
+            {
+                // xformer style sliding window attn from top-left
+                ck_tile::index_t window_size = atoi(v.c_str());
+                ck_tile::index_t left_size   = -1;
+                ck_tile::index_t right_size  = 0;
+                if(window_size > 0)
+                {
+                    left_size  = window_size / 2;
+                    right_size = window_size - 1 - left_size;
+                }
+                auto r = ck_tile::make_generic_attention_mask_coordinates_from_lr_window(
+                    left_size, right_size, y_total, x_total, t == "xt");
+
+                tmp.type  = t == "xt" ? mask_enum::mask_top_left : mask_enum::mask_bottom_right;
+                tmp.y     = r.at(ck_tile::number<0>{});
+                tmp.x     = r.at(ck_tile::number<1>{});
+                tmp.left  = left_size;
+                tmp.right = right_size;
+            }
+            else
+            {
+                auto found_1 = v.find(",");
+                if(found_1 == std::string::npos)
+                {
+                    printf("not supported value %s, %s\n", v.c_str(), str.c_str());
+                    assert(0);
+                }
+                tmp.type            = mask_enum::window_generic;
+                ck_tile::index_t v0 = atoi(v.substr(0, found_1).c_str());
+                ck_tile::index_t v1 = atoi(v.substr(found_1 + 1).c_str());
+                // TODO: some validation
+                if(t == "t")
+                {
+                    tmp.type = mask_enum::mask_top_left;
+                    auto r   = ck_tile::make_generic_attention_mask_coordinates_from_lr_window(
+                        v0, v1, y_total, x_total, true);
+                    tmp.y     = r.at(ck_tile::number<0>{});
+                    tmp.x     = r.at(ck_tile::number<1>{});
+                    tmp.left  = v0;
+                    tmp.right = v1;
+                }
+                else if(t == "b")
+                {
+                    tmp.type = mask_enum::mask_bottom_right;
+                    auto r   = ck_tile::make_generic_attention_mask_coordinates_from_lr_window(
+                        v0, v1, y_total, x_total, false);
+                    tmp.y     = r.at(ck_tile::number<0>{});
+                    tmp.x     = r.at(ck_tile::number<1>{});
+                    tmp.left  = v0;
+                    tmp.right = v1;
+                }
+                else if(t == "g")
+                {
+                    tmp.y     = v0;
+                    tmp.x     = v1;
+                    tmp.left  = v0; // TODO: don't use this?
+                    tmp.right = v1;
+                }
+                else
+                {
+                    printf("not supported type %s, %s\n", t.c_str(), str.c_str());
+                    assert(0);
+                }
+            }
+        }
+        else
+        {
+            auto set_causal_top_left = [&]() {
+                tmp.type  = mask_enum::mask_top_left;
+                tmp.y     = seqlen_q;
+                tmp.x     = 1;
+                tmp.left  = -1;
+                tmp.right = 0;
+            };
+            auto set_causal_bottom_right = [&]() {
+                tmp.type  = mask_enum::mask_bottom_right;
+                tmp.y     = seqlen_q;
+                tmp.x     = seqlen_k - seqlen_q + 1;
+                tmp.left  = -1;
+                tmp.right = 0;
+            };
+            if(str == "t")
+                set_causal_top_left();
+            else if(str == "b")
+                set_causal_bottom_right();
+            else
+            {
+                tmp.type = static_cast<mask_enum>(atoi(str.c_str()));
+                if(tmp.type == mask_enum::mask_top_left)
+                {
+                    set_causal_top_left();
+                }
+                else if(tmp.type == mask_enum::mask_bottom_right)
+                {
+                    set_causal_bottom_right();
+                }
+            }
+        }
+        return tmp;
+    }
+
+    friend std::ostream& operator<<(std::ostream& os, const mask_info& mi)
+    {
+        mi.serialize(os);
+        return os;
+    }
+};
--- a/example/ck_tile/18_flexattn/rotary.hpp
+++ b/example/ck_tile/18_flexattn/rotary.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/host_tensor.hpp"
+
+#include <cassert>
+#include <cmath>
+#include <functional>
+#include <iterator>
+#include <optional>
+#include <random>
+#include <tuple>
+
+// keep sync with RotaryEmbeddingEnum
+enum class rope_enum
+{
+    none         = 0,
+    interleaved  = 1,
+    half_rotated = 2,
+};
+
+template <typename DataType>
+std::tuple<ck_tile::HostTensor<DataType>, ck_tile::HostTensor<DataType>>
+generate_rotary_cos_sin(ck_tile::index_t seqlen,
+                        ck_tile::index_t rotary_dim,
+                        std::optional<unsigned> seed = std::nullopt)
+{
+    // return dummy tensors if we won't apply RoPE at all
+    if(rotary_dim <= 0)
+    {
+        ck_tile::HostTensor<DataType> dummy({1, 1});
+        return std::make_tuple(dummy, dummy);
+    }
+
+    std::mt19937 random_engine(seed.has_value() ? *seed : std::random_device{}());
+    std::uniform_real_distribution<float> generator(0.0f, 1.0f);
+
+    const ck_tile::index_t num_rows = seqlen * 2;
+    const ck_tile::index_t num_cols = rotary_dim / 2;
+
+    using std::begin, std::end;
+
+    ck_tile::HostTensor<float> angle({num_rows, num_cols});
+    std::generate(begin(angle), end(angle), [&] { return generator(random_engine) * 2 * M_PI; });
+
+    ck_tile::HostTensor<DataType> cos({num_rows, num_cols});
+    std::transform(begin(angle), end(angle), begin(cos), [](float origin_value) {
+        return ck_tile::type_convert<DataType>(std::cos(origin_value));
+    });
+
+    ck_tile::HostTensor<DataType> sin({num_rows, num_cols});
+    std::transform(begin(angle), end(angle), begin(sin), [](float origin_value) {
+        return ck_tile::type_convert<DataType>(std::sin(origin_value));
+    });
+
+    return std::make_tuple(cos, sin);
+}
+
+template <typename DataType>
+std::tuple<ck_tile::HostTensor<DataType>, ck_tile::HostTensor<DataType>>
+slice_rotary_cos_sin(const ck_tile::HostTensor<DataType>& cos,
+                     const ck_tile::HostTensor<DataType>& sin,
+                     ck_tile::index_t seqlen_offset,
+                     ck_tile::index_t seqlen)
+{
+    assert(cos.get_num_of_dimension() == 2 && sin.get_num_of_dimension() == 2);
+    assert(cos.get_length(0) == sin.get_length(0) && cos.get_length(1) == sin.get_length(1));
+
+    assert(static_cast<std::size_t>(seqlen_offset + seqlen) <= cos.get_length(0));
+
+    const ck_tile::index_t num_rows = seqlen;
+    const ck_tile::index_t num_cols = cos.get_length(1);
+
+    ck_tile::HostTensor<DataType> cos_pt({num_rows, num_cols});
+    cos_pt.ForEach([&](auto& self, auto i) { self(i) = cos(i[0] + seqlen_offset, i[1]); });
+
+    ck_tile::HostTensor<DataType> sin_pt({num_rows, num_cols});
+    sin_pt.ForEach([&](auto& self, auto i) { self(i) = sin(i[0] + seqlen_offset, i[1]); });
+
+    return std::make_tuple(cos_pt, sin_pt);
+}
--- a/example/ck_tile/18_flexattn/script/benchmark_fwd.sh
+++ b/example/ck_tile/18_flexattn/script/benchmark_fwd.sh
+#!/bin/sh
+# TODO: run this script from CK root or build directory
+EXE="$(find . -name tile_example_flexattn_fwd -type f | head -n 1)"
+VALID=0
+
+for prec in "bf16" ; do
+for perm in 0 ; do
+for hdim in 64 128 256 ; do
+
+nhead=$((2048 / $hdim))     # follow fav2 setup
+$EXE -prec=$prec -b=32 -h=$nhead -d=$hdim -s=512   -iperm=$perm -operm=$perm -kname=1 -v=$VALID ; sleep 3
+$EXE -prec=$prec -b=16 -h=$nhead -d=$hdim -s=1024  -iperm=$perm -operm=$perm -kname=1 -v=$VALID ; sleep 3
+$EXE -prec=$prec -b=8  -h=$nhead -d=$hdim -s=2048  -iperm=$perm -operm=$perm -kname=1 -v=$VALID ; sleep 3
+$EXE -prec=$prec -b=4  -h=$nhead -d=$hdim -s=4096  -iperm=$perm -operm=$perm -kname=1 -v=$VALID ; sleep 3
+$EXE -prec=$prec -b=2  -h=$nhead -d=$hdim -s=8192  -iperm=$perm -operm=$perm -kname=1 -v=$VALID ; sleep 3
+$EXE -prec=$prec -b=1  -h=$nhead -d=$hdim -s=16384 -iperm=$perm -operm=$perm -kname=1 -v=$VALID ; sleep 3
+
+done
+done
+done
--- a/example/ck_tile/18_flexattn/script/run_full_test.sh
+++ b/example/ck_tile/18_flexattn/script/run_full_test.sh
+#!/bin/bash 
+#
+# in order to run this script you'd first need to build the tile_example_fmha_fwd and tile_eaxmple_fmha_bwd executables in ../build/bin/
+#
+# run the script as "./run_full_test.sh <tag for your test environment> <branch name> <host name> <gpu_arch>
+# input arguments: 
+# environment tag  : a string describing the specifics of your test environment
+# branch name      : name of the branch in git repo (git status | grep -e 'On branch')
+# host name        : $hostname
+# gpu architecture: e.g., gfx90a, or gfx942, etc.
+
+#get the command line arguments:
+export env_type=$1
+echo 'Environment type: ' $env_type
+export branch=$2
+echo 'Branch name: ' $branch
+export host_name=$3
+echo 'Host name: ' $host_name
+export GPU_arch=$4
+echo 'GPU_arch: ' $GPU_arch
+
+function print_log_header(){
+	rm -f $1;
+	echo 'On branch ' $3 &> $1;
+	echo 'Node name: ' $4 >> $1;
+	#get GPU_arch and number of compute units from rocminfo
+	echo -n "GPU_arch: " >> $1; rocminfo | grep "Name:" | grep "gfx" >> $1;
+	rocminfo | grep "Compute Unit:" >> $1;
+	hipcc --version | grep -e 'HIP version'  >> $1;
+	echo 'Environment type: ' $2 >> $1;
+	/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> $1;
+}
+
+#run verification tests
+example/ck_tile/18_flexattn/script/smoke_test_fwd.sh
+
+#run performance benchmarks
+export fmha_fwd_log="perf_tile_flex_attn_$GPU_arch.log"
+print_log_header $fmha_fwd_log $env_type $branch $host_name
+echo "Running performance benchmark for tile_flex_attn_fwd"
+example/ck_tile/18_flexattn/script/benchmark_fwd.sh 2>&1 | tee -a $fmha_fwd_log
+echo "Finishing performance benchmark for tile_flex_attn_fwd"
+
--- a/example/ck_tile/18_flexattn/script/smoke_test_fwd.sh
+++ b/example/ck_tile/18_flexattn/script/smoke_test_fwd.sh
+#!/bin/bash
+# TODO: run this script from CK root or build directory
+EXE="$(find . -name tile_example_flexattn_fwd -type f | head -n 1)"
+KNAME=1
+
+export CK_WARMUP=0
+export CK_REPEAT=1
+
+COMMON_ARGS='-v=1 -warmup=0 -repeat=1'
+# mode=0
+# export HIP_VISIBLE_DEVICES=4
+
+TEST_SPLITKV=0
+TEST_APPENDKV=0
+# options:
+#    -s: run splitkv tests
+#    -a: run appendkv tests
+while getopts ":sa" opt; do
+    case "${opt}" in
+        s)
+            TEST_SPLITKV=1
+            ;;
+        a)
+            TEST_APPENDKV=1
+            ;;
+        *)
+            ;;
+    esac
+done
+
+run_fp16_bf16_tests() {
+    local NUM_SPLITS="1"
+    local PAGE_BLOCK_SIZE="0"
+    local CACHE_BATCH_IDX="0"
+
+    if [ $TEST_SPLITKV -eq 1 ] ; then
+        NUM_SPLITS="$NUM_SPLITS 2 3"
+        PAGE_BLOCK_SIZE="$PAGE_BLOCK_SIZE 128"
+        CACHE_BATCH_IDX="$CACHE_BATCH_IDX 1"
+    fi
+
+    for prec in "bf16" ; do
+    for mode in 0 1; do
+    for perm in 0 ; do
+    for vlayout in "r" ; do
+    for hdim in 32 64 128 256 ; do
+    for lse in 0 ; do
+    for bias in "n" ; do
+    for p_drop in 0.0 ; do
+    for num_splits in $NUM_SPLITS ; do
+    for page_block_size in $PAGE_BLOCK_SIZE ; do
+    for cache_batch_idx in $CACHE_BATCH_IDX ; do
+
+    # $EXE -prec=$prec -mode=$mode -b=1 -h=1 -d=$hdim -s=1024 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -vlayout=$vlayout -num_splits=$num_splits -page_block_size=$page_block_size -kname=$KNAME $COMMON_ARGS  
+    $EXE -prec=$prec -mode=$mode -b=2 -h=2 -h_k=1 -d=16, -d_v=$hdim -s=55 -s_k=256 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -vlayout=$vlayout -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
+    $EXE -prec=$prec -mode=$mode -b=1 -h=3 -d=$hdim -s=100 -s_k=51 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -vlayout=$vlayout -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
+    $EXE -prec=$prec -mode=$mode -b=2 -h=1 -d=16 -d_v=$hdim -s=99 -s_k=256 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=1 -vlayout=$vlayout -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
+    $EXE -prec=$prec -mode=$mode -b=1 -h=2 -h_k=1 -d=$hdim -s=1024 -s_k=256 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=2 -vlayout=$vlayout -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
+    $EXE -prec=$prec -mode=$mode -b=2 -h=1 -d=$hdim -d_v=24 -s=3 -s_k=99 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=2 -vlayout=$vlayout -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
+    $EXE -prec=$prec -mode=$mode -b=3 -h=2 -h_k=1 -d=$hdim -s=200 -s_k=520 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=t:128,30 -vlayout=$vlayout -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
+    $EXE -prec=$prec -mode=$mode -b=2 -h=1 -d=$hdim -s=99 -s_k=32 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=b:4,35 -vlayout=$vlayout -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
+    $EXE -prec=$prec -mode=$mode -b=1 -h=2 -h_k=1 -d=$hdim -s=33 -s_k=0 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=2 -vlayout=$vlayout -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
+    $EXE -prec=$prec -mode=$mode -b=1 -h=2 -h_k=1 -d=$hdim -s=1 -s_k=10 -s_kpad=32 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=2 -vlayout=$vlayout -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
+
+    done ; done ; done ; done ; done
+    done ; done ; done ; done ; done
+    done ;
+}
+
+run_fp8_tests() {
+    for perm in 0 1 ; do
+    for bias in "n" "e" "a" ; do
+    for b in 1 2 ; do
+    for hdim in 64 128 256 ; do
+    
+    $EXE -prec=fp8 -init=3 -b=$b -h=1 -d=128 -s=128 -bias=$bias -iperm=$perm -operm=$perm -vlayout=c -squant=1 -kname=$KNAME $COMMON_ARGS  
+
+    done ; done ; done ; done
+}
+
+run_fp16_appendkv_tests() {
+    for s in $(seq 63 1 65) ; do
+    for s_k in 65 129 ; do
+    for s_knew in 0 64 $s_k ; do
+    for hdim in 32 64 128 256 ; do
+    for ri in 0 1 ; do
+    for rdim in 0 16 32 $hdim ; do
+    for page_block_size in 0 128 ; do
+    for cache_batch_idx in 0 1 ; do
+
+    $EXE -prec=fp16 -b=3 -h=3 -d=$hdim -s=$s -s_k=$s_k -s_knew=$s_knew -rotary_dim=$rdim -rotary_interleaved=$ri -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -iperm=1 -operm=1 -kname=1 $COMMON_ARGS
+
+    done ; done ; done ; done ; done 
+    done ; done ; done
+}
+
+set -x
+
+run_fp16_bf16_tests
+# run_fp8_tests
+
+# if [ $TEST_APPENDKV -eq 1 ] ; then
+#     run_fp16_appendkv_tests
+# fi
+
+set +x
--- a/example/ck_tile/18_flexattn/utils.hpp
+++ b/example/ck_tile/18_flexattn/utils.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdlib>
+#include <functional>
+#include <optional>
+#include <ostream>
+#include <sstream>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include "ck_tile/core/container/span.hpp"
+
+enum class mode_enum
+{
+    batch = 0,
+    group
+};
+
+std::ostream& operator<<(std::ostream& stream, mode_enum mode)
+{
+    return stream << (mode == mode_enum::batch ? "batch" : "group");
+}
+
+std::vector<int32_t> to_seqstarts(ck_tile::span<const int32_t> seqlens)
+{
+    std::vector<int32_t> seqstarts = {0};
+    for(int32_t seqlen : seqlens)
+    {
+        seqstarts.push_back(seqstarts.back() + seqlen);
+    }
+    assert(seqstarts.size() == seqlens.size() + 1);
+    return seqstarts;
+}
+
+std::vector<int32_t> generate_seqlens(mode_enum mode,
+                                      unsigned count,
+                                      int32_t seqlen_avg,
+                                      int32_t seqlen_min = -1, // if not negative, clamp min
+                                      int32_t seqlen_max = -1, // if not negative, clamp max
+                                      std::optional<unsigned> seed = std::nullopt)
+{
+    assert(0 < count);
+
+    seqlen_min = (0 < seqlen_min ? seqlen_min : 1);
+    seqlen_max = (0 < seqlen_max ? seqlen_max : std::numeric_limits<int32_t>::max());
+    assert(seqlen_min <= seqlen_max);
+
+    std::vector<int32_t> seqlens(count, std::clamp(seqlen_avg, seqlen_min, seqlen_max));
+
+    if(mode == mode_enum::group && 1 < count)
+    {
+        using size_type = std::vector<int32_t>::size_type;
+
+        std::mt19937 random_engine(seed.has_value() ? *seed : std::random_device{}());
+        std::uniform_int_distribution<size_type> idx_dist(0, count - 1);
+        auto next_idx = std::bind(idx_dist, std::ref(random_engine));
+
+        std::uniform_int_distribution<size_type> step_dist(1, count - 1);
+        auto next_step = std::bind(step_dist, std::ref(random_engine));
+
+        for(unsigned repeat = seqlen_avg * (count / 2); 0 < repeat; --repeat)
+        {
+            const size_type to_decrease = next_idx();
+            // make sure each elements of seqlens is in range [seqlen_min, seqlen_max]
+            if(seqlens[to_decrease] == seqlen_min)
+            {
+                continue;
+            }
+
+            const size_type to_increase = (to_decrease + next_step()) % count;
+
+            if(seqlens[to_increase] >= seqlen_max)
+            {
+                continue;
+            }
+
+            --seqlens[to_decrease];
+            ++seqlens[to_increase];
+        }
+    }
+
+    return seqlens;
+}
+
+std::vector<int32_t> generate_seqstarts(mode_enum mode,
+                                        unsigned count,
+                                        int32_t seqlen_avg,
+                                        int32_t seqlen_min           = -1,
+                                        int32_t seqlen_max           = -1,
+                                        std::optional<unsigned> seed = std::nullopt)
+{
+    return to_seqstarts(generate_seqlens(mode, count, seqlen_avg, seqlen_min, seqlen_max, seed));
+}
+
+// return random integer generated uniformly in range [low, high]
+template <typename Int = int>
+auto randint(Int low, Int high, std::optional<unsigned> seed = std::nullopt)
+    -> std::enable_if_t<std::is_integral_v<Int>, Int>
+{
+    std::mt19937 engine(seed.has_value() ? *seed : std::random_device{}());
+    std::uniform_int_distribution<Int> dist(low, high);
+    return dist(engine);
+}
+
+// return random integers generated uniformly in range [low, high]
+template <typename Int, typename ForwardIterator>
+auto randints(ForwardIterator first,
+              ForwardIterator last,
+              Int low,
+              Int high,
+              std::optional<unsigned> seed = std::nullopt)
+    -> std::enable_if_t<std::is_integral_v<Int>>
+{
+    std::mt19937 engine(seed.has_value() ? *seed : std::random_device{}());
+    std::uniform_int_distribution<Int> dist(low, high);
+
+    std::generate(first, last, [&] { return dist(engine); });
+}
+
+/*
+ * decode the seqlen string from cmdline
+ * example (assume batch=3)
+ *   q_val=1,2,3 k_val=4,5,6 -> OK
+ *   q_val=1,2,3             -> OK, k same as q
+ *   q_val=1,2               -> OK, q will rand remaining 1 element, k same as q
+ *   q_val=1,2   k_val=4,5   -> OK, q/k will rand remaining 1 element
+ *   q_val=1,2,3,4           -> OK, but ignore exceed one
+ *
+ *   q_val=1,2   k_val=4,5,6 -> not OK, k must have same splits with q
+ *   q_val=1,2   k_val=4     -> not OK, k must have same splits with q
+ */
+std::tuple<std::vector<ck_tile::index_t>,
+           std::vector<ck_tile::index_t>,
+           std::vector<ck_tile::index_t>>
+decode_seqlen(mode_enum mode,
+              ck_tile::index_t batch,
+              std::string q_val,
+              std::string k_val,
+              std::string k_pad_val,
+              ck_tile::index_t seqlen_k_min = 0,
+              bool need_append_kvcache      = false,
+              std::optional<unsigned> seed  = std::nullopt)
+{
+#define _S2I_(str_) static_cast<ck_tile::index_t>(std::atoi((str_).c_str()))
+    if(mode == mode_enum::batch)
+    {
+        ck_tile::index_t q = _S2I_(q_val);
+        ck_tile::index_t k = _S2I_(k_val);
+
+        auto s_q = std::vector<ck_tile::index_t>(batch, q);
+        auto s_k = [&] {
+            const ck_tile::index_t seqlen_k_max = (k < 0 ? q : k);
+            std::vector<ck_tile::index_t> seqlen_ks(batch, seqlen_k_max);
+
+            if(1 < batch && need_append_kvcache)
+            {
+                // to keep the original s_k value, we always use seqlen_k_max in first batch
+                randints(std::next(seqlen_ks.begin()),
+                         seqlen_ks.end(),
+                         seqlen_k_min,
+                         seqlen_k_max,
+                         seed);
+                return seqlen_ks;
+            }
+
+            return seqlen_ks;
+        }();
+        auto s_kpad = std::vector<ck_tile::index_t>(batch, -1); // TODO: batch not support k_padding
+
+        // s_k should be greater than or equal to seqlen_k_min if provided
+        if(s_k.back() < seqlen_k_min)
+        {
+            std::ostringstream msg;
+            msg << __FILE__ << ":" << __LINE__ << ": seqlen_k (=" << s_k.back()
+                << ") is less than minimum seqlen_k (=" << seqlen_k_min << ")";
+            throw std::runtime_error(msg.str());
+        }
+
+        return std::make_tuple(s_q, s_k, s_kpad);
+    }
+    else
+    {
+        ck_tile::index_t idx          = 0;
+        std::string::size_type pos_q  = 0;
+        std::string::size_type pos_k  = 0;
+        std::string::size_type pos_kp = 0;
+        std::vector<ck_tile::index_t> s_q;
+        std::vector<ck_tile::index_t> s_k;
+        std::vector<ck_tile::index_t> s_kpad;
+        while(true)
+        {
+            auto found_q  = q_val.find(',', pos_q);
+            auto found_k  = k_val.find(',', pos_k);
+            auto found_kp = k_pad_val.find(',', pos_kp);
+
+            ck_tile::index_t q = _S2I_(
+                q_val.substr(pos_q, found_q == std::string::npos ? found_q : found_q - pos_q));
+            ck_tile::index_t k = _S2I_(
+                k_val.substr(pos_k, found_k == std::string::npos ? found_k : found_k - pos_k));
+            ck_tile::index_t kp = _S2I_(k_pad_val.substr(
+                pos_kp, found_kp == std::string::npos ? found_kp : found_kp - pos_kp));
+
+            s_q.push_back(q);
+            s_k.push_back(k < 0 ? q : k);
+            s_kpad.push_back(kp);
+
+            // s_k should be greater than or equal to seqlen_k_min
+            if(s_k.back() < seqlen_k_min)
+            {
+                std::ostringstream msg;
+                msg << __FILE__ << ":" << __LINE__ << ": seqlen_k (=" << s_k.back()
+                    << ") is less than minimum seqlen_k (=" << seqlen_k_min << ")";
+                throw std::runtime_error(msg.str());
+            }
+
+            idx++;
+            if(found_q == std::string::npos || idx >= batch)
+            {
+                break;
+            }
+            pos_q  = found_q + 1;
+            pos_k  = found_k == std::string::npos ? pos_k : found_k + 1;
+            pos_kp = found_kp == std::string::npos ? pos_kp : found_kp + 1;
+        }
+        if(idx < batch)
+        {
+            auto rem_q = generate_seqlens(mode, batch - idx, s_q.back(), 1, s_kpad.back(), seed);
+            auto rem_k =
+                generate_seqlens(mode, batch - idx, s_k.back(), seqlen_k_min, s_kpad.back(), seed);
+
+            s_q.insert(s_q.end(), rem_q.begin(), rem_q.end());
+            s_k.insert(s_k.end(), rem_k.begin(), rem_k.end());
+            s_kpad.insert(s_kpad.end(), batch - idx, s_kpad.back());
+        }
+        return std::make_tuple(s_q, s_k, s_kpad);
+    }
+#undef _S2I_
+}
+
+int env_get_int(const char* var_name, int default_int)
+{
+    char* v = getenv(var_name);
+    int r   = default_int;
+    if(v)
+        r = std::atoi(v);
+    return r;
+}
+
+template <typename RandomAccessIterator, typename Int>
+std::enable_if_t<std::is_integral_v<Int>> iota_shuffle(RandomAccessIterator first,
+                                                       RandomAccessIterator last,
+                                                       Int value,
+                                                       std::optional<unsigned> seed = std::nullopt)
+{
+    std::iota(first, last, value);
+
+    std::mt19937 engine(seed.has_value() ? *seed : std::random_device{}());
+    std::shuffle(first, last, engine);
+}
--- a/example/ck_tile/CMakeLists.txt
+++ b/example/ck_tile/CMakeLists.txt
@@ -17,4 +17,5 @@ add_subdirectory(14_moe_smoothquant)
 add_subdirectory(15_fused_moe)
 add_subdirectory(16_batched_gemm)
 add_subdirectory(17_grouped_gemm)
+add_subdirectory(18_flexattn)
 add_subdirectory(35_batched_transpose)
--- a/include/ck_tile/ops/flex_fmha.hpp
+++ b/include/ck_tile/ops/flex_fmha.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/ops/fmha/block/block_attention_bias_enum.hpp"
+#include "ck_tile/ops/fmha/block/block_dropout.hpp"
+#include "ck_tile/ops/fmha/block/block_masking.hpp"
+#include "ck_tile/ops/fmha/block/block_position_encoding.hpp"
+#include "ck_tile/ops/fmha/block/block_rotary_embedding.hpp"
+#include "ck_tile/ops/fmha/block/page_block_navigator.hpp"
+#include "ck_tile/ops/fmha/kernel/fmha_flex_fwd_kernel.hpp"
+#include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_enum.hpp"
+#include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp"
+#include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_flex_qr_ks_vs.hpp"
+#include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_flex_qr_ks_vs_async.hpp"
+#include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async_default_policy.hpp"
+#include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_default_policy.hpp"
+#include "ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp"
+#include "ck_tile/ops/fmha/pipeline/tile_fmha_traits.hpp"
+#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
+#include "ck_tile/ops/common/tensor_layout.hpp"
--- a/include/ck_tile/ops/fmha/kernel/fmha_flex_fwd_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_flex_fwd_kernel.hpp