Commit 75535dd8 authored by Aleksander Dudek

Merge branch 'develop' into ck_tile_gemmkernel_reuse

parents b4f65acf 77a38e02
-FROM ubuntu:20.04
+FROM ubuntu:22.04
 ARG DEBIAN_FRONTEND=noninteractive
 ARG ROCMVERSION=6.3
 ARG compiler_version=""
@@ -48,6 +48,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
     libnuma-dev \
     libpthread-stubs0-dev \
     llvm-amdgpu \
+    mpich \
     net-tools \
     pkg-config \
     python \
@@ -70,7 +71,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
     rm -rf /var/lib/apt/lists/* && \
     rm -rf amdgpu-install* && \
     # Remove unnecessary rocm components that take a lot of space
-    apt-get remove -y rocblas rocfft rocsparse composablekernel-dev
+    apt-get remove -y rocblas rocfft rocsparse composablekernel-dev hipblaslt
 # Update the cmake to version 3.27.5
 RUN pip install --upgrade cmake==3.27.5 && \
...
-ARG BASE_DOCKER="rocm/composable_kernel:ck_ub20.04_rocm6.3"
+ARG BASE_DOCKER="rocm/composable_kernel:ck_ub22.04_rocm6.3"
 FROM $BASE_DOCKER
 ARG compiler_version=""
 ARG compiler_commit=""
...
@@ -40,10 +40,10 @@ def getBaseDockerImageName(){
     else{
         def ROCM_numeric = "${params.ROCMVERSION}" as float
         if ( ROCM_numeric < 6.4 ){
-            img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}"
+            img = "${env.CK_DOCKERHUB}:ck_ub22.04_rocm${params.ROCMVERSION}"
         }
         else{
-            img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}"
+            img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub22.04_rocm${params.ROCMVERSION}"
         }
     }
     return img
@@ -357,7 +357,7 @@ def buildHipClangJob(Map conf=[:]){
     def prefixpath = conf.get("prefixpath", "/opt/rocm")
     // Jenkins is complaining about the render group
-    def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
+    def dockerOpts="-u root --device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
     if (conf.get("enforce_xnack_on", false)) {
         dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
     }
@@ -377,7 +377,7 @@ def buildHipClangJob(Map conf=[:]){
     gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
         withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
-            timeout(time: 48, unit: 'HOURS')
+            timeout(time: 20, unit: 'HOURS')
             {
                 cmake_build(conf)
             }
@@ -426,7 +426,7 @@ def Build_CK(Map conf=[:]){
     def prefixpath = conf.get("prefixpath", "/opt/rocm")
     // Jenkins is complaining about the render group
-    def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
+    def dockerOpts="-u root --device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
     if (conf.get("enforce_xnack_on", false)) {
         dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
     }
@@ -449,7 +449,7 @@ def Build_CK(Map conf=[:]){
     try {
         (retimage, image) = getDockerImage(conf)
         withDockerContainer(image: image, args: dockerOpts) {
-            timeout(time: 5, unit: 'MINUTES'){
+            timeout(time: 2, unit: 'MINUTES'){
                 sh 'rocminfo | tee rocminfo.log'
                 if ( !runShell('grep -n "gfx" rocminfo.log') ){
                     throw new Exception ("GPU not found")
@@ -465,7 +465,7 @@ def Build_CK(Map conf=[:]){
         throw e
     }
     withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
-        timeout(time: 12, unit: 'HOURS')
+        timeout(time: 20, unit: 'HOURS')
         {
             //check whether to run performance tests on this node
             def arch_type = 0
@@ -620,7 +620,7 @@ def process_results(Map conf=[:]){
     }
     withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
-        timeout(time: 1, unit: 'HOURS'){
+        timeout(time: 15, unit: 'MINUTES'){
             try{
                 dir("script"){
                     if (params.RUN_CK_TILE_FMHA_TESTS){
@@ -675,8 +675,8 @@ def process_results(Map conf=[:]){
 //launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
 CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.3;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true
 0 21 * * * % ROCMVERSION=6.3;hipTensor_test=true;RUN_CODEGEN_TESTS=true
-0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
-0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
+0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
+0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
 0 15 * * * % BUILD_INSTANCES_ONLY=true;RUN_PERFORMANCE_TESTS=false;USE_SCCACHE=false
 0 13 * * * % BUILD_LEGACY_OS=true''' : ""
@@ -763,8 +763,8 @@ pipeline {
         description: "Test building instances for various architectures simultaneously (default: OFF)")
     booleanParam(
         name: "BUILD_GFX12",
-        defaultValue: false,
-        description: "Build CK and run tests on gfx12 (default: OFF)")
+        defaultValue: true,
+        description: "Build CK and run tests on gfx12 (default: ON)")
     booleanParam(
         name: "NINJA_BUILD_TRACE",
         defaultValue: false,
...
@@ -4,6 +4,7 @@
 #include <hip/hip_runtime_api.h>
 #include <memory>
 #include <string>
+#include <stdexcept>
 namespace rtc {
...
@@ -2,10 +2,17 @@
 # Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 # generate kernel instances to speed up compilation
-DTYPE_MAP = {
-    "fp16": "ck_tile::fp16_t",
-    "bf16": "ck_tile::bf16_t",
-    "fp8" : "ck_tile::fp8_t"
+FWD_DTYPE_MAP = {
+    "fp16"   : "FmhaFwdFp16",
+    "bf16"   : "FmhaFwdBf16",
+    "fp8"    : "FmhaFwdFp8",
+    "fp8fp16": "FmhaFwdFp8Fp16",
+    "fp8bf16": "FmhaFwdFp8Bf16"
+}
+BWD_DTYPE_MAP = {
+    "fp16": "FmhaBwdFp16",
+    "bf16": "FmhaBwdBf16"
 }
 MASK_IMPL = {
...
@@ -283,7 +283,7 @@ class FmhaBwdApiPool:
                 inners = inners + FMHA_BWD_API_INNER_DISPATCH.format(F_if=if_k, F_mode=MODE_MAP[trait.mode], F_pipeline_enum=BWD_DQDKDV_PIPELINE_ENUM_MAP[trait.pipeline],
                     F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask], F_mask=get_mask_map(self.mask_impl)[trait.mask], F_bias_check=BIAS_CHECK_MAP[trait.bias],
                     F_bias=BIAS_MAP[trait.bias], F_dbias=BOOL_MAP[trait.dbias], F_dropout_check=DROPOUT_CHECK_MAP[trait.dropout], F_dropout=DROPOUT_MAP[trait.dropout],
-                    F_scheck=trait.scheck(spad1=spad1), F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck, F_hdim=hdim, F_dtype=DTYPE_MAP[dtype],
+                    F_scheck=trait.scheck(spad1=spad1), F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck, F_hdim=hdim, F_dtype=BWD_DTYPE_MAP[dtype],
                     F_spad0=BOOL_MAP[trait.spad], F_spad1=BOOL_MAP[spad1], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad],
                     F_deterministic=BOOL_MAP[trait.deterministic])
@@ -360,7 +360,7 @@ class FmhaBwdDQDKDVKernel:
         FMHA_BWD_DQ_DK_DV_KERNEL_BODY.format(
             F_idx = self.F_idx,
             F_hdim = self.F_hdim,
-            F_dtype = DTYPE_MAP[self.F_dtype],
+            F_dtype = BWD_DTYPE_MAP[self.F_dtype],
             F_bm0 = self.F_tile.F_bm0,
             F_bn0 = self.F_tile.F_bn0,
             F_bk0 = self.F_tile.F_bk0,
@@ -469,7 +469,7 @@ def get_bwd_dq_dk_dv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
     gen = list()
     api_pool = FmhaBwdApiPool(mask_impl)
-    for dtype in DTYPE_MAP.keys():
+    for dtype in BWD_DTYPE_MAP.keys():
         d = get_fmha_bwd_dq_dk_dv_tile_ppl_dict_from_dtype(dtype)
         if d == None:
             continue
@@ -585,7 +585,7 @@ class FmhaBwdOGradDotOKernel:
         FMHA_BWD_DOT_DO_O_KERNEL_BODY.format(
             F_idx = self.F_idx,
             F_hdim = self.F_hdim,
-            F_dtype = DTYPE_MAP[self.F_dtype],
+            F_dtype = BWD_DTYPE_MAP[self.F_dtype],
             F_spad = BOOL_MAP[self.F_spad],
             F_dvpad = BOOL_MAP[self.F_dvpad],
             F_mode = MODE_MAP[self.F_mode],
@@ -616,7 +616,7 @@ def get_bwd_dot_do_o_blobs() -> List[FmhaBwdOGradDotOKernel]:
     gen = list()
-    for dtype in DTYPE_MAP.keys():
+    for dtype in BWD_DTYPE_MAP.keys():
         d = get_fmha_bwd_dq_dk_dv_tile_ppl_dict_from_dtype(dtype)
         if d == None:
             continue
@@ -716,7 +716,7 @@ class FmhaBwdConvertQGradKernel:
         FMHA_BWD_CONVERT_DQ_KERNEL_BODY.format(
             F_idx = self.F_idx,
             F_hdim = self.F_hdim,
-            F_dtype = DTYPE_MAP[self.F_dtype],
+            F_dtype = BWD_DTYPE_MAP[self.F_dtype],
             F_bm0 = self.F_bm0,
             F_bn0 = self.F_bn0,
             F_spad = BOOL_MAP[self.F_spad],
@@ -751,7 +751,7 @@ def get_bwd_convert_dq_blobs() -> List[FmhaBwdConvertQGradKernel]:
     gen = list()
-    for dtype in DTYPE_MAP.keys():
+    for dtype in BWD_DTYPE_MAP.keys():
         d = get_fmha_bwd_dq_dk_dv_tile_ppl_dict_from_dtype(dtype)
         if d == None:
             continue
...
@@ -282,7 +282,7 @@ class FmhaFwdApiPool:
                     F_squant=BOOL_MAP[trait.squant], F_scheck=trait.scheck, F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck,
                     F_spad=BOOL_MAP[trait.spad], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad],
                     F_bm0=trait.bm0, F_bn0=trait.bn0, F_bk0=trait.bk0, F_bn1=trait.bn1, F_bk1=trait.bk1, F_bk0max=trait.bk0max,
-                    F_hdim=hdim, F_dtype=DTYPE_MAP[dtype])
+                    F_hdim=hdim, F_dtype=FWD_DTYPE_MAP[dtype])
                 if_j = 'if' if j == 0 else 'else if'
                 per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_inner_dispatch=inners)
             if_i = 'if' if i == 0 else 'else if'
@@ -339,7 +339,7 @@ class FmhaFwdKernel:
         FMHA_FWD_KERNEL_BODY.format(
             F_idx = self.F_idx,
             F_hdim = self.F_hdim,
-            F_dtype = DTYPE_MAP[self.F_dtype],
+            F_dtype = FWD_DTYPE_MAP[self.F_dtype],
             F_bm0 = self.F_tile.F_bm0,
             F_bn0 = self.F_tile.F_bn0,
             F_bk0 = self.F_tile.F_bk0,
@@ -462,6 +462,9 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> Tuple[Fm
         # no need lse/dropout kernels
         for mask, bias in itertools.product(get_mask_map(mask_impl).keys(), BIAS_MAP.keys()):
             pipelines.append(FmhaFwdPipeline('qr', 'col', 'f', 'f', 'f', 'f', bias, 'f', 'f', squant, mask))
+    elif dtype in ['fp8fp16', 'fp8bf16']:
+        # TODO
+        None
     else:
         assert False
     return pipelines
@@ -469,7 +472,7 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> Tuple[Fm
     gen = list()
     api_pool = FmhaFwdApiPool(mask_impl)
-    for dtype in DTYPE_MAP.keys():
+    for dtype in FWD_DTYPE_MAP.keys():
         d = get_fmha_fwd_tile_dict_from_dtype(dtype)
         if d == None:
             continue
...
@@ -181,7 +181,7 @@ class FmhaFwdAppendKVApiPool:
                 inners = inners + FMHA_FWD_APPENDKV_API_INNER_DISPATCH.format(F_if=if_k, F_vlayout=LAYOUT_MAP[trait.vlayout],
                     F_scheck=trait.scheck, F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck, F_rope_check=ROPE_CHECK_MAP[trait.rope],
                     F_pagedkv=BOOL_MAP[trait.pagedkv], F_spad=BOOL_MAP[trait.spad], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad],
-                    F_rope=ROPE_MAP[trait.rope], F_bs=trait.bs, F_bsk=trait.bsk, F_bd=trait.bd, F_bdv=trait.bdv, F_hdim=hdim, F_dtype=DTYPE_MAP[dtype])
+                    F_rope=ROPE_MAP[trait.rope], F_bs=trait.bs, F_bsk=trait.bsk, F_bd=trait.bd, F_bdv=trait.bdv, F_hdim=hdim, F_dtype=FWD_DTYPE_MAP[dtype])
                 if_j = 'if' if j == 0 else 'else if'
                 per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_inner_dispatch=inners)
             if_i = 'if' if i == 0 else 'else if'
@@ -216,7 +216,7 @@ class FmhaFwdAppendKVKernel:
         FMHA_FWD_APPENDKV_KERNEL_BODY.format(
             F_idx = self.F_idx,
             F_hdim = self.F_hdim,
-            F_dtype = DTYPE_MAP[self.F_dtype],
+            F_dtype = FWD_DTYPE_MAP[self.F_dtype],
             F_bs = self.F_tile.F_bs,
             F_bsk = self.F_tile.F_bsk,
             F_bd = self.F_tile.F_bd,
@@ -301,6 +301,9 @@ def get_fwd_appendkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
     elif dtype in ['fp8', 'bf8']:
         # rope/paged-kv is not supported
         pipelines.append(FmhaFwdAppendKVPipeline('col', 't', 't', 't', 't', 'no', 'f'))
+    elif dtype in ['fp8fp16', 'fp8bf16']:
+        # TODO
+        None
     else:
         assert False
     return pipelines
@@ -308,7 +311,7 @@ def get_fwd_appendkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
     gen = list()
     api_pool = FmhaFwdAppendKVApiPool(mask_impl)
-    for dtype in DTYPE_MAP.keys():
+    for dtype in FWD_DTYPE_MAP.keys():
         d = get_fmha_fwd_appendkv_tile_dict_from_dtype(dtype)
         if d == None:
             continue
...
@@ -435,7 +435,7 @@ class FmhaFwdSplitKVApiPool:
                     F_scheck=trait.scheck, F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck,
                     F_spad=BOOL_MAP[trait.spad], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad],
                     F_bm0=trait.bm0, F_bn0=trait.bn0, F_bk0=trait.bk0, F_bn1=trait.bn1, F_bk1=trait.bk1, F_bk0max=trait.bk0max,
-                    F_hdim=hdim, F_dtype=DTYPE_MAP[dtype])
+                    F_hdim=hdim, F_dtype=FWD_DTYPE_MAP[dtype])
                 if_j = 'if' if j == 0 else 'else if'
                 per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_inner_dispatch=inners)
             if_i = 'if' if i == 0 else 'else if'
@@ -472,7 +472,7 @@ class FmhaFwdSplitKVKernel:
         FMHA_FWD_SPLITKV_KERNEL_BODY.format(
             F_idx = self.F_idx,
             F_hdim = self.F_hdim,
-            F_dtype = DTYPE_MAP[self.F_dtype],
+            F_dtype = FWD_DTYPE_MAP[self.F_dtype],
             F_bm0 = self.F_tile.F_bm0,
             F_bn0 = self.F_tile.F_bn0,
             F_bk0 = self.F_tile.F_bk0,
@@ -552,7 +552,7 @@ class FmhaFwdSplitKVCombineKernel:
         FMHA_FWD_SPLITKV_COMBINE_KERNEL_BODY.format(
             F_idx = self.F_idx,
             F_hdim = self.F_hdim,
-            F_dtype = DTYPE_MAP[self.F_dtype],
+            F_dtype = FWD_DTYPE_MAP[self.F_dtype],
             F_bm0 = self.F_tile.F_bm0,
             F_bn1 = self.F_tile.F_bn1,
             F_spad = BOOL_MAP[self.F_pipeline.F_spad],
@@ -644,6 +644,9 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
     elif dtype in ['fp8', 'bf8']:
         for mask, bias in itertools.product(get_mask_map(mask_impl).keys(), BIAS_MAP.keys()):
             pipelines.append(Pipeline('qr', 'col', 'f', 'f', 'f', 'f', bias, 't', squant, 'f', mask))
+    elif dtype in ['fp8fp16', 'fp8bf16']:
+        # TODO
+        None
     else:
         assert False
     return pipelines
@@ -651,7 +654,7 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
     gen = list()
     api_pool = FmhaFwdSplitKVApiPool(mask_impl)
-    for dtype in DTYPE_MAP.keys():
+    for dtype in FWD_DTYPE_MAP.keys():
         d = get_fmha_fwd_tile_dict_from_dtype(dtype)
         if d == None:
             continue
@@ -711,7 +714,7 @@ def get_fwd_splitkv_combine_blobs(kernel_filter : Optional[str], receipt) -> Lis
     gen = list()
-    for dtype in DTYPE_MAP.keys():
+    for dtype in FWD_DTYPE_MAP.keys():
         d = get_fmha_fwd_splitkv_combine_tile_dict_from_dtype(dtype)
         if d == None:
             continue
...
@@ -101,7 +101,7 @@ auto create_args(int argc, char* argv[])
 }
 // different threshold for different dtype
-template <typename DataType>
+template <typename DataTypeConfig>
 auto get_elimit(ck_tile::index_t /*hdim_q*/, ck_tile::index_t /*hdim_v*/)
 {
     double rtol = 1e-2;
@@ -110,7 +110,7 @@ auto get_elimit(ck_tile::index_t /*hdim_q*/, ck_tile::index_t /*hdim_v*/)
 }
 template <>
-auto get_elimit<ck_tile::bf16_t>(ck_tile::index_t hdim_q, ck_tile::index_t hdim_v)
+auto get_elimit<FmhaBwdBf16>(ck_tile::index_t hdim_q, ck_tile::index_t hdim_v)
 {
     double rtol = 1e-2;
     double atol = 1e-2;
@@ -122,7 +122,7 @@ auto get_elimit<ck_tile::bf16_t>(ck_tile::index_t hdim_q, ck_tile::index_t hdim_
     return ck_tile::make_tuple(rtol, atol);
 }
-template <typename DataType>
+template <typename DataTypeConfig>
 bool run(const ck_tile::ArgParser& arg_parser)
 {
     std::string data_type = arg_parser.get_str("prec");
@@ -209,7 +209,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
     const auto seqstart_q_host = generate_seqstarts(mode, batch, seqlen_q);
     const auto seqstart_k_host = generate_seqstarts(mode, batch, seqlen_k);
-    using TypeConfig = FmhaBwdTypeConfig<DataType>;
+    using TypeConfig = FmhaBwdTypeConfig<DataTypeConfig>;
     using QDataType = typename TypeConfig::QDataType;
     using KDataType = typename TypeConfig::KDataType;
@@ -933,7 +933,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
     }
     // clang-format on
-    auto [rtol, atol] = get_elimit<DataType>(hdim_q, hdim_v);
+    auto [rtol, atol] = get_elimit<DataTypeConfig>(hdim_q, hdim_v);
     bool dq_cur_pass = ck_tile::check_err(dq_host_result,
                                           dq_host_ref,
                                           std::string("Error: QGrad Incorrect results!"),
@@ -986,11 +986,11 @@ int main(int argc, char* argv[])
     const std::string data_type = arg_parser.get_str("prec");
     if(data_type == "fp16")
     {
-        return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
+        return run<FmhaBwdFp16>(arg_parser) ? 0 : -2;
     }
     else if(data_type == "bf16")
     {
-        return run<ck_tile::bf16_t>(arg_parser) ? 0 : -2;
+        return run<FmhaBwdBf16>(arg_parser) ? 0 : -2;
     }
     return -3;
...
@@ -14,11 +14,19 @@
 #include <utility>
 #include <variant>
+struct FmhaBwdFp16
+{
+};
+
+struct FmhaBwdBf16
+{
+};
+
 template <typename DataType>
 struct FmhaBwdTypeConfig;
 template <>
-struct FmhaBwdTypeConfig<ck_tile::half_t>
+struct FmhaBwdTypeConfig<FmhaBwdFp16>
 {
     using QDataType = ck_tile::half_t;
     using KDataType = ck_tile::half_t;
@@ -38,7 +46,7 @@ struct FmhaBwdTypeConfig<ck_tile::half_t>
 };
 template <>
-struct FmhaBwdTypeConfig<ck_tile::bf16_t>
+struct FmhaBwdTypeConfig<FmhaBwdBf16>
 {
     using QDataType = ck_tile::bf16_t;
     using KDataType = ck_tile::bf16_t;
...
@@ -3,6 +3,7 @@
 #include "fmha_fwd.hpp"
 #include "ck_tile/host.hpp"
+#include "ck_tile/ref/naive_attention.hpp"
 #include "mask.hpp"
 #include "rotary.hpp"
 #include "utils.hpp"
@@ -41,7 +42,7 @@ std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
 auto create_args(int argc, char* argv[])
 {
     ck_tile::ArgParser arg_parser;
-    arg_parser.insert("v", "1", "weather do CPU validation or not")
+    arg_parser.insert("v", "1", "0:no validation, 1:cpu validation, 2:gpu validation(experimental)")
         .insert("mode", "0", "kernel mode. 0:batch, 1:group")
         .insert("b", "2", "batch size")
         .insert("h", "8", "num of head, for q")
@@ -142,7 +143,7 @@ auto create_args(int argc, char* argv[])
 }
 // different threshold for different dtype
-template <typename DataType>
+template <typename DataTypeConfig>
 auto get_elimit(std::string /*init_method*/)
 {
     double rtol = 1e-3;
@@ -151,7 +152,7 @@ auto get_elimit(std::string /*init_method*/)
 }
 template <>
-auto get_elimit<ck_tile::bf16_t>(std::string /*init_method*/)
+auto get_elimit<FmhaFwdBf16>(std::string /*init_method*/)
 {
     double rtol = 1e-2;
     double atol = 1e-2;
@@ -159,7 +160,7 @@ auto get_elimit<ck_tile::bf16_t>(std::string /*init_method*/)
 }
 template <>
-auto get_elimit<ck_tile::fp8_t>(std::string init_method)
+auto get_elimit<FmhaFwdFp8>(std::string init_method)
 {
     if(init_method == "ui" || init_method == "ni")
     {
@@ -261,7 +262,7 @@ int override_num_splits_if_necessary(
     return num_splits;
 }
-template <typename DataType>
+template <typename DataTypeConfig>
 bool run(const ck_tile::ArgParser& arg_parser)
 {
     std::string data_type = arg_parser.get_str("prec");
@@ -305,8 +306,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
     }
     ck_tile::index_t rotary_dim = arg_parser.get_int("rotary_dim");
-    if constexpr(!(std::is_same_v<DataType, ck_tile::fp16_t> ||
-                   std::is_same_v<DataType, ck_tile::bf16_t>))
+    if constexpr(!(std::is_same_v<DataTypeConfig, FmhaFwdFp16> ||
+                   std::is_same_v<DataTypeConfig, FmhaFwdBf16>))
     {
         if(0 < rotary_dim)
         {
@@ -428,25 +429,6 @@ bool run(const ck_tile::ArgParser& arg_parser)
         return atoi(squant_str.c_str()) != 0 ? true : false;
     }();
-    float range_q = arg_parser.get_float("range_q");
-    float range_k = arg_parser.get_float("range_k");
-    float range_v = arg_parser.get_float("range_v");
-    float range_p = arg_parser.get_float("range_p");
-    float range_o = arg_parser.get_float("range_o");
-    float dtype_max = ck_tile::type_convert<float>(ck_tile::numeric<DataType>::max());
-    float scale_p = 1.f;
-    float scale_o = 1.f;
-    if(squant)
-    {
-        scale_s = scale_s * (range_q / dtype_max) * (range_k / dtype_max);
-        scale_p = dtype_max / range_p;
-        // scale_p = [max(fp8_t)/range_o] * [range_p/max(fp8_t)] * [range_v/max(fp8_t)]
-        scale_o = range_p * range_v / range_o / dtype_max;
-    }
     std::string vlayout = arg_parser.get_str("vlayout");
     bool lse = arg_parser.get_bool("lse");
@@ -466,7 +448,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
     }
     bool s_randval = false;
-    if(p_drop > 0.0f && do_validation)
+    if(p_drop > 0.0f && do_validation != 0)
     {
         s_randval = true;
     }
@@ -499,7 +481,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
     const auto seqstart_k_host = to_seqstarts(seqlen_ks);
     const auto seqstart_k_with_padding_host = to_seqstarts(seqlen_kpads);
-    using TypeConfig = FmhaFwdTypeConfig<DataType>;
+    using TypeConfig = FmhaFwdTypeConfig<DataTypeConfig>;
     using QDataType = typename TypeConfig::QDataType;
     using KDataType = typename TypeConfig::KDataType;
@@ -513,6 +495,28 @@ bool run(const ck_tile::ArgParser& arg_parser)
     using OaccDataType = typename TypeConfig::OaccDataType;
     using ODataType = typename TypeConfig::ODataType;
+    float range_q = arg_parser.get_float("range_q");
+    float range_k = arg_parser.get_float("range_k");
+    float range_v = arg_parser.get_float("range_v");
+    float range_p = arg_parser.get_float("range_p");
+    float range_o = arg_parser.get_float("range_o");
+    float q_dtype_max = ck_tile::type_convert<float>(ck_tile::numeric<QDataType>::max());
+    float k_dtype_max = ck_tile::type_convert<float>(ck_tile::numeric<KDataType>::max());
+    float v_dtype_max = ck_tile::type_convert<float>(ck_tile::numeric<VDataType>::max());
+    float p_dtype_max = v_dtype_max; // assume p and v is the same type
+    float o_dtype_max = ck_tile::type_convert<float>(ck_tile::numeric<ODataType>::max());
+    float scale_p = 1.f;
+    float scale_o = 1.f;
+    if(squant)
+    {
+        scale_s = scale_s * (range_q / q_dtype_max) * (range_k / k_dtype_max);
+        scale_p = p_dtype_max / range_p;
+        scale_o = (o_dtype_max / range_o) * (range_p / p_dtype_max) * (range_v / v_dtype_max);
+    }
     // accumulation numbers for performance evaluation
     std::size_t flop = 0, num_byte = 0;
     auto max_seqlen_q =
@@ -709,14 +713,14 @@ bool run(const ck_tile::ArgParser& arg_parser)
     else if(init_method == "ufq" || init_method == "uf:q" ||
             init_method == "3") // suitable for fp8 quantization
     {
-        ck_tile::FillUniformDistribution<QDataType>{-dtype_max, dtype_max, seed}(q_host);
-        ck_tile::FillUniformDistribution<KDataType>{-dtype_max, dtype_max, seed}(k_host);
-        ck_tile::FillUniformDistribution<KDataType>{-dtype_max, dtype_max, seed}(knew_host);
-        ck_tile::FillUniformDistribution<VDataType>{-dtype_max, dtype_max, seed}(v_host);
-        ck_tile::FillUniformDistribution<VDataType>{-dtype_max, dtype_max, seed}(vnew_host);
+        ck_tile::FillUniformDistribution<QDataType>{-q_dtype_max, q_dtype_max, seed}(q_host);
+        ck_tile::FillUniformDistribution<KDataType>{-k_dtype_max, k_dtype_max, seed}(k_host);
+        ck_tile::FillUniformDistribution<KDataType>{-k_dtype_max, k_dtype_max, seed}(knew_host);
+        ck_tile::FillUniformDistribution<VDataType>{-v_dtype_max, v_dtype_max, seed}(v_host);
+        ck_tile::FillUniformDistribution<VDataType>{-v_dtype_max, v_dtype_max, seed}(vnew_host);
         // bias_fp8 = qscale_bias * bias_fp32
-        float qscale_bias = (dtype_max / range_q) * (dtype_max / range_k);
+        float qscale_bias = (q_dtype_max / range_q) * (k_dtype_max / range_k);
         // Assume bias is in [-1.f, 1.f] in original fp32
         ck_tile::FillUniformDistribution<BiasDataType>{-qscale_bias, qscale_bias, seed}(bias_host);
     }
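A note on the scale derivation in the added block above (my reading of the code, not text from the commit): each quantized tensor X stores x * (x_dtype_max / range_x), so S = Q*K must be dequantized by (range_q / q_dtype_max) * (range_k / k_dtype_max), which is folded into scale_s; P is then requantized to the V dtype via scale_p = p_dtype_max / range_p; and scale_o composes the P and V dequant factors with the O quant factor, (range_p / p_dtype_max) * (range_v / v_dtype_max) * (o_dtype_max / range_o), which is exactly the product computed for scale_o. Tracking a separate max per tensor dtype is what lets fp8-fp16 mixed configs reuse this path.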
@@ -1118,25 +1122,75 @@ bool run(const ck_tile::ArgParser& arg_parser)
               << std::setprecision(2) << tflops << " TFlops, " << std::setprecision(2) << gb_per_sec
               << " GB/s" << std::flush;
-    if(!do_validation)
+    if(do_validation == 0)
     {
         std::cout << std::flush << std::endl;
         return true;
     }
+    if(do_validation == 2)
+    {
+        // NOTE: use gpu to do validation
+        ck_tile::naive_attention_fwd_traits naive_t;
+        naive_t.q_type = data_type;
+        naive_t.k_type = data_type;
+        naive_t.v_type = data_type;
+        naive_t.o_type = data_type;
+        naive_t.q_layout = i_perm == 1 ? "bhsd" : "bshd";
+        naive_t.k_layout = i_perm == 1 ? "bhsd" : "bshd";
+        naive_t.v_layout = i_perm == 1 ? "bhsd" : "bshd";
+        naive_t.o_layout = o_perm == 1 ? "bhsd" : "bshd";
+        naive_t.variation = 0; // TODO?
+        ck_tile::DeviceMem o_naive_buf(o_host.get_element_space_size_in_bytes());
+        ck_tile::naive_attention_fwd_args naive_a;
+        naive_a.q_ptr = q_buf.GetDeviceBuffer();
+        naive_a.k_ptr = k_buf.GetDeviceBuffer();
+        naive_a.v_ptr = v_buf.GetDeviceBuffer();
+        naive_a.o_ptr = o_naive_buf.GetDeviceBuffer();
+        naive_a.scale_s = scale_s;
+        naive_a.context_len_ptr = nullptr; // used when seqlen kv come from a pointer
+        naive_a.page_table_ptr =
+            nullptr; // [batch, num_blocks] seqlen_kv is in different block(paged attn)
+        naive_a.hdim = hdim_q;
+        naive_a.hdim_v = hdim_v; // could be cross-attn, where V and Q/K hdim are different
+        naive_a.batch_q = batch;
+        naive_a.batch_kv = batch;
+        naive_a.batch_ratio_kv = 1; // batch_q / batch_kv
+        naive_a.seqlen_q = seqlen_qs[0];
+        naive_a.seqlen_kv = seqlen_ks[0]; // if context_len_ptr is not nullptr, ignore this field
+        naive_a.nhead_q = nhead;
+        naive_a.nhead_kv = nhead_k;
+        naive_a.nhead_ratio_kv = naive_a.nhead_q / naive_a.nhead_kv; // nhead_q / nhead_kv
+        naive_a.page_size = 0; // if paged, the seqlen-kv for each block
+        ck_tile::stream_config naive_s{};
+        naive_attention_fwd(naive_t, naive_a, naive_s);
+        auto o_naive_ref = o_naive_buf.ToHost<ODataType>();
+        o_buf.FromDevice(o_host.data()); // TODO: ugly
+        auto [rtol_, atol_] = get_elimit<DataTypeConfig>(init_method);
+        bool pass_ = ck_tile::check_err(
+            o_host, o_naive_ref, std::string("OUT Error: Incorrect results!"), rtol_, atol_);
+        std::cout << ", valid:" << (pass_ ? "y" : "n") << std::flush << std::endl;
+        return pass_;
+    }
     o_buf.FromDevice(o_host.data());
     lse_buf.FromDevice(lse_host.data());
     randval_buf.FromDevice(randval_host.data());
     auto p_compute_element_func = [&]() {
-        if constexpr(std::is_same_v<DataType, ck_tile::fp8_t>)
+        if constexpr(std::is_same_v<DataTypeConfig, ck_tile::fp8_t>)
             return ck_tile::scales{scale_p};
         else
             return ck_tile::identity{};
     }();
     auto oacc_element_func = [&]() {
-        if constexpr(std::is_same_v<DataType, ck_tile::fp8_t>)
+        if constexpr(std::is_same_v<DataTypeConfig, ck_tile::fp8_t>)
             return ck_tile::composes(ck_tile::saturates<ck_tile::fp8_t>{},
                                      ck_tile::scales{scale_o});
         else
@@ -1458,7 +1512,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
         else o_host_result.ForEach([&](auto& self, auto idx) { self(idx) = o_host(b_idx, idx[1] + query_offset, idx[0], idx[2]); });
     // clang-format on
-    auto [rtol, atol] = get_elimit<DataType>(init_method);
+    auto [rtol, atol] = get_elimit<DataTypeConfig>(init_method);
     bool cur_pass = ck_tile::check_err(
         o_host_result, o_host_ref, std::string("OUT Error: Incorrect results!"), rtol, atol);
     pass &= cur_pass;
@@ -1515,15 +1569,15 @@ int main(int argc, char* argv[])
     const std::string data_type = arg_parser.get_str("prec");
     if(data_type == "fp16")
     {
-        return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
+        return run<FmhaFwdFp16>(arg_parser) ? 0 : -2;
     }
     else if(data_type == "bf16")
     {
-        return run<ck_tile::bf16_t>(arg_parser) ? 0 : -2;
+        return run<FmhaFwdBf16>(arg_parser) ? 0 : -2;
     }
     else if(data_type == "fp8")
     {
-        return run<ck_tile::fp8_t>(arg_parser) ? 0 : -2;
+        return run<FmhaFwdFp8>(arg_parser) ? 0 : -2;
     }
     return -3;
...
@@ -16,11 +16,35 @@
 #include <utility>
 #include <variant>
+struct FmhaFwdFp16
+{
+};
+
+struct FmhaFwdBf16
+{
+};
+
+struct FmhaFwdFp8
+{
+};
+
+struct FmhaFwdBf8
+{
+};
+
+struct FmhaFwdFp8Fp16
+{
+};
+
+struct FmhaFwdFp8Bf16
+{
+};
+
 template <typename DataType>
 struct FmhaFwdTypeConfig;
 template <>
-struct FmhaFwdTypeConfig<ck_tile::half_t>
+struct FmhaFwdTypeConfig<FmhaFwdFp16>
 {
     using QDataType = ck_tile::half_t;
     using KDataType = ck_tile::half_t;
@@ -36,7 +60,7 @@ struct FmhaFwdTypeConfig<ck_tile::half_t>
 };
 template <>
-struct FmhaFwdTypeConfig<ck_tile::bf16_t>
+struct FmhaFwdTypeConfig<FmhaFwdBf16>
 {
     using QDataType = ck_tile::bf16_t;
     using KDataType = ck_tile::bf16_t;
@@ -52,7 +76,7 @@ struct FmhaFwdTypeConfig<ck_tile::bf16_t>
 };
 template <>
-struct FmhaFwdTypeConfig<ck_tile::fp8_t>
+struct FmhaFwdTypeConfig<FmhaFwdFp8>
 {
     using QDataType = ck_tile::fp8_t;
     using KDataType = ck_tile::fp8_t;
@@ -68,7 +92,7 @@ struct FmhaFwdTypeConfig<ck_tile::fp8_t>
 };
 template <>
-struct FmhaFwdTypeConfig<ck_tile::bf8_t>
+struct FmhaFwdTypeConfig<FmhaFwdBf8>
 {
     using QDataType = ck_tile::bf8_t;
     using KDataType = ck_tile::bf8_t;
...
@@ -611,7 +611,7 @@ inline __device__ int8_t neg<int8_t>(int8_t x)
 template <>
 inline __device__ half_t neg<half_t>(half_t x)
 {
-    return __hneg(x);
+    return __hneg(static_cast<__half>(x));
 };
 template <typename T>
...
@@ -45,5 +45,8 @@ our implementation of different device operators.
 **[ops/epilogue]**
 epilogue part of our kernel. We may extend this epilogue part to let users build their own customized epilogues.
+**[ref]**
+reference implementations on CPU or GPU. This folder is meant to have a specific header included on demand.
 ## examples
 currently we put all ck_tile related example under [/example/ck_tile](/example/ck_tile/) folder. Please check each example's subfolder.
@@ -54,6 +54,7 @@
 #include "ck_tile/core/tensor/tile_window_linear.hpp"
 #include "ck_tile/core/tensor/tile_window_utils.hpp"
 #include "ck_tile/core/tensor/update_tile.hpp"
+#include "ck_tile/core/utility/amd_address_space.hpp"
 #include "ck_tile/core/utility/bit_cast.hpp"
 #include "ck_tile/core/utility/functional.hpp"
 #include "ck_tile/core/utility/functional_with_tuple.hpp"
...
@@ -23,10 +23,10 @@
 #include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_default_policy.hpp"
 #include "ck_tile/ops/gemm/block/block_gemm_problem.hpp"
 #include "ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp"
+#include "ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp"
 #include "ck_tile/ops/gemm/kernel/gemm_kernel.hpp"
 #include "ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp"
 #include "ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp"
-#include "ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp"
...
# reference
this folder contains reference implementations of specific ops. Note that by including one of these headers you are compiling its implementation (especially the GPU implementation) into your source code, and that kernel goes into the fatbin, which may increase your object code size. Headers starting with `reference_` are usually CPU reference implementations; headers starting with `naive_` contain a GPU implementation with a small launcher.
TODO: move `host/reference` under this folder
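As a quick orientation, below is a minimal sketch (mine, not from the commit) of how the `naive_` GPU reference is consumed, modeled on the `do_validation == 2` path of the fmha_fwd example in this merge. Buffer creation and error checking are elided, and the fp16/bshd settings and the helper's name are illustrative assumptions:

```cpp
#include "ck_tile/ref/naive_attention.hpp"

// Sketch: q/k/v/o are ck_tile::DeviceMem buffers already holding fp16
// tensors in bshd layout; scale_s is the softmax scale.
bool run_naive_fwd_reference(ck_tile::DeviceMem& q, ck_tile::DeviceMem& k,
                             ck_tile::DeviceMem& v, ck_tile::DeviceMem& o,
                             int batch, int nhead, int seqlen, int hdim, float scale_s)
{
    ck_tile::naive_attention_fwd_traits t;
    t.q_type = t.k_type = t.v_type = t.o_type = "fp16";
    t.q_layout = t.k_layout = t.v_layout = t.o_layout = "bshd";
    t.variation = 0; // naive_attention_variation_enum::FLASH_BATCHED

    ck_tile::naive_attention_fwd_args a{}; // zero-init: no paging, no context-len pointer
    a.q_ptr          = q.GetDeviceBuffer();
    a.k_ptr          = k.GetDeviceBuffer();
    a.v_ptr          = v.GetDeviceBuffer();
    a.o_ptr          = o.GetDeviceBuffer();
    a.scale_s        = scale_s;
    a.hdim           = hdim;
    a.hdim_v         = hdim; // self-attention: V hdim equals Q/K hdim
    a.batch_q        = batch;
    a.batch_kv       = batch;
    a.batch_ratio_kv = 1;
    a.seqlen_q       = seqlen;
    a.seqlen_kv      = seqlen;
    a.nhead_q        = nhead;
    a.nhead_kv       = nhead;
    a.nhead_ratio_kv = 1;

    ck_tile::stream_config s{};
    naive_attention_fwd(t, a, s); // found via ADL; launches the naive kernel
    return true;
}
```

After the call, `o` holds the reference output, which can then be compared against an optimized kernel's output with `ck_tile::check_err`, as the fmha_fwd example does.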
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include "ck_tile/host/kernel_launch.hpp"
#include <thread>
#include <string>
namespace ck_tile {
enum class naive_attention_layout_enum
{
BSHD, // [batch, seqlen, nhead, hdim]
BHSD, // [batch, nhead, seqlen, hdim]
BS3HD, // [batch, nhead, 3, seqlen, hdim], used when qkv are packed
PHSD, // [pages, nhead, page_size, hdim]
// PHSDX, // [pages, nhead, page_size/x, hdim, x], where <# used pages>*page_size = seqlen
PHDSX, // [pages, nhead, hdim/x, page_size, x], where <# used pages>*page_size = seqlen
PHDS, // [pages, nhead, hdim, page_size], where <# used pages>*page_size = seqlen
};
// will be used to specialize kernel variation
enum class naive_attention_variation_enum
{
FLASH_BATCHED = 0, // standard flash attention, or xformer/sdpa, used for training
FLASH_GROUPED,
DECODE_PAGED, // decode attn, where kv token from another buffer called kvcache
};
// TODO: for simplicity, this will be used as host/device arg
struct naive_attention_fwd_args
{
void* q_ptr;
void* k_ptr;
void* v_ptr;
void* o_ptr;
void* context_len_ptr; // [batch] used when seqlen kv comes from a pointer (each element is a
// number, not a cumsum)
void* page_table_ptr; // [batch, max_pages_per_seq] seqlen_kv is in different block(paged attn)
void* kvscale_ptr; // [nhead, 2(kv), hdim] used for kvcache dequant
float scale_s;
int hdim;
int hdim_v; // could be cross-attn, where V and Q/K hdim are different
int batch_q;
int batch_kv;
int batch_ratio_kv; // batch_q / batch_kv
int seqlen_q; // in decode case, this should be 1
int seqlen_kv; // if context_len_ptr is not nullptr, ignore this field
int nhead_q;
int nhead_kv;
int nhead_ratio_kv; // nhead_q / nhead_kv
int page_size; // if paged, the seqlen-kv per block
int max_pages_per_seq;
};
// this is trait for host API
struct naive_attention_fwd_traits
{
std::string q_type;
std::string k_type;
std::string v_type;
std::string o_type;
std::string q_layout;
std::string k_layout;
std::string v_layout;
std::string o_layout;
int variation; // sync with naive_attention_variation_enum
};
// this is trait for kernel template
template <naive_attention_variation_enum variation_>
struct naive_attention_fwd_kernel_traits
{
static constexpr naive_attention_variation_enum variation = variation_;
};
// for simplicity, please do not use const-reference type for the template type
template <typename QType,
typename KType,
typename VType,
typename OType,
typename AccType,
naive_attention_layout_enum QLayout,
naive_attention_layout_enum KLayout,
naive_attention_layout_enum VLayout,
naive_attention_layout_enum OLayout,
typename Traits>
struct naive_attention_fwd_kernel
{
static constexpr bool is_kvcache_i8 =
std::is_same_v<KType, int8_t> && std::is_same_v<VType, int8_t> && sizeof(QType) != 1;
// kvcache-i8 has a per-head scale; we apply this scale to the Q/P matrix instead of the original
// K/V matrix. This can speed up conversion since Q/P is usually fp16/bf16/fp32
static constexpr bool is_kvcache_i8_forward_quant = is_kvcache_i8;
// TODO: hardcode
using KVScaleType = float;
using SoftmaxType = float;
using PType = VType; // src A of gemm2, same type as V
using p_vec_type = ext_vector_t<PType, 16 / sizeof(PType)>;
static constexpr int p_vec_elem = vector_traits<p_vec_type>::vector_size;
__host__ __device__ naive_attention_fwd_kernel() {}
template <typename T, naive_attention_layout_enum Layout>
struct addresser
{
int b, s, h, d; // batch, seqlen, nhead, hdim
T* base_ptr;
__device__ addresser(int b_, int s_, int h_, int d_, void* base_ptr_)
: b(b_), s(s_), h(h_), d(d_), base_ptr(reinterpret_cast<T*>(base_ptr_))
{
}
// TODO: all the batch/nhead offsets accumulate into the base pointer
__device__ T* get_base(int i_b, int i_h)
{
if constexpr(Layout == naive_attention_layout_enum::BSHD)
return base_ptr + i_b * s * h * d + i_h * d;
else if constexpr(Layout == naive_attention_layout_enum::BHSD)
return base_ptr + i_b * s * h * d + i_h * s * d;
}
__device__ int get_offset(int i_s, int i_d)
{
if constexpr(Layout == naive_attention_layout_enum::BSHD)
return i_s * h * d + i_d;
else if constexpr(Layout == naive_attention_layout_enum::BHSD)
return i_s * d + i_d;
}
// the APIs below directly use the pointer stored in this struct
__device__ void init(int i_b, int i_h) { base_ptr = get_base(i_b, i_h); }
__device__ T load(int i_s, int i_d) { return base_ptr[get_offset(i_s, i_d)]; }
__device__ void store(T value, int i_s, int i_d) { base_ptr[get_offset(i_s, i_d)] = value; }
};
template <typename T, naive_attention_layout_enum Layout>
struct page_addresser
{
int s, h, d; // page_size, nhead, hdim
static constexpr int x = 16 / sizeof(T); // pack 4 dword
T* base_ptr;
int* page_table_ptr; // TODO: page table always int
int i_h; // store current head
__device__ page_addresser(int s_, int h_, int d_, void* base_ptr_, void* pptr_)
: s(s_),
h(h_),
d(d_),
base_ptr(reinterpret_cast<T*>(base_ptr_)),
page_table_ptr(reinterpret_cast<int*>(pptr_))
{
}
__device__ int64_t get_phy_page_idx(int i_s)
{
// computing the page index dynamically is simple but slow
int page_idx = i_s / s;
int phy = page_table_ptr[page_idx];
return static_cast<int64_t>(phy);
}
__device__ int get_phy_page_offset(int i_s)
{
// computing the in-page offset dynamically is simple but slow
return i_s % s;
}
__device__ int64_t get_offset(int i_s, int i_d)
{
int page_offset = get_phy_page_offset(i_s);
int64_t page_idx = get_phy_page_idx(i_s);
int64_t base_ = page_idx * h * s * d;
if constexpr(Layout == naive_attention_layout_enum::PHSD)
return static_cast<int64_t>(i_h * s * d + page_offset * d + i_d) + base_;
else if constexpr(Layout == naive_attention_layout_enum::PHDSX)
{
int d_r = i_d / x;
int d_x = i_d % x;
return static_cast<int64_t>(i_h * d * s + d_r * s * x + page_offset * x + d_x) +
base_;
}
else if constexpr(Layout == naive_attention_layout_enum::PHDS)
{
return static_cast<int64_t>(i_h * d * s + i_d * s + page_offset) + base_;
}
}
// the APIs below directly use the pointer stored in this struct
__device__ void init(int /*i_b*/, int i_h_) { i_h = i_h_; }
__device__ T load(int i_s, int i_d) { return base_ptr[get_offset(i_s, i_d)]; }
__device__ void store(T /*value*/, int /*i_s*/, int /*i_d*/) {}
};
template <typename T>
struct kvscale_addresser
{
int h, d; // nhead, hdim
T* base_ptr;
__device__ kvscale_addresser(int h_, int d_, void* p_)
: h(h_), d(d_), base_ptr(reinterpret_cast<T*>(p_))
{
}
__device__ int get_offset(int i_h, int i_d, int i_kv /*0 or 1*/)
{
// [h, 2, d]
return i_h * 2 * d + i_kv * d + i_d;
}
__device__ T load(int i_h, int i_d, int i_kv)
{
return base_ptr[get_offset(i_h, i_d, i_kv)];
}
};
__device__ __host__ static constexpr int get_block_size() { return 256; }
// for simplicity, 1 WG always computes 1 token along q and all tokens along kv;
// it computes all hdim from q, and WG_SIZE hdim from v
// 1) in prefill case, seqlen_q >= 1, seqlen_kv >= 1, batch_q=batch_kv
// 2) in decode case, seqlen_q = 1, batch_q is input num-tokens, batch_kv is 1
// 3) in paged-attn case, we still use 1 WG to compute all the seqlen-kv for simplicity
// TODO: could support split-kv to validate intermediate logsum
__host__ static dim3 get_grid_size(naive_attention_fwd_args args)
{
constexpr int wg_size = get_block_size();
auto g =
dim3((args.hdim_v + wg_size - 1) / wg_size, args.seqlen_q, args.batch_q * args.nhead_q);
return g;
}
// reduce a single value within a wave
template <typename T, typename F>
__device__ constexpr T wave_reduce(T local, F reduce_f)
{
// constexpr int wave_size = 64;
constexpr int reduce_stage = 6; // 1<<6=64
T v_local = local;
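// xor butterfly: at stage i, each lane trades values with the lane whose id
// differs in bit i (ds_bpermute addresses lanes in bytes, hence src_lane << 2),
// so after 6 stages every lane of the 64-wide wave holds the full reduction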
#pragma unroll
for(int i_stage = 0; i_stage < reduce_stage; i_stage++)
{
int src_lane = __lane_id() ^ (1 << i_stage);
int32_t v_remote_tmp =
__builtin_amdgcn_ds_bpermute(src_lane << 2, bit_cast<int32_t>(v_local));
T v_remote = bit_cast<T>(v_remote_tmp);
v_local = reduce_f(v_local, v_remote);
}
return v_local;
}
// Note: this function must be called after wave_reduce
// Note: better not to use this under if...else... with thread divergence (it calls __syncthreads)
template <typename T, typename F>
__device__ constexpr T cross_wave_reduce(T local, F reduce_f, T* smem)
{
constexpr int waves = 4;
constexpr int wave_size = 64;
int lane_id = threadIdx.x % wave_size;
__syncthreads();
smem[threadIdx.x] = local;
__syncthreads();
// the data within a single wave is the same
// but for simplicity, we still use data from each lane.
T v_local = smem[lane_id];
#pragma unroll
for(int i_stage = 1; i_stage < waves; i_stage++)
{
T v_remote = smem[i_stage * wave_size + lane_id];
v_local = reduce_f(v_local, v_remote);
}
return v_local;
}
// kernel entry point
__device__ void operator()(naive_attention_fwd_args args)
{
constexpr int wg_size = get_block_size();
__shared__ char smem[wg_size * 4 * sizeof(float)]; // should be enough
int i_dv = blockIdx.x * wg_size + threadIdx.x; // index of hdim_v
int i_sq = blockIdx.y; // index of seqlen_q
int i_batch = blockIdx.z; // index of batch_q * nhead_q
int i_bq = i_batch / args.nhead_q; // index of batch_q
int i_hq = i_batch % args.nhead_q; // index of nhead_q
int i_bk = i_bq / args.batch_ratio_kv;
int i_hk = i_hq / args.nhead_ratio_kv;
void* page_table_ptr = [&]() {
if constexpr(Traits::variation == naive_attention_variation_enum::DECODE_PAGED)
{
return reinterpret_cast<int*>(args.page_table_ptr) + i_bq * args.max_pages_per_seq;
}
else
{
return nullptr;
}
}();
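// Note (assumed semantics, for illustration): each sequence owns a row of
// max_pages_per_seq int page ids, and page_addresser presumably maps a logical
// kv position i_s to physical page page_table_ptr[i_s / page_size] at in-page
// offset i_s % page_size.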
auto q_addr = [&]() {
if constexpr(Traits::variation == naive_attention_variation_enum::FLASH_BATCHED)
{
return addresser<QType, QLayout>{
args.batch_q, args.seqlen_q, args.nhead_q, args.hdim, args.q_ptr};
}
else if constexpr(Traits::variation == naive_attention_variation_enum::DECODE_PAGED)
{
return addresser<QType, QLayout>{
args.batch_q, args.seqlen_q, args.nhead_q, args.hdim, args.q_ptr};
}
}();
auto k_addr = [&]() {
if constexpr(Traits::variation == naive_attention_variation_enum::FLASH_BATCHED)
{
return addresser<KType, KLayout>{
args.batch_kv, args.seqlen_kv, args.nhead_kv, args.hdim, args.k_ptr};
}
else if constexpr(Traits::variation == naive_attention_variation_enum::DECODE_PAGED)
{
return page_addresser<KType, KLayout>{
args.page_size, args.nhead_kv, args.hdim, args.k_ptr, page_table_ptr};
}
}();
auto v_addr = [&]() {
if constexpr(Traits::variation == naive_attention_variation_enum::FLASH_BATCHED)
{
return addresser<VType, VLayout>{
args.batch_kv, args.seqlen_kv, args.nhead_kv, args.hdim_v, args.v_ptr};
}
else if constexpr(Traits::variation == naive_attention_variation_enum::DECODE_PAGED)
{
return page_addresser<VType, VLayout>{
args.page_size, args.nhead_kv, args.hdim_v, args.v_ptr, page_table_ptr};
}
}();
auto o_addr = [&]() {
if constexpr(Traits::variation == naive_attention_variation_enum::FLASH_BATCHED)
{
return addresser<OType, OLayout>{
args.batch_q, args.seqlen_q, args.nhead_q, args.hdim_v, args.o_ptr};
}
else if constexpr(Traits::variation == naive_attention_variation_enum::DECODE_PAGED)
{
return addresser<OType, OLayout>{
args.batch_q, args.seqlen_q, args.nhead_q, args.hdim_v, args.o_ptr};
}
}();
q_addr.init(i_bq, i_hq);
k_addr.init(i_bk, i_hk);
v_addr.init(i_bk, i_hk);
o_addr.init(i_bq, i_hq);
auto f_max = [](auto x_, auto y_) { return max(x_, y_); };
auto f_sum = [](auto x_, auto y_) { return x_ + y_; };
auto f_absmax_f32 = [](float v_0_, float v_1_) {
float rtn;
asm volatile("v_max_f32 %0, abs(%1), abs(%2)" : "=v"(rtn) : "v"(v_0_), "v"(v_1_));
return rtn;
};
int seqlen_kv = [&]() {
if constexpr(Traits::variation == naive_attention_variation_enum::FLASH_BATCHED)
{
return args.seqlen_kv;
}
else if constexpr(Traits::variation == naive_attention_variation_enum::DECODE_PAGED)
{
return reinterpret_cast<int*>(args.context_len_ptr)[i_bq];
}
}();
SoftmaxType row_max = -numeric<SoftmaxType>::infinity();
SoftmaxType l{0};
AccType o_acc = {0};
int sk_loops = (seqlen_kv + wg_size - 1) / wg_size;
float qf_scale = .0f;
kvscale_addresser<KVScaleType> kvscale_addr{args.nhead_kv, args.hdim, args.kvscale_ptr};
if constexpr(is_kvcache_i8_forward_quant)
{
// AccType is i32 now, seqlen_q = 1, hdim up to 256
float q = 0;
float k_s = 0;
if(static_cast<int>(threadIdx.x) < args.hdim)
{
q = type_convert<float>(q_addr.load(0, threadIdx.x));
k_s = type_convert<float>(kvscale_addr.load(i_hk, threadIdx.x, 0));
}
// 1) we apply the k scale to q
float q_forwarded = q * k_s;
// 2) apply smooth-quant
// find absmax
float qf_max = wave_reduce(q_forwarded, f_absmax_f32);
qf_max = cross_wave_reduce(qf_max, f_absmax_f32, reinterpret_cast<float*>(smem));
// per-token scale
qf_scale = qf_max / 127.0;
// divide by scale
q = q / qf_scale;
// fp32->i8
int8_t quantized_q = static_cast<int8_t>(q);
__syncthreads();
reinterpret_cast<int8_t*>(smem)[threadIdx.x] = quantized_q;
__syncthreads();
// after the above process, we have two pieces of data:
// 1) the int8 q data stored in smem (no need to reload)
// 2) the per-token scale qf_scale, multiplied back in after the 1st gemm
}
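// Restating the quantization above as math (descriptive comment, not a spec):
//   qf_scale = max_d |q[d] * k_scale[d]| / 127    (per-token smooth-quant scale)
//   q_i8[d]  = (int8)(q[d] / qf_scale)
// so the i32 dot products of gemm-1 are later multiplied back by qf_scale
// (together with scale_s) to land in the fp32 softmax domain.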
for(int i_loop1 = 0; i_loop1 < sk_loops; i_loop1++)
{
int i_sk = i_loop1 * wg_size + threadIdx.x;
// gemm-1
SoftmaxType s_softmax = -numeric<SoftmaxType>::infinity();
if(i_sk < seqlen_kv)
{
AccType s_acc{0}; // clear for every loop
for(auto i_dq = 0; i_dq < args.hdim; i_dq++)
{
if constexpr(is_kvcache_i8_forward_quant)
{
int8_t q = reinterpret_cast<int8_t*>(smem)[i_dq];
auto k = k_addr.load(i_sk, i_dq);
s_acc += type_convert<AccType>(q) * type_convert<AccType>(k);
}
else
{
auto q = q_addr.load(i_sq, i_dq); // q is redundantly loaded by every thread
auto k = k_addr.load(i_sk, i_dq);
s_acc += type_convert<AccType>(q) * type_convert<AccType>(k);
}
}
// scale
s_softmax = type_convert<SoftmaxType>(s_acc);
s_softmax *=
type_convert<SoftmaxType>(args.scale_s * ck_tile::log2e_v<SoftmaxType>);
if constexpr(is_kvcache_i8_forward_quant)
{
s_softmax *= qf_scale; // post-scale by the per-token factor
}
}
// s->p
float pf_scale = 0.; // used for i8 quant
{
// softmax, find max
SoftmaxType old_max = row_max;
SoftmaxType cur_max = wave_reduce(s_softmax, f_max);
cur_max = cross_wave_reduce(cur_max, f_max, reinterpret_cast<SoftmaxType*>(smem));
row_max = max(old_max, cur_max); // update row_max
// softmax, exp(i_elem - max)
SoftmaxType p_compute = __builtin_amdgcn_exp2f(s_softmax - row_max);
// compute exp_sum
SoftmaxType row_sum = wave_reduce(p_compute, f_sum);
row_sum = cross_wave_reduce(row_sum, f_sum, reinterpret_cast<SoftmaxType*>(smem));
// update l, pre-scale o_acc
SoftmaxType tmp = __builtin_amdgcn_exp2f(old_max - row_max);
l = tmp * l + row_sum;
o_acc = type_convert<AccType>(type_convert<SoftmaxType>(o_acc) * tmp);
// stage p_compute in smem, so that every thread reads the same p_compute for
// the 2nd gemm
if constexpr(is_kvcache_i8_forward_quant)
{
float v_s = 0;
if(static_cast<int>(threadIdx.x) < args.hdim_v)
{
v_s = type_convert<float>(kvscale_addr.load(i_hk, threadIdx.x, 1));
}
// 1) we apply the v scale to p
float p_forwarded = p_compute * v_s;
// 2) apply smooth-quant
// find absmax
float pf_max = wave_reduce(p_forwarded, f_absmax_f32);
pf_max =
cross_wave_reduce(pf_max, f_absmax_f32, reinterpret_cast<float*>(smem));
// per-token scale
pf_scale = pf_max / 127.0;
// divide by scale
p_compute = p_compute / pf_scale;
// fp32->i8
int8_t quantized_p = static_cast<int8_t>(p_compute);
__syncthreads();
reinterpret_cast<int8_t*>(smem)[threadIdx.x] = quantized_p;
__syncthreads();
// after the above process, we have two pieces of data:
// 1) the int8 p data stored in smem (no need to reload)
// 2) the per-token scale pf_scale, multiplied back in after the 2nd gemm
}
else
{
__syncthreads();
reinterpret_cast<PType*>(smem)[threadIdx.x] = type_convert<PType>(p_compute);
__syncthreads();
}
}
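// The block above is the standard online-softmax update, written in base-2
// (scale_s was pre-multiplied by log2e so exp2 can be used):
//   m_new = max(m_old, max_j s_j)
//   l     = exp2(m_old - m_new) * l + sum_j exp2(s_j - m_new)
//   o_acc = exp2(m_old - m_new) * o_acc
// so the single division by l after the kv loop yields softmax(s) @ V.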
// gemm-2, a simple loop, one p vector at a time
constexpr int gemm_2_loop = wg_size / p_vec_elem;
{
AccType o_acc_local = {0};
int sk_start = i_loop1 * wg_size; // first kv position of this outer-loop iteration
for(int i_loop2 = 0; i_loop2 < gemm_2_loop; i_loop2++)
{
p_vec_type p_vec = reinterpret_cast<p_vec_type*>(smem)[i_loop2];
#pragma unroll
for(int i_j = 0; i_j < p_vec_elem; i_j++)
{
int sv_offset = i_loop2 * p_vec_elem + i_j;
int i_sv = sk_start + sv_offset;
VType v = 0.f;
if(i_dv < args.hdim_v && i_sv < seqlen_kv)
{
v = v_addr.load(i_sv, i_dv);
}
o_acc_local += type_convert<AccType>(p_vec[i_j]) * type_convert<AccType>(v);
}
}
if constexpr(is_kvcache_i8_forward_quant)
{
// apply the per-token p scale (pf_scale) to the local acc
o_acc_local =
type_convert<AccType>(type_convert<float>(o_acc_local) * pf_scale);
}
o_acc += o_acc_local;
}
}
// post scale o_acc
{
SoftmaxType tmp = l == 0.f ? 0.f : 1.f / l; // in case of fully-masked rows
o_acc = type_convert<AccType>(type_convert<SoftmaxType>(o_acc) * tmp);
}
// store O
if(i_dv < args.hdim_v)
o_addr.store(type_convert<OType>(o_acc), i_sq, i_dv);
}
};
#define CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_INTERNAL_() \
{ \
using ktraits_ = \
naive_attention_fwd_kernel_traits<static_cast<naive_attention_variation_enum>( \
variation_)>; \
using k_ = naive_attention_fwd_kernel<q_type_, \
k_type_, \
v_type_, \
o_type_, \
acc_type_, \
q_layout_, \
k_layout_, \
v_layout_, \
o_layout_, \
ktraits_>; \
dim3 grids = k_::get_grid_size(a); \
r = ck_tile::launch_kernel(s, \
ck_tile::make_kernel(k_{}, grids, k_::get_block_size(), 0, a)); \
}
#define CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_() \
if(t.variation == 0 && t.q_layout == "bshd" && t.k_layout == "bshd" && t.v_layout == "bshd" && \
t.o_layout == "bshd") \
{ \
constexpr auto q_layout_ = naive_attention_layout_enum::BSHD; \
constexpr auto k_layout_ = naive_attention_layout_enum::BSHD; \
constexpr auto v_layout_ = naive_attention_layout_enum::BSHD; \
constexpr auto o_layout_ = naive_attention_layout_enum::BSHD; \
constexpr int variation_ = 0; \
CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_INTERNAL_(); \
} \
else if(t.variation == 0 && t.q_layout == "bhsd" && t.k_layout == "bhsd" && \
t.v_layout == "bhsd" && t.o_layout == "bhsd") \
{ \
constexpr auto q_layout_ = naive_attention_layout_enum::BHSD; \
constexpr auto k_layout_ = naive_attention_layout_enum::BHSD; \
constexpr auto v_layout_ = naive_attention_layout_enum::BHSD; \
constexpr auto o_layout_ = naive_attention_layout_enum::BHSD; \
constexpr int variation_ = 0; \
CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_INTERNAL_(); \
} \
else if(t.variation == 2 && t.q_layout == "bhsd" && t.k_layout == "phdsx" && \
t.v_layout == "phds" && t.o_layout == "bhsd") \
{ \
constexpr auto q_layout_ = naive_attention_layout_enum::BHSD; \
constexpr auto k_layout_ = naive_attention_layout_enum::PHDSX; \
constexpr auto v_layout_ = naive_attention_layout_enum::PHDS; \
constexpr auto o_layout_ = naive_attention_layout_enum::BHSD; \
constexpr int variation_ = 2; \
CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_INTERNAL_(); \
}
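// Extension sketch (hypothetical, not in the source): supporting another
// combination means appending an "else if" branch above that maps the runtime
// strings onto the matching naive_attention_layout_enum constants plus the
// variation_ int; any (variation, layout) tuple without a branch simply falls
// through, and naive_attention_fwd() below returns r = -1.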
//
CK_TILE_HOST float naive_attention_fwd(naive_attention_fwd_traits t,
naive_attention_fwd_args a,
ck_tile::stream_config s)
{
float r = -1;
// TODO: do not explicitly create too many instances!
if(t.q_type == "fp16" && t.k_type == "fp16" && t.v_type == "fp16" && t.o_type == "fp16")
{
using q_type_ = fp16_t;
using k_type_ = fp16_t;
using v_type_ = fp16_t;
using o_type_ = fp16_t;
using acc_type_ = float;
CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_();
}
else if(t.q_type == "bf16" && t.k_type == "bf16" && t.v_type == "bf16" && t.o_type == "bf16")
{
using q_type_ = bf16_t;
using k_type_ = bf16_t;
using v_type_ = bf16_t;
using o_type_ = bf16_t;
using acc_type_ = float;
CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_();
}
else if(t.q_type == "bf16" && t.k_type == "int8" && t.v_type == "int8" && t.o_type == "bf16")
{
using q_type_ = bf16_t;
using k_type_ = int8_t;
using v_type_ = int8_t;
using o_type_ = bf16_t;
using acc_type_ = int32_t; // NOTE!
CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_();
}
else if(t.q_type == "fp16" && t.k_type == "int8" && t.v_type == "int8" && t.o_type == "fp16")
{
using q_type_ = fp16_t;
using k_type_ = int8_t;
using v_type_ = int8_t;
using o_type_ = fp16_t;
using acc_type_ = int32_t; // NOTE!
CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_();
}
return r;
}
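// Host-side usage sketch (illustrative; only fields referenced above are set,
// everything else about the structs is an assumption):
//   naive_attention_fwd_traits t{};
//   t.q_type = t.k_type = t.v_type = t.o_type = "fp16";
//   t.q_layout = t.k_layout = t.v_layout = t.o_layout = "bshd";
//   t.variation = 0; // batched flash-style path
//   naive_attention_fwd_args a{}; // fill pointers, sizes, scale_s, ...
//   float r = naive_attention_fwd(t, a, ck_tile::stream_config{});
//   // r stays -1 when no instantiation matches the requested types/layouts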
#undef CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_
#undef CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_INTERNAL_
} // namespace ck_tile
...@@ -7,6 +7,7 @@ import copy
NS = 'ck_tile'
OPS = 'ops'
REF = 'ref'
OPS_COMMON = 'common' # common header will be duplicated into ops/* other module
HEADER_COMMON = f"""// SPDX-License-Identifier: MIT
...@@ -29,6 +30,9 @@ class submodule_t:
    def push(self, f):
        if len(f.parents) != 1: # ignore ./xxx.hpp
            mod = get_module(f)
            # ref is supposed to include one header on demand
            if mod == REF:
                return
            if mod == OPS:
                if mod not in self.m.keys():
                    self.m[mod] = dict()
......