Merge branch 'ck_tile/layernorm2d_fwd_optimize' into ck_tile/ln_add_cache_clear

0475a327 · dummycoderfe · c9b961ab · 27ff3dec · 0475a327 · 0475a327
Commit 0475a327 authored Nov 04, 2024 by dummycoderfe
20 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -137,7 +137,7 @@ if(GPU_TARGETS)
 else()
    set(USER_GPU_TARGETS 0)
 endif()
-find_package(hip)
+find_package(hip REQUIRED)
 # No assumption that HIP kernels are launched with uniform block size for backward compatibility
 # SWDEV-413293 and https://reviews.llvm.org/D155213
 math(EXPR hip_VERSION_FLAT "(${hip_VERSION_MAJOR} * 1000 + ${hip_VERSION_MINOR}) * 100000 + ${hip_VERSION_PATCH}")
@@ -170,7 +170,10 @@ else()
        set(CK_GPU_TARGETS ${GPU_TARGETS})
    endif()
 endif()
+#if the user did not set GPU_TARGETS, delete whatever was set by HIP package
+if(NOT USER_GPU_TARGETS)
+    set(GPU_TARGETS "")
+endif()
 #make sure all the targets on the list are actually supported by the current compiler
 rocm_check_target_ids(SUPPORTED_GPU_TARGETS
        TARGETS ${CK_GPU_TARGETS})
@@ -187,6 +190,10 @@ if (SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx1
    add_definitions(-DCK_USE_WMMA)
    set(CK_USE_WMMA "ON")
 endif()
+option(CK_USE_FP8_ON_UNSUPPORTED_ARCH "Enable FP8 GEMM instances on older architectures" OFF)
+if(CK_USE_FP8_ON_UNSUPPORTED_ARCH AND (SUPPORTED_GPU_TARGETS MATCHES "gfx90a" OR SUPPORTED_GPU_TARGETS MATCHES "gfx908"))
+    add_definitions(-DCK_USE_FP8_ON_UNSUPPORTED_ARCH)
+endif()
 # CK config file to record supported datatypes, etc.
 configure_file(include/ck/config.h.in ${CMAKE_CURRENT_BINARY_DIR}/include/ck/config.h)
@@ -314,7 +321,6 @@ link_libraries(${OpenMP_gomp_LIBRARY})
 link_libraries(${OpenMP_pthread_LIBRARY})
 ## HIP
-find_package(HIP REQUIRED)
 # Override HIP version in config.h, if necessary.
 # The variables set by find_package() can't be overwritten,
 # therefore let's use intermediate variables.

--- a/README.md
+++ b/README.md
@@ -137,12 +137,11 @@ Docker images are available on [DockerHub](https://hub.docker.com/r/rocm/composa
    You can find instructions for running ckProfiler in [profiler](/profiler).
-Note the `-j` option for building with multiple threads in parallel. This speeds up the build significantly.
+Note the `-j` option for building with multiple threads in parallel, which speeds up the build significantly.
+However, `-j` launches an unlimited number of threads, which can cause the build to run out of memory and
+crash. On average, you should expect each thread to use ~2 GB of RAM.
 Depending on the number of CPU cores and the amount of RAM on your system, you may want to
-limit the number of threads. For example, if you have a 128-core CPU and 64 Gb of RAM.
+limit the number of threads. For example, if you have a 128-core CPU and 128 GB of RAM, it's advisable to use `-j32`.
-By default, `-j` launches one thread per CPU core, which can cause the build to run out of memory and
-crash. In such cases, you can reduce the number of threads to 32 by using `-j32`.
 Additional cmake flags can be used to significantly speed-up the build:
@@ -154,6 +153,11 @@ Additional cmake flags can be used to significantly speed-up the build:
  `batched_gemm_multi_d_dl`. These instances are useful on architectures like the NAVI2x, as most
  other platforms have faster instances, such as `xdl` or `wmma`, available.
+* `CK_USE_FP8_ON_UNSUPPORTED_ARCH` (default is OFF) must be set to ON in order to build fp8 instances,
+  such as `gemm_universal` and `gemm_multiply_multiply`, for GPU targets which do not
+  have native support for the fp8 data type, such as gfx908 or gfx90a. These instances are useful on
+  architectures like the MI100/MI200 for functional support only.
 ## Using sccache for building
 The default CK Docker images come with a pre-installed version of sccache, which supports clang

--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
@@ -21,6 +21,14 @@ DTYPE_BITS = {
    "bf8" : 8
 }
+K0_MAX_SUBMAX_MAP = {
+    32 : 32,
+    64 : 64,
+    96 : 128,
+    128: 128,
+    256: 256
+}
 TILE_PARTITIONER_MAP = {
    "shb" : "ck_tile::FmhaFwdTilePartitioner_SHB",
    "hbs" : "ck_tile::FmhaFwdTilePartitioner_HBS",
@@ -35,7 +43,7 @@ FMHA_FWD_KERNEL_HEADER = """// SPDX-License-Identifier: MIT
 FMHA_FWD_KERNEL_BODY="""
 using fmha_dtype_{F_idx} = {F_dtype};
-using fmha_block_tile_{F_idx} = ck_tile::sequence<{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0blen}>;
+using fmha_block_tile_{F_idx} = ck_tile::sequence<{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}>;
 using fmha_warp_tile_{F_idx} = ck_tile::sequence<{F_wm}, {F_wn}, {F_wk}>;
 using fmha_shape_{F_idx} = ck_tile::TileFmhaShape<fmha_block_tile_{F_idx},
@@ -87,7 +95,7 @@ using fmha_kernel_{F_idx} =
                  fmha_pipeline_{F_idx},
                  fmha_epilogue_{F_idx}>;
-using trait_{F_idx} = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode},{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0blen}, {F_vlayout},
+using trait_{F_idx} = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode},{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout},
                        {F_pipeline_enum}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_dropout}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>;
 #include <iostream>
@@ -125,7 +133,7 @@ FMHA_FWD_API_PER_HDIM_CASE="""        {F_if} (t.hdim_q <= {F_hdim} && t.hdim_v <
 FMHA_FWD_API_INNER_DISPATCH="""            {F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.has_lse == {F_lse})  && (t.has_dropout == {F_dropout}) && (t.do_fp8_static_quant == {F_squant}) &&
                        ({F_scheck}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck})) {{
-                using trait_ = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0blen}, {F_vlayout}, {F_pipeline_enum}, {F_mask}, {F_bias}, {F_lse}, {F_dropout}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>;
+                using trait_ = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_mask}, {F_bias}, {F_lse}, {F_dropout}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>;
                return fmha_fwd_<trait_>(s, a);
            }}
 """
@@ -142,7 +150,7 @@ class FmhaFwdApiTrait:
    bk0       : int  # tile size along qk gemm unroll
    bn1       : int  # tile size along v head_dim
    bk1       : int  # tile size along kv gemm unroll
-    bk0blen   : int
+    bk0max    : int
    vlayout   : str
    mask      : str
    bias      : str  #
@@ -156,7 +164,7 @@ class FmhaFwdApiTrait:
    @property
    def name(self) -> str:
-        return f'{self.hdim}-{self.dtype}-{self.mode}-{self.bm0}-{self.bn0}-{self.bk0}-{self.bn0}-{self.bk1}-{self.bk0blen}-'+\
+        return f'{self.hdim}-{self.dtype}-{self.mode}-{self.bm0}-{self.bn0}-{self.bk0}-{self.bn0}-{self.bk1}-{self.bk0max}-'+\
                    f'{self.vlayout}-{self.mask}-{self.bias}-{self.lse}-{self.dropout}-{self.squant}-{self.spad}-{self.skpad}-{self.dpad}-{self.dvpad}'
    @property
@@ -188,8 +196,9 @@ class FmhaFwdApiTrait:
            if self.dpad == 't': return f'a.hdim_q % {vec} == 0'
            else :               assert False
        elif self.pipeline_tag in ['qr']:
-            if self.dpad == 't': return f'true /*a.hdim_q % {self.bk0blen} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
+            bk0submax = K0_MAX_SUBMAX_MAP[self.bk0max]
-            else :               return f'a.hdim_q % {self.bk0blen} == 0'
+            if self.dpad == 't': return f'true /*a.hdim_q % {bk0submax} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
+            else :               return f'a.hdim_q % {bk0submax} == 0'
        else:   assert False
    @property
@@ -199,8 +208,9 @@ class FmhaFwdApiTrait:
            if self.dvpad == 't': return f'a.hdim_v % {vec} == 0'
            else :                assert False
        elif self.pipeline_tag in ['qr']:
-            if self.dvpad == 't': return f'true /*a.hdim_v % {self.bk0blen} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
+            bk0submax = K0_MAX_SUBMAX_MAP[self.bk0max]
-            else :                return f'a.hdim_v % {self.bk0blen} == 0'
+            if self.dvpad == 't': return f'true /*a.hdim_v % {bk0submax} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
+            else :                return f'a.hdim_v % {bk0submax} == 0'
        else:   assert False
 @dataclass
@@ -271,7 +281,7 @@ class FmhaFwdApiPool:
                                   F_lse=BOOL_MAP[trait.lse], F_dropout=BOOL_MAP[trait.dropout] ,
                                   F_squant=BOOL_MAP[trait.squant], F_scheck=trait.scheck, F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck,
                                   F_spad=BOOL_MAP[trait.spad], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad],
-                                   F_bm0=trait.bm0, F_bn0=trait.bn0, F_bk0=trait.bk0, F_bn1=trait.bn1, F_bk1=trait.bk1, F_bk0blen=trait.bk0blen,
+                                   F_bm0=trait.bm0, F_bn0=trait.bn0, F_bk0=trait.bk0, F_bn1=trait.bn1, F_bk1=trait.bk1, F_bk0max=trait.bk0max,
                                   F_hdim=hdim, F_dtype=DTYPE_MAP[dtype])
                if_j = 'if' if j == 0 else 'else if'
                per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_inner_dispatch=inners)
@@ -289,7 +299,7 @@ class FmhaFwdTileSize:
    F_bk0       : int  # tile size along qk gemm unroll
    F_bn1       : int  # tile size along v head_dim
    F_bk1       : int  # tile size along kv gemm unroll
-    F_bk0blen   : int  # total length of K0, used for pipeline that need load Q at once (or repeately load Q as a whole tile)
+    F_bk0max    : int  # total length of K0, used for pipeline that need load Q at once (or repeately load Q as a whole tile)
    F_rm0       : int  # number of warps for gemm0 along q seqlen
    F_rn0       : int  # number of warps for gemm0 along k seqlen 
    F_rk0       : int  # number of warps for gemm0 along head dim q (not used)
@@ -302,7 +312,7 @@ class FmhaFwdTileSize:
    F_occupancy : int  # occupancy, -1 will let pipeline decide the occupancy, other value will overwrite occupancy
    @property
    def name(self) -> str:
-        return f"b{self.F_bm0}x{self.F_bn0}x{self.F_bk0}x{self.F_bn1}x{self.F_bk1}x{self.F_bk0blen}" +\
+        return f"b{self.F_bm0}x{self.F_bn0}x{self.F_bk0}x{self.F_bn1}x{self.F_bk1}x{self.F_bk0max}" +\
        f"_r{self.F_rm0}x{self.F_rn0}x{self.F_rk0}_r{self.F_rm1}x{self.F_rn1}x{self.F_rk1}" +\
        f"_w{self.F_wm}x{self.F_wn}x{self.F_wk}" + ("" if self.F_occupancy == -1 else f"_o{self.F_occupancy}")
@@ -335,7 +345,7 @@ class FmhaFwdKernel:
                F_bk0           = self.F_tile.F_bk0,
                F_bn1           = self.F_tile.F_bn1,
                F_bk1           = self.F_tile.F_bk1,
-                F_bk0blen       = self.F_tile.F_bk0blen,
+                F_bk0max        = self.F_tile.F_bk0max,
                F_rm0           = self.F_tile.F_rm0,
                F_rn0           = self.F_tile.F_rn0,
                F_rk0           = self.F_tile.F_rk0,
@@ -382,7 +392,7 @@ class FmhaFwdKernel:
                bk0=self.F_tile.F_bk0,
                bn1=self.F_tile.F_bn1,
                bk1=self.F_tile.F_bk1,
-                bk0blen=self.F_tile.F_bk0blen,
+                bk0max=self.F_tile.F_bk0max,
                vlayout=self.F_pipeline.F_vlayout,
                mask=self.F_pipeline.F_mask,
                bias=self.F_pipeline.F_bias,
@@ -401,6 +411,7 @@ def get_fmha_fwd_tile_dict_from_dtype(dtype : str) -> Optional[dict]:
        return {
            '32'  : FmhaFwdTileSize(128, 64, 16, 32, 32, 32,     2, 1, 1,  2, 1, 1,  32, 32, 16, -1),
            '64'  : FmhaFwdTileSize(128, 64, 32, 64, 32, 64,     4, 1, 1,  4, 1, 1,  32, 32, 16, -1),
+            ## '96'  : FmhaFwdTileSize(128, 128, 32, 128, 32, 96,   4, 1, 1,  4, 1, 1,  32, 32, 16, -1),
            '128' : FmhaFwdTileSize(128, 128, 32, 128, 32, 128,  4, 1, 1,  4, 1, 1,  32, 32, 16, -1),
            '256' : FmhaFwdTileSize(128, 128, 32, 256, 32, 256,  4, 1, 1,  4, 1, 1,  32, 32, 16, -1),
        }
@@ -510,4 +521,4 @@ def list_blobs(file_path : Path, kernel_filter : Optional[str], receipt, mask_im
        _, kernels = get_fwd_blobs(kernel_filter, receipt, mask_impl)
        for kernel in kernels:
            f.write(str(file_path.parent / GEN_DIR / kernel.filename) + "\n")
        f.write(str(file_path.parent / GEN_DIR / FMHA_FWD_API_FILENAME) + "\n")
\ No newline at end of file
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
@@ -29,6 +29,14 @@ DTYPE_BITS = {
    "bf8" : 8
 }
+K0_MAX_SUBMAX_MAP = {
+    32 : 32,
+    64 : 64,
+    96 : 128,
+    128: 128,
+    256: 256
+}
 FMHA_FWD_SPLITKV_PIPELINE_MAP = {
    "qr" : "ck_tile::BlockFmhaFwdSplitKVPipelineQRKSVS",
    "qr_async" : "ck_tile::BlockFmhaFwdSplitKVPipelineQRKSVSAsync",
@@ -41,7 +49,7 @@ using fmha_mask_{F_idx} = {F_mask};
 namespace {{
 template <bool kHasUnevenSplits>
 struct kernel_runner {{
-using fmha_block_tile = ck_tile::sequence<{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0blen}>;
+using fmha_block_tile = ck_tile::sequence<{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}>;
 using fmha_warp_tile = ck_tile::sequence<{F_wm}, {F_wn}, {F_wk}>;
 using fmha_shape = ck_tile::TileFmhaShape<fmha_block_tile,
@@ -103,7 +111,7 @@ static void run(const ck_tile::stream_config& s, fmha_fwd_splitkv_args a)
 }};
 }}
-using trait_{F_idx} = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0blen}, {F_vlayout},
+using trait_{F_idx} = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout},
                        {F_pipeline_enum}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad}, 
                        {F_dvpad}>;
@@ -241,7 +249,7 @@ float fmha_fwd_splitkv(fmha_fwd_splitkv_traits t, fmha_fwd_splitkv_args a, const
 FMHA_FWD_SPLITKV_API_INNER_DISPATCH="""            {F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.has_lse == {F_lse}) && (t.do_fp8_static_quant == {F_squant}) &&
                        ((a.block_table_ptr != nullptr) == {F_pagedkv}) && ({F_scheck}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck})) {{
-                using traits_ = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0blen}, {F_vlayout}, {F_pipeline_enum}, {F_mask}, {F_bias}, {F_lse}, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>;
+                using traits_ = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_mask}, {F_bias}, {F_lse}, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>;
                using traits2_ = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}/2, {F_bn1}/2, {F_lse}, {F_squant}, {F_spad}, {F_dvpad}>;
                return fmha_fwd_splitkv_<traits_, traits2_>(s, a);
@@ -260,7 +268,7 @@ class FmhaFwdSplitKVApiTrait:
    bk0       : int  # tile size along qk gemm unroll
    bn1       : int  # tile size along v head_dim
    bk1       : int  # tile size along kv gemm unroll
-    bk0blen   : int
+    bk0max    : int
    vlayout   : str
    mask      : str
    bias      : str  #
@@ -270,11 +278,11 @@ class FmhaFwdSplitKVApiTrait:
    skpad     : str
    dpad      : str
    dvpad     : str
-    pagedkv : str
+    pagedkv   : str
    @property
    def name(self) -> str:
-        return f'{self.hdim}-{self.dtype}-{self.mode}-{self.bm0}-{self.bn0}-{self.bk0}-{self.bn0}-{self.bk1}-{self.bk0blen}-'+\
+        return f'{self.hdim}-{self.dtype}-{self.mode}-{self.bm0}-{self.bn0}-{self.bk0}-{self.bn0}-{self.bk1}-{self.bk0max}-'+\
                    f'{self.vlayout}-{self.mask}-{self.bias}-{self.lse}-{self.squant}-{self.spad}-{self.skpad}-{self.dpad}-'+\
                    f'{self.dvpad}-{self.pagedkv}'
@@ -307,8 +315,9 @@ class FmhaFwdSplitKVApiTrait:
            if self.dpad == 't': return f'a.hdim_q % {vec} == 0'
            else :               assert False
        elif self.pipeline_tag in ['qr']:
-            if self.dpad == 't': return f'true /*a.hdim_q % {self.bk0blen} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
+            bk0submax = K0_MAX_SUBMAX_MAP[self.bk0max]
-            else :               return f'a.hdim_q % {self.bk0blen} == 0'
+            if self.dpad == 't': return f'true /*a.hdim_q % {bk0submax} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
+            else :               return f'a.hdim_q % {bk0submax} == 0'
        else:   assert False
    @property
@@ -318,8 +327,9 @@ class FmhaFwdSplitKVApiTrait:
            if self.dvpad == 't': return f'a.hdim_v % {vec} == 0'
            else :                assert False
        elif self.pipeline_tag in ['qr']:
-            if self.dvpad == 't': return f'true /*a.hdim_v % {self.bk0blen} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
+            bk0submax = K0_MAX_SUBMAX_MAP[self.bk0max]
-            else :                return f'a.hdim_v % {self.bk0blen} == 0'
+            if self.dvpad == 't': return f'true /*a.hdim_v % {bk0submax} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
+            else :                return f'a.hdim_v % {bk0submax} == 0'
        else:   assert False
 @dataclass
@@ -414,7 +424,7 @@ class FmhaFwdSplitKVApiPool:
                                   F_lse=BOOL_MAP[trait.lse], F_squant=BOOL_MAP[trait.squant], F_pagedkv=BOOL_MAP[trait.pagedkv], 
                                   F_scheck=trait.scheck, F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck,
                                   F_spad=BOOL_MAP[trait.spad], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad],
-                                   F_bm0=trait.bm0, F_bn0=trait.bn0, F_bk0=trait.bk0, F_bn1=trait.bn1, F_bk1=trait.bk1, F_bk0blen=trait.bk0blen,
+                                   F_bm0=trait.bm0, F_bn0=trait.bn0, F_bk0=trait.bk0, F_bn1=trait.bn1, F_bk1=trait.bk1, F_bk0max=trait.bk0max,
                                   F_hdim=hdim, F_dtype=DTYPE_MAP[dtype])
                if_j = 'if' if j == 0 else 'else if'
                per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_inner_dispatch=inners)
@@ -458,7 +468,7 @@ class FmhaFwdSplitKVKernel:
                F_bk0           = self.F_tile.F_bk0,
                F_bn1           = self.F_tile.F_bn1,
                F_bk1           = self.F_tile.F_bk1,
-                F_bk0blen       = self.F_tile.F_bk0blen,
+                F_bk0max        = self.F_tile.F_bk0max,
                F_rm0           = self.F_tile.F_rm0,
                F_rn0           = self.F_tile.F_rn0,
                F_rk0           = self.F_tile.F_rk0,
@@ -504,7 +514,7 @@ class FmhaFwdSplitKVKernel:
                bk0=self.F_tile.F_bk0,
                bn1=self.F_tile.F_bn1,
                bk1=self.F_tile.F_bk1,
-                bk0blen=self.F_tile.F_bk0blen,
+                bk0max=self.F_tile.F_bk0max,
                vlayout=self.F_pipeline.F_vlayout,
                mask=self.F_pipeline.F_mask,
                bias=self.F_pipeline.F_bias,
@@ -559,6 +569,7 @@ def get_fmha_fwd_tile_dict_from_dtype(dtype : str) -> Optional[dict]:
        return {
            '32'  : FmhaFwdTileSize(32, 64,  16, 32,  32,  32,   2, 1, 1,  2, 1, 1,  16, 16, 16, -1),
            '64'  : FmhaFwdTileSize(64, 64,  32, 64,  32,  64,   4, 1, 1,  4, 1, 1,  16, 16, 16, -1),
+            ## '96'  : FmhaFwdTileSize(64, 128, 32, 128, 32,  96,   4, 1, 1,  4, 1, 1,  16, 16, 16, -1),
            '128' : FmhaFwdTileSize(64, 128, 32, 128, 32,  128,  4, 1, 1,  4, 1, 1,  16, 16, 16, -1),
            '256' : FmhaFwdTileSize(64, 128, 32, 256, 32,  256,  4, 1, 1,  4, 1, 1,  16, 16, 16, -1),
        }
@@ -576,6 +587,7 @@ def get_fmha_fwd_splitkv_combine_tile_dict_from_dtype(dtype : str) -> Optional[d
        return {
            '32'  : FmhaFwdSplitKVCombineTileSize(16, 16,  -1),
            '64'  : FmhaFwdSplitKVCombineTileSize(32, 32,  -1),
+            ## '96' : FmhaFwdSplitKVCombineTileSize(32, 64,  -1),
            '128' : FmhaFwdSplitKVCombineTileSize(32, 64,  -1),
            '256' : FmhaFwdSplitKVCombineTileSize(32, 128, -1),
    }
@@ -604,7 +616,7 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
        if dtype in ['fp16', 'bf16']:
            for mask, bias, lse, pagedkv in itertools.product(get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"], ["t", "f"]):
                # TODO: use async pipeline when compiler is more stable 
-                if hdim == 256 or hdim in [32, 64, 128]:
+                if hdim == 256 or hdim in [32, 64, 128]:         ### [32, 64, 96, 128]:
                # if True:
                    pipelines.append(Pipeline('qr', 'row', 'f', 't', 'f', 'f', bias, lse, squant, pagedkv, mask))
                    pipelines.append(Pipeline('qr', 'col', 'f', 't', 'f', 'f', bias, lse, squant, pagedkv, mask))
@@ -743,4 +755,4 @@ def list_blobs(file_path : Path, kernel_filter : Optional[str], receipt, mask_im
        _, kernels = get_fwd_splitkv_blobs(kernel_filter, receipt, mask_impl)
        for kernel in kernels:
            f.write(str(file_path.parent / GEN_DIR / kernel.filename) + "\n")
        f.write(str(file_path.parent / GEN_DIR / FMHA_FWD_SPLITKV_API_FILENAME) + "\n")
\ No newline at end of file
--- a/example/ck_tile/02_layernorm2d/CMakeLists.txt
+++ b/example/ck_tile/02_layernorm2d/CMakeLists.txt
+set(LAYERNORM2D_FWD_KNOWN_APIS "fwd;bwd")
+set(LAYERNORM2D_FWD_ENABLE_APIS  "fwd" CACHE STRING
+    "semicolon-separated list of APIs to generate (${LAYERNORM2D_FWD_KNOWN_APIS}) & link, or \"all\".")
+if(LAYERNORM2D_FWD_ENABLE_APIS  STREQUAL "all")
+  set(LAYERNORM2D_FWD_ENABLE_APIS  ${LAYERNORM2D_FWD_KNOWN_APIS})
+endif()
+# generate a list of kernels, but do not actually emit files at the configure stage
+execute_process(
+  COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py
+  --api ${LAYERNORM2D_FWD_ENABLE_APIS} --working_path ${CMAKE_CURRENT_BINARY_DIR} --list_blobs
+  RESULT_VARIABLE ret
+)
+if(ret AND NOT ret EQUAL 0)
+  message( FATAL_ERROR "Fail to generate kernels via Python. ${ret}")
+endif()
+file(STRINGS ${CMAKE_CURRENT_BINARY_DIR}/layernorm2d_fwd_blobs.txt LAYERNORM2D_FWD_GEN_BLOBS)
+add_custom_command(
+  OUTPUT ${LAYERNORM2D_FWD_GEN_BLOBS}
+  COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py
+  --api ${LAYERNORM2D_FWD_ENABLE_APIS} --working_path ${CMAKE_CURRENT_BINARY_DIR} --gen_blobs
+)
 set(EXAMPLE_LAYERNORM2D_FWD "tile_example_layernorm2d_fwd")
-# not using add_example_executable() to add this target, since we don't want this to have
-# to be included in "make all/install/check"
 message("adding example ${EXAMPLE_LAYERNORM2D_FWD}")
-file(GLOB INSTANCE_SRCS instances/*.cpp)
 add_executable(${EXAMPLE_LAYERNORM2D_FWD} EXCLUDE_FROM_ALL layernorm2d_fwd.cpp)
 target_include_directories(${EXAMPLE_LAYERNORM2D_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
-target_sources(${EXAMPLE_LAYERNORM2D_FWD} PRIVATE ${INSTANCE_SRCS})
+target_sources(${EXAMPLE_LAYERNORM2D_FWD} PRIVATE ${LAYERNORM2D_FWD_GEN_BLOBS})
 set(EXAMPLE_LAYERNORM2D_FWD_COMPILE_OPTIONS)

--- a/example/ck_tile/02_layernorm2d/README.md
+++ b/example/ck_tile/02_layernorm2d/README.md
 # Layernorm2D forward
-This folder contains example for Layernorm2D forward using ck_tile tile-programming implementation.
+This folder contains an example for Layernorm2D forward using the `ck_tile` tile-programming implementation.
+# Implementation and feature support
+## Welford online algorithm
+We use the Welford algorithm to update `mean`/`variance` block by block. For the `N <= 4096` case we can compute `mean`/`var`/`normalization` within one loop; we call this `one-pass`. For the large N case, it is hard to keep `mean`/`var` inside registers/LDS and then compute the `normalization`, so we need to load the input twice: the first time to compute `mean`/`var` block-by-block, and a second time to compute the `normalization`. We call this `two-pass`.
+## mean/variance save
+In the training case the mean/variance need to be stored out (TBD, not supported yet).
+## prenorm/postnorm
+![](misc/pnorm.png)
+Since [prenorm/postnorm](https://arxiv.org/pdf/1906.01787) is quite common in LLM blocks, this example boosts this feature by kernel fusion. Note that `prenorm`/`postnorm` always needs to elementwise-add a `shortcut` before the actual layernorm computation, and optionally store the result out to global memory. You can use `-fadd=1` to test `pre-add+store`, or `-fadd=2` to test `pre-add` without storing out (not code-generated by default).
+## smooth-quant/dynamic-quant
+We support smooth/dynamic quantization for `int8` output, by setting `-fquant=1` and `-prec_o=int8`. In this case the output will go through a rowwise dynamic quantization like below. Note that smooth-quant requires a `(1*N)` size per-channel scale as input (in fp32 in our example, though this is customizable), which is element-wise multiplied with the tensor for each row before the rowwise dynamic quant is computed. Setting `-fquant=2` skips the input per-channel scale stage and performs only the dynamic quant. This case is supported in our kernel but not generated by default (TBD: add some filter in generate.py to support on-demand codegen).
+![](misc/dquant.png)
+```
+# assume output int8, hidden_states is [m, n] shape and in fp16/bf16
+# [m, 1]
+per_token_amax, _ = torch.max(
+     input=torch.abs(hidden_states), 
+     dim=-1, 
+     keepdim=True
+)
+per_token_scale = per_token_amax.to(dtype=torch.float32) / 127.0
+# quant hidden_states
+hidden_states = (hidden_states / per_token_scale).to(dtype=torch.int8)
+return hidden_states, per_token_scale
+# hidden_states is now int8 and will be fed to the next layer as input
+# per_token_scale will be used as the dequant factor in a later layer
+```
 ## build
 ```
@@ -15,8 +51,35 @@ This will result in an executable `build/bin/tile_example_layernorm2d_fwd`
 ```
 args:
          -m    m dimension (default:3328)
-          -n    m dimension (default:4096)
+          -n    n dimension (default:4096)
+     -stride    stride per row, if -1 then equal to n (default:-1)
          -e    epsilon (default:1e-5)
+    -save_mv    save mean/variance(invstd) or not. set to 1 in training case (default:0)
          -v    cpu validation or not (default:1)
-       -prec    precision (default:fp16)
+      -kname    print kernel name or not (default:1)
+     -prec_i    input precision (default:fp16)
+     -prec_o    output precision, set auto will be the same as input (default:auto)
+    -prec_sx    output quant scale type, set auto will be the same as input. used when fquant=1 (default:auto)
+    -prec_sy    output quant scale type, set auto will be the same as input. used when fquant=1 or 2 (default:auto)
+       -fadd    fused-add, 0:no fused add, 1:preadd+store, 2:preadd only (default:0)
+     -fquant    fused-quant, 0:no, 1:smooth-dynamic-quant, 2:dynamic-quant (default:0)
+     -warmup    cold iter (default:5)
+     -repeat    hot iter (default:20)
+```
+## limitations
+Note that `fquant=2`, `fadd=2`, and `prec_sx/prec_sy` other than `fp32` are not generated by default, though our kernel template supports them (TBD: add some flag in generate.py to generate those instances on demand). Besides, the `N>8192` case will by default use the two-pass pipeline, and `-fquant=1/2` are not supported yet. If you need to support `N>8192` and `fused+residual+store`, you can use this example together with `12_smoothquant` to construct layernorm+residual and smoothquant as 2 kernels for this purpose.
+```
+# some case
+# standard fp16 layernorm 2d, m=10. n=1024
+./build/bin/tile_example_layernorm2d_fwd  -m=10 -n=1024
+# standard fp16 layernorm 2d, m=10. n=1024, fused-smooth-quant, output in int8
+./build/bin/tile_example_layernorm2d_fwd  -m=10 -n=1024 -prec_o=int8 -fquant=1
+# standard fp16 layernorm 2d, m=10. n=1024, fused-smooth-quant+fused-add-store, output in int8
+./build/bin/tile_example_layernorm2d_fwd  -m=10 -n=1024 -prec_o=int8 -fquant=1 -fadd=1
 ```
--- a/example/ck_tile/02_layernorm2d/generate.py
+++ b/example/ck_tile/02_layernorm2d/generate.py
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+# generate kernel instances to speed up compilation
+import argparse
+from enum import IntEnum
+from pathlib import Path
+import sys
+from typing import List, Optional, Any
+import functools
+import itertools
+import copy
+from dataclasses import dataclass
+def get_if_str(idx, total, lase_else = True):
+    if idx == 0:
+        return 'if'
+    elif idx < total - 1:
+        return 'else if'
+    else:
+        if lase_else:
+            return 'else'
+        else:
+            return 'else if'
+FUSED_ADD_ENUM_STR_MAP = [
+    'no',
+    'pras',      # pre-norm
+    'pra' ]      # post-norm
+FUSED_FUSED_SWEEP_STR_MAP = [
+    'no',
+    'dquant' ]
+DATA_TYPE_MAP = {'fp32' : 'float',
+                 'fp16' : 'ck_tile::fp16_t',
+                 'bf16' : 'ck_tile::bf16_t',
+                 'int8' : 'ck_tile::int8_t'}
+def BOOL_MAP(b_) -> str:
+    if b_:
+        return 'true'
+    else:
+        return 'false'
+class layernorm_fwd_codegen:
+    API_TRAITS_DEFINE = """
+// this is used to pattern-match internl kernel implementation, not to instantiate kernel
+template <typename XDataType_,
+          typename YDataType_,
+          typename XScaleDataType_,
+          typename YScaleDataType_,
+          ck_tile::index_t Repeat_M_,         // each thread repeat along M
+          ck_tile::index_t Repeat_N_,         // each thread repeat along N
+          ck_tile::index_t ThreadPerBlock_M_, // num threads along M
+          ck_tile::index_t ThreadPerBlock_N_, // num threads along N
+          ck_tile::index_t Vector_N_,         // vector size along N
+          bool kPadN_,
+          bool kSaveMeanInvStd_,
+          bool kTwoPass_,
+          ck_tile::index_t kFusedAdd_ = 0,
+          ck_tile::index_t kFusedQuant_ = 0>
+struct layernorm2d_fwd_traits_
+{
+    using XDataType = ck_tile::remove_cvref_t<XDataType_>;
+    using YDataType = ck_tile::remove_cvref_t<YDataType_>;
+    using XScaleDataType = ck_tile::remove_cvref_t<XScaleDataType_>;
+    using YScaleDataType = ck_tile::remove_cvref_t<YScaleDataType_>;
+    static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= warpSize;
+    static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % warpSize == 0);
+    static constexpr ck_tile::index_t total_warps =
+        (ThreadPerBlock_M_ * ThreadPerBlock_N_) / warpSize;
+    // num of warps along m
+    static constexpr ck_tile::index_t BlockWarps_M = []() {
+        if constexpr(is_warp_per_row)
+        {
+            static_assert(warpSize % ThreadPerBlock_N_ == 0);
+            return total_warps * (warpSize / ThreadPerBlock_N_);
+        }
+        else
+        {
+            // static_assert(warpSize % ThreadPerBlock_M_ == 0);
+            return total_warps / (ThreadPerBlock_N_ / warpSize);
+        }
+    }();
+    // num of warps along n
+    static constexpr ck_tile::index_t BlockWarps_N = []() {
+        if constexpr(is_warp_per_row)
+        {
+            static_assert(warpSize % ThreadPerBlock_N_ == 0);
+            return 1;
+        }
+        else
+        {
+            static_assert(ThreadPerBlock_N_ % warpSize == 0);
+            return ThreadPerBlock_N_ / warpSize;
+        }
+    }();
+    static constexpr ck_tile::index_t Repeat_M = Repeat_M_;
+    static constexpr ck_tile::index_t Repeat_N = Repeat_N_;
+    static constexpr ck_tile::index_t Block_M = Repeat_M_ * ThreadPerBlock_M_;
+    static constexpr ck_tile::index_t Block_N = Repeat_N_ * ThreadPerBlock_N_ * Vector_N_;
+    static constexpr ck_tile::index_t Warp_M = ThreadPerBlock_M_ / BlockWarps_M;
+    static constexpr ck_tile::index_t Warp_N = ThreadPerBlock_N_ / BlockWarps_N * Vector_N_;
+    using BlockTile  = ck_tile::sequence<Block_M, Block_N>;
+    using BlockWarps = ck_tile::sequence<BlockWarps_M, BlockWarps_N>;
+    using WarpTile   = ck_tile::sequence<Warp_M, Warp_N>;
+    using Vector     = ck_tile::sequence<1, Vector_N_>;
+    using Shape = ck_tile::Generic2dBlockShape<BlockTile, BlockWarps, WarpTile, Vector, ThreadPerBlock_M_ * ThreadPerBlock_N_>;
+    static constexpr bool kPadN           = kPadN_;
+    static constexpr bool kSaveMeanInvStd = kSaveMeanInvStd_;
+    static constexpr bool kTwoPass        = kTwoPass_;
+    static constexpr ck_tile::index_t kFusedAdd = kFusedAdd_;
+    static constexpr ck_tile::index_t kFusedQuant = kFusedQuant_;
+};
+template <typename XDataType_,
+          typename YDataType_,
+          typename XScaleDataType_,
+          typename YScaleDataType_,
+          ck_tile::index_t Repeat_M_,         // each thread repeat along M
+          ck_tile::index_t Repeat_N_,         // each thread repeat along N
+          ck_tile::index_t ThreadPerBlock_M_, // num threads along M
+          ck_tile::index_t ThreadPerBlock_N_, // num threads along N
+          ck_tile::index_t Vector_N_,         // vector size along N
+          bool kPadN_,
+          bool kSaveMeanInvStd_,
+          bool kTwoPass_,
+          int  kFusedAdd_,
+          int  kFusedQuant_>
+using traits_ = layernorm2d_fwd_traits_<XDataType_,
+                                       YDataType_,
+                                       XScaleDataType_,
+                                       YScaleDataType_,
+                                       Repeat_M_,
+                                       Repeat_N_,
+                                       ThreadPerBlock_M_,
+                                       ThreadPerBlock_N_,
+                                       Vector_N_,
+                                       kPadN_,
+                                       kSaveMeanInvStd_,
+                                       kTwoPass_,
+                                       kFusedAdd_,
+                                       kFusedQuant_>;
+"""
+    # Template for the shared header written by gen_blobs() as `<name_common_header>.hpp`
+    # and included by every generated instance file (see INSTANCE_BASE).
+    # It defines the templated launcher layernorm2d_fwd_<Traits_>(), which builds the
+    # pipeline/epilogue/kernel types from a traits_ instantiation and launches the kernel.
+    # The string is rendered with str.format(): {F_traits_define} is the only placeholder,
+    # and every literal C++ brace is therefore doubled ({{ / }}).
+    API_COMMON_HEADER = """
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include <ck_tile/core.hpp>
+#include "layernorm2d_fwd.hpp"
+#include <ck_tile/ops/epilogue.hpp>
+#include <iostream>
+#pragma once
+using S = ck_tile::stream_config;
+using A = layernorm2d_fwd_args;
+{F_traits_define}
+template <typename Traits_>
+float layernorm2d_fwd_(const S& s, A a)
+{{
+    using XDataType = typename Traits_::XDataType;
+    using YDataType = typename Traits_::YDataType;
+    using XScaleDataType = typename Traits_::XScaleDataType;
+    using YScaleDataType = typename Traits_::YScaleDataType;
+    using ComputeDataType = typename LayerNormTypeConfig<XDataType, YDataType, XScaleDataType, YScaleDataType>::ComputeDataType;
+    using PipelineTraits = ck_tile::Layernorm2dFwdTraits<Traits_::kPadN,
+        Traits_::kSaveMeanInvStd,
+        Traits_::kTwoPass,
+        static_cast<ck_tile::Layernorm2dFusedAddEnum>(Traits_::kFusedAdd),
+        static_cast<ck_tile::Layernorm2dFusedQuantEnum>(Traits_::kFusedQuant)>;
+    using PipelineProblem = ck_tile::Layernorm2dFwdPipelineProblem<
+        typename LayerNormTypeConfig<XDataType, YDataType, XScaleDataType, YScaleDataType>::XDataType,
+        typename LayerNormTypeConfig<XDataType, YDataType, XScaleDataType, YScaleDataType>::GammaDataType,
+        typename LayerNormTypeConfig<XDataType, YDataType, XScaleDataType, YScaleDataType>::BetaDataType,
+        typename LayerNormTypeConfig<XDataType, YDataType, XScaleDataType, YScaleDataType>::ComputeDataType,
+        typename LayerNormTypeConfig<XDataType, YDataType, XScaleDataType, YScaleDataType>::YDataType,
+        typename LayerNormTypeConfig<XDataType, YDataType, XScaleDataType, YScaleDataType>::MeanDataType,
+        typename LayerNormTypeConfig<XDataType, YDataType, XScaleDataType, YScaleDataType>::InvStdDataType,
+        typename LayerNormTypeConfig<XDataType, YDataType, XScaleDataType, YScaleDataType>::XScaleDataType,
+        typename LayerNormTypeConfig<XDataType, YDataType, XScaleDataType, YScaleDataType>::YScaleDataType,
+        typename Traits_::Shape,
+        PipelineTraits>;
+    using OnePassPipeline = ck_tile::Layernorm2dFwdPipelineOnePass<PipelineProblem>;
+    using TwoPassPipeline = ck_tile::Layernorm2dFwdPipelineTwoPass<PipelineProblem>;
+    using Pipeline        = std::conditional_t<Traits_::kTwoPass, TwoPassPipeline, OnePassPipeline>;
+    using Default2DEpilogueProblem = ck_tile::Default2DEpilogueProblem<ComputeDataType, YDataType, false, Traits_::kPadN, false>;
+    using Default2DEpilogue = ck_tile::Default2DEpilogue<Default2DEpilogueProblem>;
+    static constexpr bool UseSmoothInputScale = Traits_::kFusedQuant == 1;
+    using DynamicQuantEpilogueProblem = ck_tile::DynamicQuantEpilogueProblem<ComputeDataType, XScaleDataType, YScaleDataType, YDataType, typename Traits_::Shape,
+            ck_tile::DynamicQuantEpilogueTraits<false, Traits_::kPadN, UseSmoothInputScale, false,  true/*max3*/>>;
+    using DynamicQuantEpilogue = ck_tile::DynamicQuantEpilogue<DynamicQuantEpilogueProblem>;
+    using Epilogue = std::conditional_t<Traits_::kFusedQuant == 1, DynamicQuantEpilogue,  Default2DEpilogue>;
+    using Kernel = ck_tile::Layernorm2dFwd<Pipeline, Epilogue>;
+    const dim3 grids                       = Kernel::GridSize(a);
+    constexpr dim3 blocks                  = Kernel::BlockSize();
+    constexpr ck_tile::index_t kBlockPerCu = 1;
+    auto kargs = Kernel::MakeKargs(a);
+    if(s.log_level_ > 0)
+        std::cout << ", " << Kernel::GetName() << std::flush;
+    return ck_tile::launch_kernel(
+        s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{{}}, grids, blocks, 0, kargs));
+}}
+"""
+    # Template for the dispatch translation unit (written by gen_blobs() as `<name_api>.cpp`).
+    # It only declares layernorm2d_fwd_<Traits_>() and defines the public layernorm2d_fwd(),
+    # which returns -1 unless one of the generated branches assigns `r`.
+    # Placeholders filled in by content_api: {F_traits_define} and {F_dispatch}.
+    # Rendered with str.format(), so literal C++ braces are doubled.
+    API_BASE = """
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include <ck_tile/core.hpp>
+#include "layernorm2d_fwd.hpp"
+{F_traits_define}
+// Note: this internal API only declare, not define here, otherwise will block `make -j`
+template <typename Traits_>
+float layernorm2d_fwd_(const ck_tile::stream_config& s, layernorm2d_fwd_args a);
+float layernorm2d_fwd(layernorm2d_fwd_traits t,
+                      layernorm2d_fwd_args a,
+                      const ck_tile::stream_config& s)
+{{
+    float r = -1;
+{F_dispatch}
+    return r;
+}}
+"""
+    # Nested fragments that content_api stitches together to build {F_dispatch} of API_BASE.
+    # Outer level: one branch per (prec_i, prec_o) pair; {F_if} is the keyword produced by
+    # get_if_str() and {F_per_n_case} is the concatenation of the per-N branches below.
+    API_PER_DTYPE="""    {F_if}(t.prec_i == \"{F_i_type}\" && t.prec_o == \"{F_o_type}\"){{
+{F_per_n_case}
+    }}
+"""
+    # Middle level: one branch per N bucket; {F_N_COND} is `(a.n <= N)` or empty for the
+    # last bucket, {F_inner_dispatch} is the concatenation of the per-instance branches.
+    API_PER_N_CASE="""        {F_if} {F_N_COND} {{
+{F_inner_dispatch}
+        }}
+"""
+    # Inner level: one branch per kernel instance; {F_VEC_COND} is the runtime condition
+    # and {F_instance_func} the instantiated launcher to call.
+    API_INNER_CASE="""            {F_if} {F_VEC_COND}
+                r={F_instance_func}(s, a);
+"""
+    # Template for one generated instance source file; {F_instance_def} receives the
+    # explicit-instantiation lines produced by h_instance.content.
+    # NOTE(review): the column-header comment inside the string lists three precision
+    # columns (prec_i, prec_o, prec_sy), while h_traits.trait_name emits four type
+    # arguments (X, Y, XScale, YScale) - the header looks one column short; confirm.
+    INSTANCE_BASE = """
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include "layernorm2d_fwd_api_common.hpp"
+// clang-format off
+//                                      prec_i           prec_o           prec_sy           rm  rn  tm    tn  vn  pd     mv     2p      add  sweep
+{F_instance_def}
+// clang-format on
+"""
+    def __init__(self, working_path, kernel_filter):
+        """Store the output directory and the requested kernel filter.
+
+        working_path  -- directory that list_blobs()/gen_blobs() write into.
+        kernel_filter -- kept as-is on the instance; none of the methods in this
+                         part of the file read it back.
+        """
+        self.working_path, self.kernel_filter = working_path, kernel_filter
+    class k_fuesd_add_enum(IntEnum):
+        """Integer codes for the fused-add mode.
+
+        NOTE(review): presumably mirrors ck_tile::Layernorm2dFusedAddEnum, to which the
+        generated C++ casts Traits_::kFusedAdd - confirm against the C++ enum.
+        The misspelled class name ("fuesd") is kept because it is part of the interface.
+        """
+        F_NO_ADD = 0
+        F_PRE_ADD = 1
+        F_PRE_ADD_STORE_RESIDUAL = 2
+    class k_fused_sweep_enum(IntEnum):
+        """Integer codes for the fused "sweep" (quantization) mode.
+
+        NOTE(review): presumably mirrors ck_tile::Layernorm2dFusedQuantEnum - confirm.
+        """
+        F_NO_SWEEP = 0
+        F_RENORM = 1
+        F_DYNAMIC_QUANT = 2
+    @dataclass
+    class k_traits:
+        """Plain record of the boolean/enum switches of one kernel configuration."""
+        F_kPadN : bool
+        F_kSaveMeanInvStd : bool
+        F_kTwoPass : bool
+        F_kFusedAdd : Any #: layernorm_fwd_codegen.k_fuesd_add_enum
+        F_kFusedQuant : Any  #: layernorm_fwd_codegen.k_fused_sweep_enum
+    @dataclass
+    class k_shape:
+        """Tile-shape record made of four integer lists, plus one derived size."""
+        F_BlockTile    : List[int]
+        F_WarpPerBlock : List[int]
+        F_WarpTile     : List[int]
+        F_Vector_      : List[int]
+        @property
+        def F_BlockSize(self) -> int:
+            """Product of every entry of F_WarpTile.
+
+            An empty F_WarpTile makes functools.reduce raise TypeError, since no
+            initial value is supplied.
+            """
+            def _times(lhs, rhs):
+                return lhs * rhs
+            return functools.reduce(_times, self.F_WarpTile)
+    # The records below are plain field containers describing a pipeline problem, the two
+    # pipeline flavours, the default 2D epilogue and a kernel. None of the methods in this
+    # part of the file instantiates them.
+    @dataclass
+    class k_problem:
+        """Data-type names, block-shape name and traits of one pipeline problem."""
+        F_XDataType       : str
+        F_GammaDataType   : str
+        F_BetaDataType    : str
+        F_ComputeDataType : str
+        F_YDataType       : str
+        F_MeanDataType    : str
+        F_InvStdDataType  : str
+        F_BlockShape      : str
+        F_Traits          : Any #k_traits
+    @dataclass
+    class k_pipeline_one_pass:
+        """Wrapper around the problem used by a one-pass pipeline."""
+        F_Problem         : Any #k_problem
+    @dataclass
+    class k_pipeline_two_pass:
+        """Wrapper around the problem used by a two-pass pipeline."""
+        F_Problem         : Any #k_problem
+    @dataclass
+    class default_2d_epilogue_problem:
+        """Accumulator/output type names and padding flags of the default 2D epilogue."""
+        F_AccDataType : str
+        F_ODataType : str
+        F_kPadM : bool
+        F_kPadN : bool
+    @dataclass
+    class default_2d_epilogue:
+        """Wrapper around an epilogue problem record."""
+        F_problem : Any
+    @dataclass
+    class k_kernel:
+        """Pairing of a pipeline record with an epilogue record."""
+        F_pipeline : Any
+        F_epilogue : Any
+    @dataclass
+    class h_traits:
+        """Template arguments of one `traits_<...>` instantiation in the generated C++."""
+        F_XDataType : str
+        F_YDataType : str
+        F_XScaleDataType : str
+        F_YScaleDataType : str
+        F_Repeat_M : int
+        F_Repeat_N : int
+        F_ThreadPerBlock_M : int
+        F_ThreadPerBlock_N : int
+        F_Vector_N : int
+        F_kPadN : bool
+        F_kSaveMeanInvStd_ : bool
+        F_kTwoPass_ : bool
+        F_kFusedAdd : int
+        F_kFusedQuant : int
+        @property
+        def trait_name(self) ->str:
+            """Comma-separated, column-padded argument list for `traits_<...>`."""
+            pieces = [
+                # the four data types, translated through DATA_TYPE_MAP
+                f'{DATA_TYPE_MAP[self.F_XDataType]}',
+                f'{DATA_TYPE_MAP[self.F_YDataType]}',
+                f'{DATA_TYPE_MAP[self.F_XScaleDataType]}',
+                f'{DATA_TYPE_MAP[self.F_YScaleDataType]}',
+                # repeat / thread / vector sizes, padded to fixed widths
+                f'{self.F_Repeat_M:2}',
+                f'{self.F_Repeat_N:2}',
+                f'{self.F_ThreadPerBlock_M:2}',
+                f'{self.F_ThreadPerBlock_N:4}',
+                f'{self.F_Vector_N:2}',
+                # boolean switches, rendered through BOOL_MAP
+                f'{BOOL_MAP(self.F_kPadN):5}',
+                f'{BOOL_MAP(self.F_kSaveMeanInvStd_):5}',
+                f'{BOOL_MAP(self.F_kTwoPass_):5}',
+                # fused-add / fused-quant integer codes
+                f'{self.F_kFusedAdd:4}',
+                f'{self.F_kFusedQuant:4}',
+            ]
+            return ', '.join(pieces)
+        # expression used at the call site of this kernel
+        @property
+        def call_name(self) -> str:
+            """Name of the instantiated launcher, e.g. for use in the dispatch code."""
+            return 'layernorm2d_fwd_<traits_<' + self.trait_name + '>>'
+        # explicit-instantiation statement that defines this kernel
+        @property
+        def def_name(self) -> str:
+            """C++ explicit template instantiation line for this launcher."""
+            return 'template float ' + self.call_name + '(const S&, A);'
+    # Groups the kernel instances that end up in the same generated source file
+    @dataclass
+    class h_instance:
+        """One generated instance file: its naming keys plus the kernels it contains."""
+        F_DataTypePair : str
+        F_N : str
+        F_add : int
+        F_sweep : int
+        instance_list : List[Any] # List[h_traits]
+        @property
+        def name(self) -> str:
+            """File stem: dtype tag, N bucket and optional fused-add / sweep suffixes."""
+            prec_i, prec_o = self.F_DataTypePair.split(',')
+            # a single tag when input and output precision agree, "in_out" otherwise
+            dtype_tag = prec_i if prec_i == prec_o else prec_i + '_' + prec_o
+            parts = [f'layernorm2d_fwd_{dtype_tag}_n{self.F_N}']
+            if self.F_add != 0:
+                parts.append(FUSED_ADD_ENUM_STR_MAP[self.F_add])
+            if self.F_sweep != 0:
+                parts.append(FUSED_FUSED_SWEEP_STR_MAP[self.F_sweep])
+            return '_'.join(parts)
+        @property
+        def instance_name(self) ->str:
+            """Alias of `name`."""
+            return self.name
+        @property
+        def content(self) ->str:
+            """INSTANCE_BASE filled with one instantiation line per contained kernel."""
+            instance_defs = ''.join(ins.def_name + '\n' for ins in self.instance_list)
+            return layernorm_fwd_codegen.INSTANCE_BASE.format(F_instance_def=instance_defs)
+    @property
+    def name_api(self) -> str:
+        """File stem (no extension) of the generated dispatch source."""
+        return 'layernorm2d_fwd_api'
+    @property
+    def name_common_header(self) -> str:
+        """File stem (no extension) of the generated shared header."""
+        return 'layernorm2d_fwd_api_common'
+    @property
+    def content_api(self) -> str:
+        """Render the dispatch source (API_BASE) for every blob from get_blobs().
+
+        Blobs are grouped by data-type pair and then by N bucket, both in first-seen
+        order. Each kernel instance becomes one branch whose condition checks the
+        vector divisibility of a.n, the fused-add mode and the fused-quant mode
+        (plus the scale precisions when fused-quant is 1 or 2).
+
+        Raises ValueError when an instance carries a fused-quant code other than
+        0, 1 or 2, instead of silently reusing the previous instance's condition.
+        """
+        # 1) group: dtype pair -> N bucket -> [blobs]
+        t_dtype_dict = dict()
+        for blob in self.get_blobs():
+            t_dtype_dict.setdefault(blob.F_DataTypePair, {}).setdefault(blob.F_N, []).append(blob)
+        d_str = ''
+        for i_d, dtype_ in enumerate(t_dtype_dict):
+            blob_per_t = t_dtype_dict[dtype_]
+            n_str = ''
+            for i_n, n_ in enumerate(blob_per_t):
+                blob_per_n = blob_per_t[n_]
+                # Count the instances explicitly and keep a running index, so the
+                # if / else-if chain stays correct even when blobs of one bucket
+                # carry instance lists of different lengths.
+                len_in_n = sum(len(b_.instance_list) for b_ in blob_per_n)
+                idx_in_n = 0
+                inner_str = ""
+                for b_ in blob_per_n:
+                    for ins in b_.instance_list:
+                        if ins.F_kFusedQuant == 0:
+                            _sweep_cond = 't.fused_quant == {f_fused_sweep}'.format(f_fused_sweep = ins.F_kFusedQuant)
+                        elif ins.F_kFusedQuant == 1:
+                            _sweep_cond = 't.fused_quant == {f_fused_sweep} && (t.prec_sx == \"{f_sx_type}\" && t.prec_sy == \"{f_sy_type}\")'.format(
+                                f_fused_sweep = ins.F_kFusedQuant, f_sx_type=ins.F_XScaleDataType, f_sy_type=ins.F_YScaleDataType)
+                        elif ins.F_kFusedQuant == 2:
+                            _sweep_cond = 't.fused_quant == {f_fused_sweep} && (t.prec_sy == \"{f_sy_type}\")'.format(
+                                f_fused_sweep = ins.F_kFusedQuant, f_sy_type=ins.F_YScaleDataType)
+                        else:
+                            raise ValueError(f'unsupported fused quant mode: {ins.F_kFusedQuant}')
+                        _cond = '((a.n % {f_vec_n} == 0) && (t.fused_add == {f_fused_add}) && ({f_sweep_cond}))'.format(
+                                        f_vec_n = ins.F_Vector_N, f_fused_add = ins.F_kFusedAdd,
+                                        f_sweep_cond = _sweep_cond)
+                        inner_str += self.API_INNER_CASE.format(F_if = get_if_str(idx_in_n, len_in_n, False),
+                                            F_VEC_COND = _cond, F_instance_func=ins.call_name)
+                        idx_in_n += 1
+                # every bucket but the last is guarded by an upper bound on a.n
+                n_cnd = f'(a.n <= {n_})' if (i_n < len(blob_per_t) - 1) else ''
+                n_str += self.API_PER_N_CASE.format(F_if = get_if_str(i_n, len(blob_per_t)), F_N_COND=n_cnd, F_inner_dispatch=inner_str)
+            prec_i, prec_o = dtype_.split(',')
+            d_str += self.API_PER_DTYPE.format(F_if = get_if_str(i_d, len(t_dtype_dict), False), F_i_type=prec_i, F_o_type=prec_o, F_per_n_case=n_str)
+        return self.API_BASE.format(F_traits_define=self.API_TRAITS_DEFINE, F_dispatch=d_str)
+    @property
+    def content_common_header(self) -> str:
+        """Return API_COMMON_HEADER with {F_traits_define} replaced by API_TRAITS_DEFINE."""
+        return self.API_COMMON_HEADER.format(F_traits_define=self.API_TRAITS_DEFINE)
+    def get_blobs(self):
+        """Build the list of h_instance blobs (one per generated instance file).
+
+        For every N bucket of the predefined table, and for every combination of
+        data-type pair, scale-type pair, fused-add mode and fused-quant mode, the
+        bucket's base h_traits are copied and specialised. Two combinations are
+        skipped: an int8 output without fused_quant == 1, and the 'big' bucket
+        with fused_quant == 1.
+
+        Returns a list of h_instance in table order.
+        """
+        h_traits = layernorm_fwd_codegen.h_traits
+        h_instance = layernorm_fwd_codegen.h_instance
+        dynamic_quant_out_dtype = ['int8']
+        # some predefined support range
+        # (prec_i,prec_o) for simplicity this string will be used as key for dict
+        scale_list = [('fp32,fp32')]
+        dtype_list = [('fp16,fp16'), ('bf16,bf16'),
+                        ('fp16,int8'), ('bf16,int8')] # NOTE: only fused-dynamic-quant use int8 out
+        #fused_add_list = [0, 1, 2]
+        #fused_sweep_list = [0, 1, 2] # NOTE: only single pass can use fused dynamic quant
+        fused_add_list = [0, 1]
+        fused_sweep_list = [0, 1] # NOTE: only single pass can use fused dynamic quant
+        #                                                       rm  rn  tm   tn  vn  pd     mv     2p     add    sweep
+        h_trait_dict = {'64'  : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  8,  8,  8,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  16, 4,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 1,  True,  False, False,   0,    0)],
+                        '128' : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  16, 8,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 2,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  4,  64, 1,  True,  False, False,   0,    0)],
+                        '256' : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 4,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  4,  64, 2,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  4,  64, 1,  True,  False, False,   0,    0)],
+                        '512' : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 8,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  4,  64, 4,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  4,  64, 2,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  8,  4,  64, 1,  True,  False, False,   0,    0)],
+                        '768' : [ h_traits('x', 'y', 'xs', 'ys', 1,  3,  4,  64, 4,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  4,  64, 2,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 12,  4,  64, 1,  True,  False, False,   0,    0)],
+                        '1024' :[ h_traits('x', 'y', 'xs', 'ys', 1,  1,  2, 128, 8,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  2, 128, 4,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  2, 128, 2,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 1,  True,  False, False,   0,    0)],
+                        '1536' :[ h_traits('x', 'y', 'xs', 'ys', 1,  3,  4,  64, 8,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  2, 128, 4,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 256, 2,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  1, 256, 1,  True,  False, False,   0,    0)],
+                        '2048' :[ h_traits('x', 'y', 'xs', 'ys', 1,  1,  1, 256, 8,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  1, 256, 4,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 2,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  8,  1, 256, 1,  True,  False, False,   0,    0)],
+                        '3072' :[ h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 128, 8,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 256, 4,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  1, 256, 2,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1,1024, 1,  True,  False, False,   0,    0)],
+                        '4096' :[ h_traits('x', 'y', 'xs', 'ys', 1,  2,  1, 256, 8,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 4,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  1,1024, 2,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1,1024, 1,  True,  False, False,   0,    0)],
+                        '6144' :[ h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 256, 8,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 512, 4,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1,1024, 2,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  1,1024, 1,  True,  False, False,   0,    0)],
+                        '8192' :[ h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 8,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 512, 4,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1,1024, 2,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  8,  1,1024, 1,  True,  False, False,   0,    0)],
+                        'big'  :[ h_traits('x', 'y', 'xs', 'ys', 1,  2,  1, 256, 8,  True,  False,  True,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 4,  True,  False,  True,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  1,1024, 2,  True,  False,  True,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1,1024, 1,  True,  False,  True,   0,    0)]}
+        total_blob = []
+        for hs_key in h_trait_dict:
+            hs = h_trait_dict[hs_key]
+            # N covered by the first (widest-vector) entry of the bucket: rn * tn * vn
+            current_n = hs[0].F_Repeat_N * hs[0].F_ThreadPerBlock_N * hs[0].F_Vector_N
+            for dtype, scale_type, fused_add, fused_quant in itertools.product(dtype_list, scale_list, fused_add_list, fused_sweep_list):
+                prec_i, prec_o = dtype.split(',')
+                scale_x, scale_y = scale_type.split(',')
+                if prec_o in dynamic_quant_out_dtype and fused_quant != 1:
+                    continue # skip non dynamic quant case
+                if fused_quant == 1 and hs_key == 'big':
+                    continue
+                current_hs = []
+                for chs_ in hs:
+                    h_ = copy.copy(chs_) # copy the base instance out
+                    h_.F_XDataType = prec_i
+                    h_.F_YDataType = prec_o
+                    # x-scale comes from the first element of the scale pair and y-scale
+                    # from the second (the two were previously assigned crosswise)
+                    h_.F_XScaleDataType = scale_x
+                    h_.F_YScaleDataType = scale_y
+                    h_.F_kFusedAdd = fused_add
+                    h_.F_kFusedQuant = fused_quant
+                    current_hs.append(h_)
+                # the two-pass fallback bucket keeps its symbolic name instead of a number
+                current_n_str = 'big' if hs_key == 'big' else current_n
+                total_blob.append(h_instance(dtype, current_n_str, fused_add, fused_quant, current_hs))
+        return total_blob
+    def list_blobs(self) -> None:
+        """Append the path of every file gen_blobs() writes to the blob list file.
+
+        The list file is `layernorm2d_fwd_blobs.txt` inside working_path; it is
+        opened in append mode, so existing content is kept. One path per line:
+        the API source, the common header, then one `.cpp` per blob.
+        """
+        out_dir = Path(self.working_path)
+        manifest = out_dir / 'layernorm2d_fwd_blobs.txt'
+        # api related files first ...
+        entries = [
+            out_dir / (self.name_api + ".cpp"),
+            out_dir / (self.name_common_header + ".hpp"),
+        ]
+        # ... followed by the kernel instance files
+        entries.extend(out_dir / (blob.name + ".cpp") for blob in self.get_blobs())
+        with manifest.open('a') as manifest_f:
+            manifest_f.writelines(str(entry) + "\n" for entry in entries)
+    def gen_blobs(self) -> None:
+        """Write the API source, the common header and one `.cpp` per blob into working_path."""
+        out_dir = Path(self.working_path)
+        api_file = out_dir / (self.name_api + ".cpp")
+        api_file.write_text(self.content_api)
+        header_file = out_dir / (self.name_common_header + ".hpp")
+        header_file.write_text(self.content_common_header)
+        # one translation unit per kernel-instance blob
+        for blob in self.get_blobs():
+            instance_file = out_dir / (blob.name + ".cpp")
+            instance_file.write_text(blob.content)
+def list_blobs(args):
+    """Run the list-blobs step for every API named in the comma-separated args.api.
+
+    Only the name 'fwd' is recognised; any other entry is ignored.
+    """
+    for api_name in args.api.split(','):
+        if api_name != 'fwd':
+            continue
+        codegen = layernorm_fwd_codegen(args.working_path, args.filter)
+        codegen.list_blobs()
+def gen_blobs(args):
+    """Run the generate-blobs step for every API named in the comma-separated args.api.
+
+    Only the name 'fwd' is recognised; any other entry is ignored.
+    """
+    for api_name in args.api.split(','):
+        if api_name != 'fwd':
+            continue
+        codegen = layernorm_fwd_codegen(args.working_path, args.filter)
+        codegen.gen_blobs()
+if __name__ == "__main__":
+    # Command-line entry point: parse the options, make sure the working directory
+    # exists, then run exactly one of the two modes (list_blobs / gen_blobs).
+    parser = argparse.ArgumentParser(
+        prog="generate",
+        description="gen API for CK layernorm kernel",
+    )
+    # The default must be a name the dispatchers above recognise: they compare each
+    # comma-separated entry against 'fwd', so the former default 'fwd[all]' matched
+    # nothing and the script silently produced no output.
+    parser.add_argument(
+        "-a",
+        "--api",
+        default='fwd',
+        required=False,
+        help="supply API(s) to generate (default: fwd). separated by comma."
+    )
+    # the directory for list_blobs/gen_blobs to write files into
+    parser.add_argument(
+        "-w",
+        "--working_path",
+        default="./",
+        required=False,
+        help="the path where all the blobs are going to be generated"
+    )
+    # this script has 2 modes
+    # 1) list_blobs mode, will generate a txt file with all the files going to be generated.
+    #    this is useful in build system like cmake to construct source code dependency, by
+    #    reading the content out of this file
+    # 2) gen_blobs mode, will generate the actual kernel instance and api. If in framework
+    #    like FA, only need to use this mode
+    parser.add_argument(
+        "-l",
+        "--list_blobs",
+        action='store_true',
+        help="list all the kernels to a file, "
+    )
+    parser.add_argument(
+        "-g",
+        "--gen_blobs",
+        action='store_true',
+        help="generate all kernels into different tile"
+    )
+    # TODO: if using filter, must apply same value to output_dir and list_blobs
+    parser.add_argument(
+        "-f",
+        "--filter",
+        required=False,
+        help="filter out kernels that need to generate, using fnmatch module"
+    )
+    parser.add_argument(
+        "-t",
+        "--traits",
+        default="all",
+        required=False,
+        help="enable/disable some feature. default generate all"
+    )
+    parser.add_argument(
+        "-r",
+        "--receipt",
+        default=0,
+        required=False,
+        help="codegen receipt."
+    )
+    args = parser.parse_args()
+    # exactly one of the two modes has to be selected
+    if args.gen_blobs == args.list_blobs:
+        print('gen_blobs/list_blobs must specify only one option')
+        # non-zero status so a calling build system notices the usage error
+        sys.exit(1)
+    # create the working directory (including missing parents) when it is absent
+    p = Path(args.working_path)
+    p.mkdir(parents=True, exist_ok=True)
+    if args.list_blobs:
+        list_blobs(args)
+    else:
+        gen_blobs(args)
--- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n1024_instance.cpp
+++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n1024_instance.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-#include "layernorm2d_fwd_instance_common.hpp"
-// clang-format off
-//                                                       rm  rn  tm  tn  vn  pd     mv     2p
-#if 0
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  2,  4,  64, 8,  true , false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  4,  4,  64, 4,  true , false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  8,  4,  64, 2,  true , false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1, 16,  4,  64, 1,  true , false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  1,  1, 256, 4,  true , false, false>>(const S&, A);
-#endif
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 1, 2,  128, 8,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 2, 2,  128, 4,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 4, 2,  128, 2,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 4, 1,  256, 1,  true,  false, false>>(const S&, A);
-// clang-format on
--- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n1536_instance.cpp
+++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n1536_instance.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-#include "layernorm2d_fwd_instance_common.hpp"
-// clang-format off
-//                                                       rm  rn  tm  tn  vn  pd     mv     2p
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 3, 4,   64, 8,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 3, 2,  128, 4,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 3, 1,  256, 2,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 6, 1,  256, 1,  true,  false, false>>(const S&, A);
-// clang-format on
--- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n2048_instance.cpp
+++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n2048_instance.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-#include "layernorm2d_fwd_instance_common.hpp"
-// clang-format off
-//                                                       rm  rn  tm  tn  vn  pd     mv     2p
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 1, 1,  256, 8,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 2, 1,  256, 4,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 4, 1,  256, 2,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 8, 1,  256, 1,  true,  false, false>>(const S&, A);
-// clang-format on
--- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n3072_instance.cpp
+++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n3072_instance.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-#include "layernorm2d_fwd_instance_common.hpp"
-// clang-format off
-//                                                       rm  rn  tm  tn  vn  pd     mv     2p
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 3, 1,  128, 8,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 3, 1,  256, 4,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 6, 1,  256, 2,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 3, 1, 1024, 1,  true,  false, false>>(const S&, A);
-// clang-format on
--- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n4096_instance.cpp
+++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n4096_instance.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-#include "layernorm2d_fwd_instance_common.hpp"
-// clang-format off
-//                                                       rm  rn  tm  tn  vn  pd     mv     2p
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 2, 1,  256, 8,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 4, 1,  256, 4,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 2, 1, 1024, 2,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 4, 1, 1024, 1,  true,  false, false>>(const S&, A);
-// clang-format on
--- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n4096_tp_instance.cpp
+++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n4096_tp_instance.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-#include "layernorm2d_fwd_instance_common.hpp"
-// clang-format off
-//                                                       rm  rn  tm  tn  vn  pd     mv     2p
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 2, 1,  256, 8,  true,  false, true>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 4, 1,  256, 4,  true,  false, true>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 2, 1, 1024, 2,  true,  false, true>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 4, 1, 1024, 1,  true,  false, true>>(const S&, A);
-// clang-format on
--- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n512_instance.cpp
+++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n512_instance.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-#include "layernorm2d_fwd_instance_common.hpp"
-// clang-format off
-//                                                       rm  rn  tm  tn  vn  pd     mv     2p
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  1,  4,  64, 8,  true , false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  2,  4,  64, 4,  true , false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  4,  4,  64, 2,  true , false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  8,  4,  64, 1,  true , false, false>>(const S&, A);
-// clang-format on
--- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n1024_instance.cpp
+++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n1024_instance.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-#include "layernorm2d_fwd_instance_common.hpp"
-// clang-format off
-//                                                       rm  rn  tm  tn  vn  pd     mv     2p
-#if 0
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  2,  4,  64, 8,  true , false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  4,  4,  64, 4,  true , false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  8,  4,  64, 2,  true , false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t, 1, 16,  4,  64, 1,  true , false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  1,  1, 256, 4,  true , false, false>>(const S&, A);
-#endif
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 1, 2,  128, 8,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 2, 2,  128, 4,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 4, 2,  128, 2,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 4, 1,  256, 1,  true,  false, false>>(const S&, A);
-// clang-format on
--- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n1536_instance.cpp
+++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n1536_instance.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-#include "layernorm2d_fwd_instance_common.hpp"
-// clang-format off
-//                                                       rm  rn  tm  tn  vn  pd     mv     2p
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 3, 4,   64, 8,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 3, 2,  128, 4,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 3, 1,  256, 2,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 6, 1,  256, 1,  true,  false, false>>(const S&, A);
-// clang-format on
--- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n2048_instance.cpp
+++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n2048_instance.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-#include "layernorm2d_fwd_instance_common.hpp"
-// clang-format off
-//                                                       rm  rn  tm  tn  vn  pd     mv     2p
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 1, 1,  256, 8,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 2, 1,  256, 4,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 4, 1,  256, 2,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 8, 1,  256, 1,  true,  false, false>>(const S&, A);
-// clang-format on
--- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n3072_instance.cpp
+++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n3072_instance.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-#include "layernorm2d_fwd_instance_common.hpp"
-// clang-format off
-//                                                       rm  rn  tm  tn  vn  pd     mv     2p
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 3, 1,  128, 8,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 3, 1,  256, 4,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 6, 1,  256, 2,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 3, 1, 1024, 1,  true,  false, false>>(const S&, A);
-// clang-format on
--- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n4096_instance.cpp
+++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n4096_instance.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-#include "layernorm2d_fwd_instance_common.hpp"
-// clang-format off
-//                                                       rm  rn  tm  tn  vn  pd     mv     2p
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 2, 1,  256, 8,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 4, 1,  256, 4,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 2, 1, 1024, 2,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 4, 1, 1024, 1,  true,  false, false>>(const S&, A);
-// clang-format on
--- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n4096_tp_instance.cpp
+++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n4096_tp_instance.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-#include "layernorm2d_fwd_instance_common.hpp"
-// clang-format off
-//                                                       rm  rn  tm  tn  vn  pd     mv     2p
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 2, 1,  256, 8,  true,  false, true>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 4, 1,  256, 4,  true,  false, true>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 2, 1, 1024, 2,  true,  false, true>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 4, 1, 1024, 1,  true,  false, true>>(const S&, A);
-// clang-format on