merge develop

7572a691 · coderfeli · 7796fc73 · 6b6fcd37 · 7572a691 · 7572a691
Commit 7572a691 authored Feb 15, 2025 by coderfeli
20 changed files
--- a/example/ck_tile/01_fmha/README.md
+++ b/example/ck_tile/01_fmha/README.md
@@ -15,8 +15,7 @@ This will result in an executable `build/bin/tile_example_fmha_fwd`
 ## kernel
 The kernel template is `fmha_fwd_kernel.hpp`; this is the grid-wise op in old ck_tile's terminology. We put it here on purpose, to demonstrate that one can construct a kernel from the various internal components of ck_tile. We may still provide an implementation of the kernel template under ck_tile's include path in the future.
-There are 3 template parameters for this kernel template.
+There are 2 template parameters for this kernel template.
-* `TilePartitioner` is used to map the workgroup to corresponding tile, `fmha_fwd_tile_partitioner.hpp` in this folder served as this purpose.
 * `FmhaPipeline` is one of the block_tile_pipelines (under `include/ck_tile/tile_program/block_tile_pipeline`) and is a performance-critical component. We did a lot of optimization and experimentation on the pipeline, and may still work out more performant pipelines and add them to that folder. People only need to replace this pipeline type to enjoy the benefit of different performant implementations (stay tuned for updated pipeline(s)).
 * `EpiloguePipeline` modifies and stores out the result in the last phase. People usually do a lot of post-fusion at this stage, so we also abstract this concept. Currently we don't do much at the epilogue stage, but we leave room for possible future support.

--- a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py
@@ -506,6 +506,14 @@ def get_bwd_dq_dk_dv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
                    cond &= deterministic == "f"
                    if not cond:
                        continue
+            if receipt == 4:
+                    cond = dtype in ['fp16', 'bf16']
+                    cond &= bias in ['no', 'bias']
+                    cond &= dropout in ['no', 'dropout_wg32',  'dropout_wg16']
+                    cond &= dpad == dvpad
+                    cond &= deterministic == "f"
+                    if not cond:
+                        continue
            api_pool.register_dq_dk_dv_traits(k.api_trait())
            gen.append(k)

--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
@@ -29,11 +29,6 @@ K0_MAX_SUBMAX_MAP = {
    256: 256
 }
-TILE_PARTITIONER_MAP = {
-    "shb" : "ck_tile::FmhaFwdTilePartitioner_SHB",
-    "hbs" : "ck_tile::FmhaFwdTilePartitioner_HBS",
-}
 FMHA_FWD_KERNEL_HEADER = """// SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.\n
 // auto generated by generate.py
@@ -90,9 +85,7 @@ using fmha_epilogue_{F_idx} =
                                           {F_spad}, {F_dvpad}>>;
 using fmha_kernel_{F_idx} =
-    ck_tile::FmhaFwdKernel<{F_tile_partitioner}<fmha_shape_{F_idx}>,
+    ck_tile::FmhaFwdKernel<fmha_pipeline_{F_idx}, fmha_epilogue_{F_idx}>;
-                  fmha_pipeline_{F_idx},
-                  fmha_epilogue_{F_idx}>;
 using trait_{F_idx} = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode},{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout},
                        {F_pipeline_enum}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_dropout}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>;
@@ -329,12 +322,6 @@ class FmhaFwdKernel:
    F_pipeline      : FmhaFwdPipeline
    mask_impl       : str
-    def get_tp(self) -> str:
-        if self.F_mode == 'group':
-            return 'hbs'
-        else:
-            return 'shb'
    @property
    def template(self) -> str:
        kernel_body = str()
@@ -374,13 +361,12 @@ class FmhaFwdKernel:
                F_pipeline_enum = PIPELINE_ENUM_MAP[self.F_pipeline.tag],
                F_mask          = get_mask_map(self.mask_impl)[self.F_pipeline.F_mask],
                F_mode          = MODE_MAP[self.F_mode],
-                F_pipeline      = PIPELINE_MAP[self.F_pipeline.tag],
+                F_pipeline      = PIPELINE_MAP[self.F_pipeline.tag])
-                F_tile_partitioner = TILE_PARTITIONER_MAP[self.get_tp()])
    @property
    def name(self) -> str:
        # TODO: we don't encode idx here
-        return f"fmha_fwd_d{self.F_hdim}_{self.F_dtype}_{self.F_mode}_{self.get_tp()}_" + \
+        return f"fmha_fwd_d{self.F_hdim}_{self.F_dtype}_{self.F_mode}_" + \
                self.F_tile.name + '_' + self.F_pipeline.name
    @property
@@ -501,13 +487,20 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> Tuple[Fm
                if kernel_filter != None:
                    if not fnmatch.fnmatch(k.name, kernel_filter):
                        continue
-                if receipt == 2:
+                if receipt in (2, 3):
                    cond = dtype in ['fp16', 'bf16']
                    cond &= pipeline.F_vlayout == 'row'
                    cond &= pipeline.F_bias in ['no', 'alibi']
                    cond &= pipeline.F_squant == 'f'
                    if not cond:
                        continue
+                if receipt == 4:
+                    cond = dtype in ['fp16', 'bf16']
+                    cond &= pipeline.F_vlayout == 'row'
+                    cond &= pipeline.F_bias in ['no', 'bias']
+                    cond &= pipeline.F_squant == 'f'
+                    if not cond:
+                        continue
                api_pool.register_traits(k.api_trait())
                gen.append(k)

--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py
@@ -46,9 +46,7 @@ using fmha_pipeline_problem_{F_idx} = ck_tile::BlockFmhaFwdAppendKVPipelineProbl
 using fmha_pipeline_{F_idx} = ck_tile::BlockFmhaFwdAppendKVPipeline<
    fmha_pipeline_problem_{F_idx}>;
-using fmha_kernel_{F_idx} =
+using fmha_kernel_{F_idx} = ck_tile::FmhaFwdAppendKVKernel<fmha_pipeline_{F_idx}>;
-    ck_tile::FmhaFwdAppendKVKernel<ck_tile::FmhaFwdAppendKVTilePartitioner<{F_bs}, {F_bsk}, {F_bd}, {F_bdv}>,
-                  fmha_pipeline_{F_idx}>;
 using trait_{F_idx} = fmha_fwd_appendkv_traits_<{F_hdim}, {F_dtype}, {F_bs}, {F_bsk}, {F_bd}, {F_bdv}, {F_vlayout},
                        {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_rope}, {F_pagedkv}>;

--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
@@ -48,8 +48,8 @@ using fmha_dtype_{F_idx} = {F_dtype};
 using fmha_mask_{F_idx} = {F_mask};
 namespace {{
-template <bool kHasUnevenSplits>
+template <bool kHasUnevenSplits, bool kMergeNumHeadGroupsSeqLenQ = false>
-struct kernel_runner {{
+struct instance {{
 using fmha_block_tile = ck_tile::sequence<{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}>;
 using fmha_shape = ck_tile::TileFmhaShape<fmha_block_tile,
@@ -64,11 +64,12 @@ using fmha_trait = ck_tile::TileFmhaFwdSplitKVTraits<{F_spad},
                                                     {F_dpad},
                                                     {F_dvpad},
                                                     {F_bias},
-                                                     false,
+                                                     /*kHasBiasGrad=*/false,
                                                     {F_lse},
                                                     {F_squant},
                                                     {F_pagedkv},
                                                     kHasUnevenSplits,
+                                                     kMergeNumHeadGroupsSeqLenQ,
                                                     {F_occupancy}>;
 using fmha_pipeline_problem = ck_tile::BlockFmhaFwdSplitKVPipelineProblem<
@@ -96,9 +97,7 @@ using fmha_epilogue =
                                           {F_spad}, {F_dvpad}>>;
 using fmha_kernel =
-    ck_tile::FmhaFwdSplitKVKernel<ck_tile::FmhaFwdSplitKVTilePartitioner<fmha_shape>,
+    ck_tile::FmhaFwdSplitKVKernel<fmha_pipeline, fmha_epilogue>;
-                  fmha_pipeline,
-                  fmha_epilogue>;
 static void run(const ck_tile::stream_config& s, fmha_fwd_splitkv_args a)
 {{
@@ -117,28 +116,50 @@ using trait_{F_idx} = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F
 #include <iostream>
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wtautological-compare"
+namespace {{
+template <bool kHasUnevenSplits>
+void run_instance(const ck_tile::stream_config& s, fmha_fwd_splitkv_args a) {{
+    if constexpr ({F_hdim} == 128 && {F_bias} == ck_tile::BlockAttentionBiasEnum::NO_BIAS
+                  && (std::is_same_v<{F_mask}, ck_tile::SimplifiedGenericAttentionMask<false>>
+                      || std::is_same_v<{F_mask}, FmhaMasks::NoMask>)) {{
+        if (a.max_seqlen_q == 1 && a.nhead_k < a.nhead_q) {{
+            instance<kHasUnevenSplits, /*kMergeNumHeadGroupsSeqLenQ=*/true>::run(s, a);
+        }} else {{
+            instance<kHasUnevenSplits>::run(s, a);
+        }}
+    }} else {{
+        instance<kHasUnevenSplits>::run(s, a);
+    }}
+}}
+}} // anonymous namespace
+#pragma clang diagnostic pop
 template<>
 void fmha_fwd_splitkv_oneshot_<trait_{F_idx}>(const ck_tile::stream_config& s, fmha_fwd_splitkv_args a)
 {{
    if constexpr({F_mode} == false) {{ // batch mode
        // we don't check every seqlen_k values for kvcache
        if (a.seqlen_k_ptr != nullptr) {{
-            kernel_runner<true>::run(s, a);
+            run_instance</*kHasUnevenSplits=*/true>(s, a);
        // make sure F_bn0 is divisible by F_bk1
        }} else if (a.seqlen_k % (a.num_splits * {F_bn0}) == 0) {{
-            kernel_runner<false>::run(s, a);
+            run_instance</*kHasUnevenSplits=*/false>(s, a);
        }} else {{
-            kernel_runner<true>::run(s, a);
+            run_instance</*kHasUnevenSplits=*/true>(s, a);
        }}
    }} else {{
-        kernel_runner<true>::run(s, a);
+        run_instance</*kHasUnevenSplits=*/true>(s, a);
    }}
 }}
 template<>
 std::string fmha_fwd_splitkv_get_name_<trait_{F_idx}>()
 {{
-    using k_ = kernel_runner<true>::fmha_kernel; /// FIXME: choose real kernel type
+    using k_ = instance<true>::fmha_kernel; /// FIXME: choose real kernel type
    return k_::GetName();
 }}
 """
@@ -148,7 +169,7 @@ using fmha_dtype_{F_idx} = {F_dtype};
 namespace {{
 template <ck_tile::index_t kLogMaxSplits>
-struct kernel_runner {{
+struct instance {{
 using fmha_trait = ck_tile::TileFmhaFwdSplitKVCombineTraits<{F_spad},
                                                    {F_dvpad},
                                                    {F_lse},
@@ -176,11 +197,7 @@ using fmha_epilogue =
                                           false, false>>;
 using fmha_kernel =
-    ck_tile::FmhaFwdSplitKVCombineKernel<
+    ck_tile::FmhaFwdSplitKVCombineKernel<fmha_pipeline, fmha_epilogue>;
-        ck_tile::FmhaFwdSplitKVCombineTilePartitioner<
-            fmha_pipeline_problem::kM0, fmha_pipeline_problem::kN1>,
-        fmha_pipeline,
-        fmha_epilogue>;
 static void run(const ck_tile::stream_config& s, fmha_fwd_splitkv_args a)
 {{
@@ -202,22 +219,22 @@ template<>
 void fmha_fwd_splitkv_combine_oneshot_<trait_{F_idx}>(const ck_tile::stream_config& s, fmha_fwd_splitkv_args a)
 {{
    if (a.num_splits <= 8) {{
-        kernel_runner<3>::run(s, a);
+        instance<3>::run(s, a);
    }} else if (a.num_splits <= 16) {{
-        kernel_runner<4>::run(s, a);
+        instance<4>::run(s, a);
    }} else if (a.num_splits <= 32) {{
-        kernel_runner<5>::run(s, a);
+        instance<5>::run(s, a);
    }} else if (a.num_splits <= 64) {{
-        kernel_runner<6>::run(s, a);
+        instance<6>::run(s, a);
    }} else if (a.num_splits <= 128) {{
-        kernel_runner<7>::run(s, a);
+        instance<7>::run(s, a);
    }}
 }}
 template<>
 std::string fmha_fwd_splitkv_combine_get_name_<trait_{F_idx}>()
 {{
-    using k_ = kernel_runner<6>::fmha_kernel; /// FIXME: choose real kernel type
+    using k_ = instance<6>::fmha_kernel; /// FIXME: choose real kernel type
    return k_::GetName();
 }}
 """

--- a/example/ck_tile/01_fmha/fmha_fwd.cpp
+++ b/example/ck_tile/01_fmha/fmha_fwd.cpp
@@ -1140,6 +1140,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
        naive_t.v_layout   = i_perm == 1 ? "bhsd" : "bshd";
        naive_t.o_layout   = o_perm == 1 ? "bhsd" : "bshd";
        naive_t.variation  = 0; // TODO?
+        naive_t.quant_algo = 0;
        ck_tile::DeviceMem o_naive_buf(o_host.get_element_space_size_in_bytes());

--- a/example/ck_tile/01_fmha/fmha_fwd.hpp
+++ b/example/ck_tile/01_fmha/fmha_fwd.hpp
--- a/example/ck_tile/01_fmha/generate.py
+++ b/example/ck_tile/01_fmha/generate.py
@@ -103,7 +103,8 @@ if __name__ == "__main__":
        required=False,
        help="codegen receipt. 0: generate only 8xhdim coverage\n"  + \
             "  1: generate more instance to cover all hdim\n"  + \
-             "  2: Only generate instance for Flash attention integration"
+             "  2: Only generate instance for Flash attention integration\n"  + \
+             "  4: Only generate instance for PyTorch integration"
    )
    args = parser.parse_args()

--- a/example/ck_tile/02_layernorm2d/CMakeLists.txt
+++ b/example/ck_tile/02_layernorm2d/CMakeLists.txt
--- a/example/ck_tile/02_layernorm2d/README.md
+++ b/example/ck_tile/02_layernorm2d/README.md
--- a/example/ck_tile/02_layernorm2d/generate.py
+++ b/example/ck_tile/02_layernorm2d/generate.py
--- a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp
+++ b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp
--- a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.hpp
+++ b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.hpp
--- a/example/ck_tile/02_layernorm2d/script/smoke_test.sh
+++ b/example/ck_tile/02_layernorm2d/script/smoke_test.sh
--- a/example/ck_tile/03_gemm/CMakeLists.txt
+++ b/example/ck_tile/03_gemm/CMakeLists.txt
--- a/example/ck_tile/03_gemm/README.md
+++ b/example/ck_tile/03_gemm/README.md
--- a/example/ck_tile/03_gemm/gemm_basic.cpp
+++ b/example/ck_tile/03_gemm/gemm_basic.cpp
--- a/example/ck_tile/03_gemm/gemm_basic.hpp
+++ b/example/ck_tile/03_gemm/gemm_basic.hpp
--- a/example/ck_tile/03_gemm/run_gemm_example.inc
+++ b/example/ck_tile/03_gemm/run_gemm_example.inc
--- a/example/ck_tile/03_gemm/script/benchmark_basic.sh
+++ b/example/ck_tile/03_gemm/script/benchmark_basic.sh