Merge branch 'amd-develop' into amd-master

72c9f129 · Jun Liu · 241c261f · ded0d83d · 72c9f129 · 72c9f129
Commit 72c9f129 authored Sep 20, 2024 by Jun Liu
20 changed files
--- a/example/ck_tile/01_fmha/fmha_bwd.cpp
+++ b/example/ck_tile/01_fmha/fmha_bwd.cpp
@@ -87,7 +87,11 @@ auto create_args(int argc, char* argv[])
        .insert("drop_offset", "0", "offset for random number generator")
        .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
        .insert("warmup", "5", "number of iterations before benchmark the kernel")
-        .insert("repeat", "20", "number of iterations to benchmark the kernel");
+        .insert("repeat", "20", "number of iterations to benchmark the kernel")
+        .insert("deterministic",
+                "0",
+                "if set to 1 will use multi-buffer reduction strategy for dq, atomic opeartion "
+                "will not be used");

    bool result = arg_parser.parse(argc, argv);
    return std::make_tuple(result, arg_parser);
@@ -128,11 +132,6 @@ bool run(const ck_tile::ArgParser& arg_parser)
    ck_tile::index_t hdim_v = arg_parser.get_int("d_v");
    if(hdim_v < 0)
        hdim_v = hdim_q;
-    if(hdim_q % 2 != 0 || hdim_v % 2 != 0)
-    {
-        std::cerr << "FMHA Bwd kernel currently only supports even headdim" << std::endl;
-        return false;
-    }

    bool i_perm = arg_parser.get_bool("iperm"); // if true, will be batch * nhead * seqlen * hdim
    bool o_perm = arg_parser.get_bool("operm"); // if false, will be batch * seqlen * nhead * hdim
@@ -180,6 +179,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
    int stream_warmup  = arg_parser.get_int("warmup");
    int stream_repeat  = arg_parser.get_int("repeat");
    bool kname         = arg_parser.get_bool("kname");
+    bool deterministic = arg_parser.get_bool("deterministic");

    ck_tile::stream_config stream_config{nullptr,
                                         true,
@@ -265,6 +265,9 @@ bool run(const ck_tile::ArgParser& arg_parser)
        (mode == mode_enum::batch ? seqlen_q : seqstart_q_host.back());
    const ck_tile::index_t shape_seqlen_k =
        (mode == mode_enum::batch ? seqlen_k : seqstart_k_host.back());
+    const ck_tile::index_t kN0 = (hdim_q <= 128) ? 128 : 64;
+    const ck_tile::index_t nsplits =
+        deterministic ? ck_tile::integer_divide_ceil(max_seqlen_k, kN0) : 1;

    ck_tile::HostTensor<QDataType> q_host(
        get_lengths(i_perm, shape_batch, nhead, shape_seqlen_q, hdim_q));
@@ -284,9 +287,9 @@ bool run(const ck_tile::ArgParser& arg_parser)
    ck_tile::HostTensor<ODataType> o_host(
        get_lengths(o_perm, shape_batch, nhead, shape_seqlen_q, hdim_v));
    ck_tile::HostTensor<LSEDataType> lse_host(
-        std::array<ck_tile::index_t, 3>{batch, nhead, max_seqlen_q});
+        std::array<ck_tile::index_t, 3>{shape_batch, nhead, shape_seqlen_q});
    ck_tile::HostTensor<DDataType> d_host(
-        std::array<ck_tile::index_t, 3>{batch, nhead, max_seqlen_q});
+        std::array<ck_tile::index_t, 3>{shape_batch, nhead, shape_seqlen_q});
    ck_tile::HostTensor<RandValOutputDataType> randval_host(
        p_drop > 0 ? get_lengths(true, shape_batch, nhead, shape_seqlen_q, max_seqlen_k)
                   : std::array<ck_tile::index_t, 4>{1, 1, 1, 1});
@@ -302,6 +305,10 @@ bool run(const ck_tile::ArgParser& arg_parser)
        use_dbias
            ? get_lengths(i_perm, shape_batch, nhead, shape_seqlen_q, max_seqlen_k)
            : std::array<ck_tile::index_t, 4>{1, 1, 1, 1} /* dummy shape for simplifying code */);
+    ck_tile::HostTensor<AccDataType> dq_acc_host(
+        i_perm
+            ? std::array<ck_tile::index_t, 5>{nsplits, shape_batch, nhead, shape_seqlen_q, hdim_q}
+            : std::array<ck_tile::index_t, 5>{nsplits, shape_batch, shape_seqlen_q, nhead, hdim_q});

    if(init_method == 0)
    {
@@ -362,6 +369,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
    ck_tile::DeviceMem seqstart_q(seqstart_q_host.size() * sizeof(int32_t));
    ck_tile::DeviceMem seqstart_k(seqstart_k_host.size() * sizeof(int32_t));
    ck_tile::DeviceMem alibi_slope_buf(alibi_slope_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem dq_acc_buf(dq_acc_host.get_element_space_size_in_bytes());

    q_buf.ToDevice(q_host.data());
    k_buf.ToDevice(k_host.data());
@@ -387,8 +395,17 @@ bool run(const ck_tile::ArgParser& arg_parser)
    std::cout << "[" << prec << "|" << mode << "|" << io_layout(i_perm, o_perm) << "] b:" << batch
              << ", h:" << nhead << "/" << nhead_k << ", s:" << seqlen_q << "/" << seqlen_k
              << ", d:" << hdim_q << "/" << hdim_v << ", scale:" << scale << ", bias:" << bias
-              << ", dbias:" << use_dbias << ", p_drop:" << p_drop << ", mask:" << mask
-              << std::flush;
+              << ", dbias:" << use_dbias << ", p_drop:" << p_drop << ", s_randval:" << s_randval
+              << ", deterministic:" << deterministic << ", mask:" << mask << std::flush;
+
+    std::size_t workspace_size =
+        dq_acc_host.get_element_space_size_in_bytes() * sizeof(AccDataType) / (1024 * 1024);
+
+    if(deterministic == 1)
+    {
+        std::cout << "\nDeterministic mode ON: " << workspace_size
+                  << " MByte memory workspace allocated" << std::endl;
+    }

    auto fmha_traits = fmha_bwd_traits{hdim_q,
                                       hdim_v,
@@ -397,7 +414,9 @@ bool run(const ck_tile::ArgParser& arg_parser)
                                       mask.type,
                                       bias.type,
                                       use_dbias,
-                                       p_drop > 0.0f};
+                                       p_drop > 0.0f,
+                                       s_randval,
+                                       deterministic};
    auto fmha_args   = [&]() {
        assert(nhead % nhead_k == 0);
        /// NOTE: we broadcast bias from [1, 1, seqlen_q, seqlen_k] to [batch, nhead, seqlen_q,
@@ -422,7 +441,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
        const ck_tile::index_t nhead_stride_o       = (o_perm ? shape_seqlen_q * hdim_v : hdim_v);
        const ck_tile::index_t nhead_stride_randval = (shape_seqlen_q * max_seqlen_k);
        const ck_tile::index_t nhead_stride_do      = (o_perm ? shape_seqlen_q * hdim_v : hdim_v);
-        const ck_tile::index_t nhead_stride_lsed    = max_seqlen_q;
+        const ck_tile::index_t nhead_stride_lsed    = shape_seqlen_q;
        const ck_tile::index_t nhead_stride_dbias =
            (i_perm ? shape_seqlen_q * max_seqlen_k : max_seqlen_k);
        // setup batch_stride_* arguments
@@ -433,10 +452,12 @@ bool run(const ck_tile::ArgParser& arg_parser)
        const ck_tile::index_t batch_stride_o       = (nhead * shape_seqlen_q * hdim_v);
        const ck_tile::index_t batch_stride_randval = (nhead * shape_seqlen_q * max_seqlen_k);
        const ck_tile::index_t batch_stride_do      = (nhead * shape_seqlen_q * hdim_v);
-        const ck_tile::index_t batch_stride_lsed    = (nhead * max_seqlen_q);
+        const ck_tile::index_t batch_stride_lsed    = (nhead * shape_seqlen_q);
        const ck_tile::index_t batch_stride_dk      = (nhead * shape_seqlen_k * hdim_q);
        const ck_tile::index_t batch_stride_dv      = (nhead * shape_seqlen_k * hdim_v);
        const ck_tile::index_t batch_stride_dbias   = (nhead * shape_seqlen_q * max_seqlen_k);
+        const ck_tile::index_t split_stride_dq_acc =
+            (shape_batch * nhead * shape_seqlen_q * hdim_q);

        return fmha_bwd_args{q_buf.GetDeviceBuffer(),
                             k_buf.GetDeviceBuffer(),
@@ -452,6 +473,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
                             dk_buf.GetDeviceBuffer(),
                             dv_buf.GetDeviceBuffer(),
                             dbias_buf.GetDeviceBuffer(),
+                             dq_acc_buf.GetDeviceBuffer(),
                             seqstart_q.GetDeviceBuffer(),
                             seqstart_k.GetDeviceBuffer(),
                             nullptr,
@@ -473,6 +495,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
                             stride_o,
                             stride_randval,
                             stride_do,
+                             stride_q, // stride_dq_acc
+                             stride_q, // stride_dq
                             stride_dk,
                             stride_dv,
                             stride_dbias,
@@ -484,6 +508,10 @@ bool run(const ck_tile::ArgParser& arg_parser)
                             nhead_stride_randval,
                             nhead_stride_do,
                             nhead_stride_lsed,
+                             nhead_stride_q, // nhead_stride_dq_acc
+                             nhead_stride_q, // nhead_stride_dq
+                             nhead_stride_k, // nhead_stride_dk
+                             nhead_stride_v, // nhead_stride_dv
                             nhead_stride_dbias,
                             batch_stride_q,
                             batch_stride_k,
@@ -493,15 +521,17 @@ bool run(const ck_tile::ArgParser& arg_parser)
                             batch_stride_randval,
                             batch_stride_do,
                             batch_stride_lsed,
+                             batch_stride_q, // batch_stride_dq_acc
+                             batch_stride_q, // batch_stride_dq
                             batch_stride_dk,
                             batch_stride_dv,
                             batch_stride_dbias,
+                             split_stride_dq_acc,
                             mask.left,
                             mask.right,
                             static_cast<ck_tile::index_t>(mask.type),
                             p_drop,
                             p_undrop,
-                             s_randval,
                             {drop_seed, drop_offset}};
    }();

@@ -719,7 +749,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
        if(o_perm) o_host_ref.ForEach([&](auto& self, auto idx) { o_host(b, idx[0], idx[1] + query_offset, idx[2]) = self(idx); });
        else       o_host_ref.ForEach([&](auto& self, auto idx) { o_host(b, idx[1] + query_offset, idx[0], idx[2]) = self(idx); });

-        lse_host_ref.ForEach([&](auto& self, auto idx) { lse_host(wb, idx[0], idx[1]) = self(idx); });
+        lse_host_ref.ForEach([&](auto& self, auto idx) { lse_host(b, idx[0], idx[1] + query_offset) = self(idx); });
        // clang-format on

        q_host_refs.push_back(q_host_ref);
@@ -738,6 +768,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
    lse_buf.ToDevice(lse_host.data());
    dq_buf.SetZero();
    dbias_buf.SetZero();
+    dq_acc_buf.SetZero();

    ck_tile::stream_config stream_config_v{
        nullptr, true, 0, 0, 1, arg_parser.get_str("timer") == std::string("gpu")};

--- a/example/ck_tile/01_fmha/fmha_bwd.hpp
+++ b/example/ck_tile/01_fmha/fmha_bwd.hpp
@@ -77,6 +77,7 @@ struct fmha_bwd_args
    void* dk_ptr;
    void* dv_ptr;
    void* dbias_ptr;
+    void* dq_acc_ptr;
    const void* seqstart_q_ptr;
    const void* seqstart_k_ptr;
    const void* seqlen_k_ptr;
@@ -97,6 +98,8 @@ struct fmha_bwd_args
    ck_tile::index_t stride_o;
    ck_tile::index_t stride_randval;
    ck_tile::index_t stride_do;
+    ck_tile::index_t stride_dq_acc;
+    ck_tile::index_t stride_dq;
    ck_tile::index_t stride_dk;
    ck_tile::index_t stride_dv;
    ck_tile::index_t stride_dbias;
@@ -108,6 +111,10 @@ struct fmha_bwd_args
    ck_tile::index_t nhead_stride_randval;
    ck_tile::index_t nhead_stride_do;
    ck_tile::index_t nhead_stride_lsed;
+    ck_tile::index_t nhead_stride_dq_acc;
+    ck_tile::index_t nhead_stride_dq;
+    ck_tile::index_t nhead_stride_dk;
+    ck_tile::index_t nhead_stride_dv;
    ck_tile::index_t nhead_stride_dbias;
    ck_tile::index_t batch_stride_q;
    ck_tile::index_t batch_stride_k;
@@ -117,15 +124,17 @@ struct fmha_bwd_args
    ck_tile::index_t batch_stride_randval;
    ck_tile::index_t batch_stride_do;
    ck_tile::index_t batch_stride_lsed;
+    ck_tile::index_t batch_stride_dq_acc;
+    ck_tile::index_t batch_stride_dq;
    ck_tile::index_t batch_stride_dk;
    ck_tile::index_t batch_stride_dv;
    ck_tile::index_t batch_stride_dbias;
+    ck_tile::index_t split_stride_dq_acc;
    ck_tile::index_t window_size_left;
    ck_tile::index_t window_size_right;
    ck_tile::index_t mask_type;
    float p_drop;
    float p_undrop;
-    bool s_randval;
    std::tuple<uint64_t, uint64_t> drop_seed_offset;
 };

@@ -145,10 +154,10 @@ auto fmha_bwd_dq_dk_dv_create_kargs_and_grids(fmha_bwd_args args)
                                                  args.do_ptr,
                                                  args.d_ptr,
                                                  args.rand_val_ptr,
-                                                  args.dq_ptr,
                                                  args.dk_ptr,
                                                  args.dv_ptr,
                                                  args.dbias_ptr,
+                                                  args.dq_acc_ptr,
                                                  args.seqstart_q_ptr,
                                                  args.seqstart_k_ptr,
                                                  args.seqlen_k_ptr,
@@ -163,6 +172,7 @@ auto fmha_bwd_dq_dk_dv_create_kargs_and_grids(fmha_bwd_args args)
                                                  args.stride_bias,
                                                  args.stride_randval,
                                                  args.stride_do,
+                                                  args.stride_dq_acc,
                                                  args.stride_dk,
                                                  args.stride_dv,
                                                  args.stride_dbias,
@@ -173,13 +183,15 @@ auto fmha_bwd_dq_dk_dv_create_kargs_and_grids(fmha_bwd_args args)
                                                  args.nhead_stride_randval,
                                                  args.nhead_stride_do,
                                                  args.nhead_stride_lsed,
+                                                  args.nhead_stride_dq_acc,
+                                                  args.nhead_stride_dk,
+                                                  args.nhead_stride_dv,
                                                  args.nhead_stride_dbias,
-                                                  args.batch_stride_lsed,
+                                                  args.split_stride_dq_acc,
                                                  args.window_size_left,
                                                  args.window_size_right,
                                                  args.mask_type,
                                                  args.p_drop,
-                                                  args.s_randval,
                                                  args.drop_seed_offset);
        }
        else
@@ -192,10 +204,10 @@ auto fmha_bwd_dq_dk_dv_create_kargs_and_grids(fmha_bwd_args args)
                                                  args.do_ptr,
                                                  args.d_ptr,
                                                  args.rand_val_ptr,
-                                                  args.dq_ptr,
                                                  args.dk_ptr,
                                                  args.dv_ptr,
                                                  args.dbias_ptr,
+                                                  args.dq_acc_ptr,
                                                  args.seqlen_q,
                                                  args.seqlen_k,
                                                  args.hdim_q,
@@ -209,6 +221,7 @@ auto fmha_bwd_dq_dk_dv_create_kargs_and_grids(fmha_bwd_args args)
                                                  args.stride_bias,
                                                  args.stride_randval,
                                                  args.stride_do,
+                                                  args.stride_dq_acc,
                                                  args.stride_dk,
                                                  args.stride_dv,
                                                  args.stride_dbias,
@@ -219,6 +232,9 @@ auto fmha_bwd_dq_dk_dv_create_kargs_and_grids(fmha_bwd_args args)
                                                  args.nhead_stride_randval,
                                                  args.nhead_stride_do,
                                                  args.nhead_stride_lsed,
+                                                  args.nhead_stride_dq_acc,
+                                                  args.nhead_stride_dk,
+                                                  args.nhead_stride_dv,
                                                  args.nhead_stride_dbias,
                                                  args.batch_stride_q,
                                                  args.batch_stride_k,
@@ -227,14 +243,15 @@ auto fmha_bwd_dq_dk_dv_create_kargs_and_grids(fmha_bwd_args args)
                                                  args.batch_stride_randval,
                                                  args.batch_stride_do,
                                                  args.batch_stride_lsed,
+                                                  args.batch_stride_dq_acc,
                                                  args.batch_stride_dk,
                                                  args.batch_stride_dv,
                                                  args.batch_stride_dbias,
+                                                  args.split_stride_dq_acc,
                                                  args.window_size_left,
                                                  args.window_size_right,
                                                  args.mask_type,
                                                  args.p_drop,
-                                                  args.s_randval,
                                                  args.drop_seed_offset);
        }
    }();
@@ -260,8 +277,7 @@ auto fmha_bwd_dot_do_o_create_kargs_and_grids(fmha_bwd_args args)
                                                     args.stride_o,
                                                     args.nhead_stride_do,
                                                     args.nhead_stride_o,
-                                                     args.nhead_stride_lsed,
-                                                     args.batch_stride_lsed);
+                                                     args.nhead_stride_lsed);
        }
        else
        { // create batch mode kernel arguments
@@ -286,19 +302,59 @@ auto fmha_bwd_dot_do_o_create_kargs_and_grids(fmha_bwd_args args)
    return ck_tile::make_tuple(kargs, grids);
 }

+template <typename FmhaBwdConvertQGradKernel>
+auto fmha_bwd_convert_dq_create_kargs_and_grids(fmha_bwd_args args)
+{
+    auto kargs = [&] {
+        // create group mode kernel arguments
+        if constexpr(FmhaBwdConvertQGradKernel::kIsGroupMode)
+        {
+            return FmhaBwdConvertQGradKernel::MakeKargs(args.dq_acc_ptr,
+                                                        args.dq_ptr,
+                                                        args.seqstart_q_ptr,
+                                                        args.seqstart_k_ptr,
+                                                        args.hdim_q,
+                                                        args.stride_dq,
+                                                        args.stride_dq_acc,
+                                                        args.nhead_stride_dq,
+                                                        args.nhead_stride_dq_acc,
+                                                        args.split_stride_dq_acc);
+        }
+        else
+        { // create batch mode kernel arguments
+            return FmhaBwdConvertQGradKernel::MakeKargs(args.dq_acc_ptr,
+                                                        args.dq_ptr,
+                                                        args.seqlen_q,
+                                                        args.seqlen_k,
+                                                        args.hdim_q,
+                                                        args.stride_dq,
+                                                        args.stride_dq_acc,
+                                                        args.nhead_stride_dq,
+                                                        args.nhead_stride_dq_acc,
+                                                        args.batch_stride_dq,
+                                                        args.batch_stride_dq_acc,
+                                                        args.split_stride_dq_acc);
+        }
+    }();
+
+    dim3 grids = FmhaBwdConvertQGradKernel::GridSize(args.batch, args.nhead_q, args.max_seqlen_q);
+    return ck_tile::make_tuple(kargs, grids);
+}
+
 // this is used to pattern-match internl kernel implementation, not to instantiate kernel
 template <ck_tile::index_t HDim_,
          typename DataType_,
          bool kIsGroupMode_,
          ck_tile::BlockFmhaBwdPipelineEnum FmhaBwdPipelineEnum_,
          typename FmhaMask_,
+          typename FmhaDropout_,
          ck_tile::BlockAttentionBiasEnum BiasEnum_,
          bool kHasBiasGrad_,
-          bool kHasDropout_,
          bool kPadS_,
          bool kPadSK_,
          bool kPadD_,
-          bool kPadDv_>
+          bool kPadDv_,
+          bool kIsDeterministic_>
 struct fmha_bwd_dq_dk_dv_traits_
 {
    static constexpr ck_tile::index_t HDim    = HDim_;
@@ -306,13 +362,14 @@ struct fmha_bwd_dq_dk_dv_traits_
    static constexpr bool kIsGroupMode        = kIsGroupMode_;
    static constexpr auto FmhaBwdPipelineEnum = FmhaBwdPipelineEnum_;
    using FmhaMask                            = ck_tile::remove_cvref_t<FmhaMask_>;
+    using FmhaDropout                         = ck_tile::remove_cvref_t<FmhaDropout_>;
    static constexpr auto BiasEnum            = BiasEnum_;
    static constexpr bool kHasBiasGrad        = kHasBiasGrad_;
-    static constexpr bool kHasDropout         = kHasDropout_;
    static constexpr bool kPadS               = kPadS_;
    static constexpr bool kPadSK              = kPadSK_;
    static constexpr bool kPadD               = kPadD_;
    static constexpr bool kPadDv              = kPadDv_;
+    static constexpr bool kIsDeterministic    = kIsDeterministic_;
 };

 template <typename Traits_>
@@ -343,6 +400,31 @@ void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config&, fmha_bwd_args);
 template <typename Traits_>
 std::string fmha_bwd_dot_do_o_get_name_();

+template <ck_tile::index_t HDim_,
+          typename DataType_,
+          bool kIsGroupMode_,
+          bool kPadS_,
+          bool kPadD_,
+          bool kIsDeterministic_>
+struct fmha_bwd_convert_dq_traits_
+{
+    static constexpr ck_tile::index_t HDim = HDim_;
+    using DataType                         = ck_tile::remove_cvref_t<DataType_>;
+    static constexpr bool kIsGroupMode     = kIsGroupMode_;
+    static constexpr bool kPadS            = kPadS_;
+    static constexpr bool kPadD            = kPadD_;
+    static constexpr bool kIsDeterministic = kIsDeterministic_;
+};
+
+template <typename Traits_>
+float fmha_bwd_convert_dq_(const ck_tile::stream_config&, fmha_bwd_args);
+
+template <typename Traits_>
+void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config&, fmha_bwd_args);
+
+template <typename Traits_>
+std::string fmha_bwd_convert_dq_get_name_();
+
 // This is the public API, will be generated by script
 struct fmha_bwd_traits
 {
@@ -354,6 +436,8 @@ struct fmha_bwd_traits
    bias_enum bias_type; // 0:no bias, 1:elementwise bias, 2:alibi. sync with BlockAttentionBiasEnum
    bool has_dbias;
    bool has_dropout;
+    bool is_store_randval;
+    bool is_deterministic;
    // TODO: padding check is inside this api
 };
 float fmha_bwd(fmha_bwd_traits, fmha_bwd_args, const ck_tile::stream_config&);
--- a/example/ck_tile/01_fmha/fmha_fwd.cpp
+++ b/example/ck_tile/01_fmha/fmha_fwd.cpp
--- a/example/ck_tile/01_fmha/fmha_fwd.hpp
+++ b/example/ck_tile/01_fmha/fmha_fwd.hpp
--- a/example/ck_tile/01_fmha/generate.py
+++ b/example/ck_tile/01_fmha/generate.py
@@ -5,25 +5,30 @@
 import argparse
 from enum import IntEnum
 from pathlib import Path
+import pkgutil
+import sys
 from typing import List, Optional

+import codegen.ops
 from codegen.cmake_config import *
-from codegen.ops import (
-    fmha_fwd,
-    fmha_fwd_splitkv,
-    fmha_bwd
-)


 class HandlerId(IntEnum):
    LIST_BLOBS = 0
    WRITE_BLOBS = 1

-handlers = {
-    'fwd'         : (fmha_fwd.list_blobs, fmha_fwd.write_blobs),
-    'fwd_splitkv' : (fmha_fwd_splitkv.list_blobs, fmha_fwd_splitkv.write_blobs),
-    'bwd'         : (fmha_bwd.list_blobs, fmha_bwd.write_blobs),
-}
+# inspect all modules under 'codegen.ops' and register API handlers 
+ops = []
+for importer, module_name, _ in pkgutil.iter_modules(codegen.ops.__path__):
+    full_module_name = '%s.%s' % (codegen.ops.__name__, module_name)
+    if full_module_name not in sys.modules:
+        ops.append(importer.find_spec(module_name).loader.load_module(module_name))
+unwanted_prefix = 'fmha_'
+handlers = dict(
+    [(op.__name__[len(unwanted_prefix):] if op.__name__.startswith(unwanted_prefix) else op.__name__,
+        (op.list_blobs, op.write_blobs)) for op in ops]
+)
+assert 0 < len(handlers)

 def write_blobs(output_dir: Optional[str], api_list : List[str], kernel_filter : Optional[str], receipt, mask_impl) -> None:
    if output_dir is None:

--- a/example/ck_tile/01_fmha/rotary.hpp
+++ b/example/ck_tile/01_fmha/rotary.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/host_tensor.hpp"
+
+#include <cassert>
+#include <cmath>
+#include <functional>
+#include <iterator>
+#include <optional>
+#include <random>
+#include <tuple>
+
+// keep sync with RotaryEmbeddingEnum
+enum class rope_enum
+{
+    none         = 0,
+    interleaved  = 1,
+    half_rotated = 2,
+};
+
+template <typename DataType>
+std::tuple<ck_tile::HostTensor<DataType>, ck_tile::HostTensor<DataType>>
+generate_rotary_cos_sin(ck_tile::index_t seqlen,
+                        ck_tile::index_t rotary_dim,
+                        std::optional<unsigned> seed = std::nullopt)
+{
+    // return dummy tensors if we won't apply RoPE at all
+    if(rotary_dim <= 0)
+    {
+        ck_tile::HostTensor<DataType> dummy({1, 1});
+        return std::make_tuple(dummy, dummy);
+    }
+
+    std::mt19937 random_engine(seed.has_value() ? *seed : std::random_device{}());
+    std::uniform_real_distribution<float> generator(0.0f, 1.0f);
+
+    const ck_tile::index_t num_rows = seqlen * 2;
+    const ck_tile::index_t num_cols = rotary_dim / 2;
+
+    using std::begin, std::end;
+
+    ck_tile::HostTensor<float> angle({num_rows, num_cols});
+    std::generate(begin(angle), end(angle), [&] { return generator(random_engine) * 2 * M_PI; });
+
+    ck_tile::HostTensor<DataType> cos({num_rows, num_cols});
+    std::transform(begin(angle), end(angle), begin(cos), [](float origin_value) {
+        return ck_tile::type_convert<DataType>(std::cos(origin_value));
+    });
+
+    ck_tile::HostTensor<DataType> sin({num_rows, num_cols});
+    std::transform(begin(angle), end(angle), begin(sin), [](float origin_value) {
+        return ck_tile::type_convert<DataType>(std::sin(origin_value));
+    });
+
+    return std::make_tuple(cos, sin);
+}
+
+template <typename DataType>
+std::tuple<ck_tile::HostTensor<DataType>, ck_tile::HostTensor<DataType>>
+slice_rotary_cos_sin(const ck_tile::HostTensor<DataType>& cos,
+                     const ck_tile::HostTensor<DataType>& sin,
+                     ck_tile::index_t seqlen_offset,
+                     ck_tile::index_t seqlen)
+{
+    assert(cos.get_num_of_dimension() == 2 && sin.get_num_of_dimension() == 2);
+    assert(cos.get_length(0) == sin.get_length(0) && cos.get_length(1) == sin.get_length(1));
+
+    assert(static_cast<std::size_t>(seqlen_offset + seqlen) <= cos.get_length(0));
+
+    const ck_tile::index_t num_rows = seqlen;
+    const ck_tile::index_t num_cols = cos.get_length(1);
+
+    ck_tile::HostTensor<DataType> cos_pt({num_rows, num_cols});
+    cos_pt.ForEach([&](auto& self, auto i) { self(i) = cos(i[0] + seqlen_offset, i[1]); });
+
+    ck_tile::HostTensor<DataType> sin_pt({num_rows, num_cols});
+    sin_pt.ForEach([&](auto& self, auto i) { self(i) = sin(i[0] + seqlen_offset, i[1]); });
+
+    return std::make_tuple(cos_pt, sin_pt);
+}
--- a/example/ck_tile/01_fmha/script/benchmark_bwd.sh
+++ b/example/ck_tile/01_fmha/script/benchmark_bwd.sh
 #!/bin/sh
-# TODO: run this script from CK root
-BUILD=build
-EXE=$BUILD/bin/tile_example_fmha_bwd
+# TODO: run this script from CK root or build directory
+EXE="$(find . -name tile_example_fmha_bwd -type f | head -n 1)"
 VALID=0

 for prec in "fp16" "bf16" ; do

--- a/example/ck_tile/01_fmha/script/benchmark_fwd.sh
+++ b/example/ck_tile/01_fmha/script/benchmark_fwd.sh
 #!/bin/sh
-# TODO: run this script from CK root
-BUILD=build
-EXE=$BUILD/bin/tile_example_fmha_fwd
+# TODO: run this script from CK root or build directory
+EXE="$(find . -name tile_example_fmha_fwd -type f | head -n 1)"
 VALID=0

 for prec in "fp16" "bf16" ; do

--- a/example/ck_tile/01_fmha/script/smoke_test_bwd.sh
+++ b/example/ck_tile/01_fmha/script/smoke_test_bwd.sh
 #!/bin/sh
-# TODO: run this script from CK root
-BUILD=build
-EXE=$BUILD/bin/tile_example_fmha_bwd
+# TODO: run this script from CK root or build directory
+EXE="$(find . -name tile_example_fmha_bwd -type f | head -n 1)"
 KNAME=1

 export CK_WARMUP=0
@@ -11,18 +10,19 @@ COMMON_ARGS='-v=1'
 set -x
 for prec in "fp16" "bf16" ; do
 for perm in 0 1 ; do
-for hdim in 32 64 128 ; do
+for hdim in 32 64 128 256 ; do
 for mode in 0 1 ; do
-for bias in "n" "e" "a"; do
-for dbias in 0 1 ; do
-for p_drop in 0.0 0.2; do
+for bias in "n" "a" ; do
+for dbias in 0 ; do
+for p_drop in 0.0 0.2 ; do
+for deterministic in 0 ; do

-$EXE -prec=$prec -b=1 -h=4 -h_k=2 -d=$hdim -s=259 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS
-$EXE -prec=$prec -b=2 -h=2 -d=$hdim -s=516 -s_k=253 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS
-$EXE -prec=$prec -b=1 -h=4 -h_k=1 -d=$hdim -s=500 -s_k=251 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=1 -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS
-$EXE -prec=$prec -b=1 -h=2 -d=$hdim -s=900 -s_k=258 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=2 -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS
-$EXE -prec=$prec -b=2 -h=1 -d=$hdim -s=987 -s_k=219 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=t:128,30 -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS
-$EXE -prec=$prec -b=2 -h=3 -h_k=1 -d=$hdim -s=244 -s_k=499 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=b:4,35 -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS
+$EXE -prec=$prec -b=1 -h=4 -h_k=2 -d=$hdim -s=259 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS
+$EXE -prec=$prec -b=2 -h=2 -d=$hdim -s=516 -s_k=253 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS
+$EXE -prec=$prec -b=1 -h=4 -h_k=1 -d=$hdim -s=500 -s_k=251 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=1 -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS
+$EXE -prec=$prec -b=1 -h=2 -d=$hdim -s=900 -s_k=258 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=2 -v=1 -deterministic=$deterministic -mode=$mode -kname=$KNAME $COMMON_ARGS
+$EXE -prec=$prec -b=2 -h=1 -d=$hdim -s=987 -s_k=219 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=t:128,30 -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS
+$EXE -prec=$prec -b=2 -h=3 -h_k=1 -d=$hdim -s=244 -s_k=499 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=b:4,35 -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS

 done
 done
@@ -31,4 +31,5 @@ done
 done
 done
 done
+done
 set +x
--- a/example/ck_tile/01_fmha/script/smoke_test_fwd.sh
+++ b/example/ck_tile/01_fmha/script/smoke_test_fwd.sh
--- a/example/ck_tile/01_fmha/utils.hpp
+++ b/example/ck_tile/01_fmha/utils.hpp
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
--- a/include/ck/filesystem.hpp
+++ b/include/ck/filesystem.hpp
--- a/include/ck/host_utility/device_prop.hpp
+++ b/include/ck/host_utility/device_prop.hpp
--- a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3.hpp
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_dl.hpp
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_multiple_d_xdl_cshuffle.hpp
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp