Enable sbhd for customer case v3 bwd debug

7cccb0a8 · Ye Wang · b39eecf1 · 7cccb0a8 · 7cccb0a8
Commit 7cccb0a8 authored Dec 14, 2024 by Ye Wang
Show whitespace changes
Inline Side-by-side

Showing with 36 additions and 34 deletions

example/ck_tile/01_fmha/CMakeLists.txt example/ck_tile/01_fmha/CMakeLists.txt +2 -2

example/ck_tile/01_fmha/fmha_bwd.cpp example/ck_tile/01_fmha/fmha_bwd.cpp +34 -32

No files found.
--- a/example/ck_tile/01_fmha/CMakeLists.txt
+++ b/example/ck_tile/01_fmha/CMakeLists.txt
@@ -83,11 +83,11 @@ set(EXAMPLE_FMHA_BWD_COMPILE_OPTIONS)
 # NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations
 #       ... because they are auto-generated
 if(FMHA_FWD_FAST_EXP2)
-	list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=1 -fgpu-flush-denormals-to-zero)
+	list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=1 -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=3 -fgpu-flush-denormals-to-zero)
 else()
 	list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=0)
 endif()
-list(APPEND EXAMPLE_FMHA_BWD_COMPILE_OPTIONS -Wno-undefined-func-template -fgpu-flush-denormals-to-zero)
+list(APPEND EXAMPLE_FMHA_BWD_COMPILE_OPTIONS -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=3 -Wno-undefined-func-template -fgpu-flush-denormals-to-zero)

 # conditionally enable call to the fwd_splitkv API in fmha_fwd example
 if("fwd_splitkv" IN_LIST FMHA_FWD_ENABLE_APIS)

--- a/example/ck_tile/01_fmha/fmha_bwd.cpp
+++ b/example/ck_tile/01_fmha/fmha_bwd.cpp
@@ -55,7 +55,7 @@ auto create_args(int argc, char* argv[])
        .insert("iperm",
                "1",
                "permute input\n"
-                "if true, will be b*h*s*d, else b*s*h*d")
+                "if true, will be s*b*h*d, else b*s*h*d")
        .insert("operm", "1", "permute output")
        .insert("bias",
                "n",
@@ -160,7 +160,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
    if(hdim_v < 0)
        hdim_v = hdim_q;

-    bool i_perm = arg_parser.get_bool("iperm"); // if true, will be batch * nhead * seqlen * hdim
+    bool i_perm = arg_parser.get_bool("iperm"); // if true, will be seqlen* batch * nhead * hdim
    bool o_perm = arg_parser.get_bool("operm"); // if false, will be batch * seqlen * nhead * hdim

    float scale = arg_parser.get_float("scale");
@@ -287,7 +287,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
                           ck_tile::index_t s /*seqlen*/,
                           ck_tile::index_t d /*hdim*/) {
        if(permute)
-            return std::array<ck_tile::index_t, 4>{b, h, s, d};
+            return std::array<ck_tile::index_t, 4>{s, b, h, d};
        else
            return std::array<ck_tile::index_t, 4>{b, s, h, d};
    };
@@ -417,7 +417,7 @@ bool run(const ck_tile::ArgParser& arg_parser)

    // clang-format off
    auto layout_str = [&](bool permute){
-        if (permute) return std::string("bhsd");
+        if (permute) return std::string("sbhd");
        else return std::string("bshd");
    };
    auto io_layout = [&](bool iperm_, bool operm_) {
@@ -462,41 +462,43 @@ bool run(const ck_tile::ArgParser& arg_parser)
        ///       seqlen_k] in this example, hence both the 'batch_stride_bias' &
        ///       'nhead_stride_bias' are 0.
        // setup stride_* arguments
-        const ck_tile::index_t stride_q       = (i_perm ? hdim_q : nhead * hdim_q);
-        const ck_tile::index_t stride_k       = (i_perm ? hdim_q : nhead_k * hdim_q);
-        const ck_tile::index_t stride_v       = (i_perm ? hdim_v : nhead_k * hdim_v);
+        // bshd vs sbhd (perm)
+        const ck_tile::index_t stride_q       = (i_perm ? batch * nhead * hdim_q : nhead * hdim_q);
+        const ck_tile::index_t stride_k       = (i_perm ? batch * nhead_k* hdim_q : nhead_k * hdim_q);
+        const ck_tile::index_t stride_v       = (i_perm ? batch * nhead_k * hdim_v : nhead_k * hdim_v);
        const ck_tile::index_t stride_bias    = (max_seqlen_k);
-        const ck_tile::index_t stride_o       = (o_perm ? hdim_v : nhead * hdim_v);
+        const ck_tile::index_t stride_o       = (o_perm ? batch * nhead * hdim_v : nhead * hdim_v);
        const ck_tile::index_t stride_randval = (max_seqlen_k);
-        const ck_tile::index_t stride_do      = (o_perm ? hdim_v : nhead * hdim_v);
+        const ck_tile::index_t stride_do      = (o_perm ? batch * nhead * hdim_v : nhead * hdim_v);
        const ck_tile::index_t stride_dq_acc  = hdim_q;
-        const ck_tile::index_t stride_dk      = (i_perm ? hdim_q : nhead * hdim_q);
-        const ck_tile::index_t stride_dv      = (i_perm ? hdim_v : nhead * hdim_v);
+        const ck_tile::index_t stride_dk      = (i_perm ? batch * nhead* hdim_q : nhead * hdim_q);
+        const ck_tile::index_t stride_dv      = (i_perm ? batch * nhead* hdim_v : nhead * hdim_v);
        const ck_tile::index_t stride_dbias   = (i_perm ? max_seqlen_k : nhead * max_seqlen_k);
        // setup nhead_stride_* arguments
-        const ck_tile::index_t nhead_stride_q       = (i_perm ? shape_seqlen_q * hdim_q : hdim_q);
-        const ck_tile::index_t nhead_stride_k       = (i_perm ? shape_seqlen_k * hdim_q : hdim_q);
-        const ck_tile::index_t nhead_stride_v       = (i_perm ? shape_seqlen_k * hdim_v : hdim_v);
+        const ck_tile::index_t nhead_stride_q       = hdim_q;
+        const ck_tile::index_t nhead_stride_k       = hdim_q;
+        const ck_tile::index_t nhead_stride_v       = hdim_v;
        const ck_tile::index_t nhead_stride_bias    = 0;
-        const ck_tile::index_t nhead_stride_o       = (o_perm ? shape_seqlen_q * hdim_v : hdim_v);
+        const ck_tile::index_t nhead_stride_o       = hdim_v;
        const ck_tile::index_t nhead_stride_randval = (shape_seqlen_q * max_seqlen_k);
-        const ck_tile::index_t nhead_stride_do      = (o_perm ? shape_seqlen_q * hdim_v : hdim_v);
+        const ck_tile::index_t nhead_stride_do      = hdim_v;
        const ck_tile::index_t nhead_stride_lsed    = shape_seqlen_q;
        const ck_tile::index_t nhead_stride_dq_acc  = shape_seqlen_q * hdim_q;
        const ck_tile::index_t nhead_stride_dbias =
            (i_perm ? shape_seqlen_q * max_seqlen_k : max_seqlen_k);
        // setup batch_stride_* arguments
-        const ck_tile::index_t batch_stride_q       = (nhead * shape_seqlen_q * hdim_q);
-        const ck_tile::index_t batch_stride_k       = (nhead_k * shape_seqlen_k * hdim_q);
-        const ck_tile::index_t batch_stride_v       = (nhead_k * shape_seqlen_k * hdim_v);
+        const ck_tile::index_t batch_stride_q       = (i_perm? nhead * hdim_q : nhead * shape_seqlen_q * hdim_q);
+        const ck_tile::index_t batch_stride_k       = (i_perm? nhead_k * hdim_q : nhead_k * shape_seqlen_k * hdim_q);
+        const ck_tile::index_t batch_stride_v       = (i_perm? nhead_k * hdim_v : nhead_k * shape_seqlen_k * hdim_v);
        const ck_tile::index_t batch_stride_bias    = 0;
-        const ck_tile::index_t batch_stride_o       = (nhead * shape_seqlen_q * hdim_v);
+        const ck_tile::index_t batch_stride_o       = (o_perm? nhead * hdim_v : nhead * shape_seqlen_q * hdim_v);
        const ck_tile::index_t batch_stride_randval = (nhead * shape_seqlen_q * max_seqlen_k);
-        const ck_tile::index_t batch_stride_do      = (nhead * shape_seqlen_q * hdim_v);
+        const ck_tile::index_t batch_stride_do      = (o_perm? nhead * hdim_v : nhead * shape_seqlen_q * hdim_v);
        const ck_tile::index_t batch_stride_lsed    = (nhead * shape_seqlen_q);
-        const ck_tile::index_t batch_stride_dk      = (nhead * shape_seqlen_k * hdim_q);
-        const ck_tile::index_t batch_stride_dv      = (nhead * shape_seqlen_k * hdim_v);
+        const ck_tile::index_t batch_stride_dk      = (i_perm? nhead * hdim_q : nhead * shape_seqlen_k * hdim_q);
+        const ck_tile::index_t batch_stride_dv      = (i_perm? nhead * hdim_q : nhead * shape_seqlen_k * hdim_v);
        const ck_tile::index_t batch_stride_dbias   = (nhead * shape_seqlen_q * max_seqlen_k);
+        const ck_tile::index_t batch_stride_dq_acc = (nhead*shape_seqlen_q*hdim_q);
        const ck_tile::index_t split_stride_dq_acc =
            (shape_batch * nhead * shape_seqlen_q * hdim_q);

@@ -574,7 +576,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
                             batch_stride_randval,
                             batch_stride_do,
                             batch_stride_lsed,
-                             batch_stride_q, // batch_stride_dq_acc
+                             batch_stride_dq_acc, // batch_stride_dq_acc
                             batch_stride_q, // batch_stride_dq
                             batch_stride_dk,
                             batch_stride_dv,
@@ -651,14 +653,14 @@ bool run(const ck_tile::ArgParser& arg_parser)

        // clang-format off
        // permute
-        if(i_perm) q_host_ref.ForEach([&](auto& self, auto i) { self(i) = q_host(b, i[0], i[1] + query_offset, i[2]); });
+        if(i_perm) q_host_ref.ForEach([&](auto& self, auto i) { self(i) = q_host(i[1] + query_offset, b, i[0], i[2]); });
        else       q_host_ref.ForEach([&](auto& self, auto i) { self(i) = q_host(b, i[1] + query_offset, i[0], i[2]); });

-        if(i_perm) k_host_ref.ForEach([&](auto& self, auto i) { self(i) = k_host(b, i[0] / nr, i[1] + key_offset, i[2]); });
+        if(i_perm) k_host_ref.ForEach([&](auto& self, auto i) { self(i) = k_host(i[1] + key_offset, b, i[0] / nr, i[2]); });
        else       k_host_ref.ForEach([&](auto& self, auto i) { self(i) = k_host(b, i[1] + key_offset, i[0] / nr, i[2]); });

        // v_host_ref: [nhead, hdim, seq], v_host: [b, h_k, s, d]
-        if(i_perm) v_host_ref.ForEach([&](auto& self, auto i) { self(i) = v_host(b, i[0] / nr, i[2] + key_offset, i[1]); });
+        if(i_perm) v_host_ref.ForEach([&](auto& self, auto i) { self(i) = v_host(i[2] + key_offset, b, i[0] / nr, i[1]); });
        // v_host_ref: [nhead, hdim, seq], v_host: [b, s, h_k, d]
        else       v_host_ref.ForEach([&](auto& self, auto i) { self(i) = v_host(b, i[2] + key_offset, i[0] / nr, i[1]); });
        // clang-format on
@@ -799,7 +801,7 @@ bool run(const ck_tile::ArgParser& arg_parser)

        // clang-format off
        // permute
-        if(o_perm) o_host_ref.ForEach([&](auto& self, auto idx) { o_host(b, idx[0], idx[1] + query_offset, idx[2]) = self(idx); });
+        if(o_perm) o_host_ref.ForEach([&](auto& self, auto idx) { o_host(idx[1] + query_offset, b, idx[0], idx[2]) = self(idx); });
        else       o_host_ref.ForEach([&](auto& self, auto idx) { o_host(b, idx[1] + query_offset, idx[0], idx[2]) = self(idx); });

        lse_host_ref.ForEach([&](auto& self, auto idx) { lse_host(b, idx[0], idx[1] + query_offset) = self(idx); });
@@ -856,7 +858,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
        ck_tile::HostTensor<VGradDataType> dv_host_ref({nhead, real_seqlen_k, hdim_v}); // dv_g_n_o

        // clang-format off
-        if(o_perm) do_host_ref.ForEach([&](auto& self, auto i) { self(i) = do_host(b, i[0], i[1] + query_offset, i[2]); });
+        if(o_perm) do_host_ref.ForEach([&](auto& self, auto i) { self(i) = do_host(i[1] + query_offset, b, i[0], i[2]); });
        else       do_host_ref.ForEach([&](auto& self, auto i) { self(i) = do_host(b, i[1] + query_offset, i[0], i[2]); });
        // clang-format on

@@ -936,13 +938,13 @@ bool run(const ck_tile::ArgParser& arg_parser)

        // clang-format off
        // permute
-        if(i_perm) dq_host_result.ForEach([&](auto& self, auto idx) {self(idx) = dq_host(b, idx[0], idx[1] + query_offset, idx[2]); });
+        if(i_perm) dq_host_result.ForEach([&](auto& self, auto idx) {self(idx) = dq_host(idx[1] + query_offset, b, idx[0], idx[2]); });
        else       dq_host_result.ForEach([&](auto& self, auto idx) {self(idx) = dq_host(b, idx[1] + query_offset, idx[0], idx[2]); });

-        if(i_perm) dk_host_result.ForEach([&](auto& self, auto idx) {self(idx) = dk_host(b, idx[0], idx[1] + key_offset, idx[2]); });
+        if(i_perm) dk_host_result.ForEach([&](auto& self, auto idx) {self(idx) = dk_host(idx[1] + key_offset, b, idx[0], idx[2]); });
        else       dk_host_result.ForEach([&](auto& self, auto idx) {self(idx) = dk_host(b, idx[1] + key_offset, idx[0], idx[2]); });

-        if(i_perm) dv_host_result.ForEach([&](auto& self, auto idx) {self(idx) = dv_host(b, idx[0], idx[1] + key_offset, idx[2]); });
+        if(i_perm) dv_host_result.ForEach([&](auto& self, auto idx) {self(idx) = dv_host(idx[1] + key_offset, b, idx[0], idx[2]); });
        else       dv_host_result.ForEach([&](auto& self, auto idx) {self(idx) = dv_host(b, idx[1] + key_offset, idx[0], idx[2]); });

        if(use_dbias)