@@ -14,7 +14,7 @@ In training case the mean/variance need to store out (TBD, not supported yet)

-since [prenorm/postnorm](https://arxiv.org/pdf/1906.01787) is quite common in LLM blocks, this example also support this feature. Note that `prenorm`/`postnorm` always need to fuse a `shortcut` before the actual layernorm computation, the only difference is whether to store the added element to global, `prenorm` need this. You can use `-fadd=1` to test `prenorm`(pre-add+store), or `-fadd=2` to test `postnorm`(pre-add)
+Since [prenorm/postnorm](https://arxiv.org/pdf/1906.01787) is quite common in LLM blocks, this example supports this feature through kernel fusion. Note that `prenorm`/`postnorm` always need to elementwise-add a `shortcut` before the actual layernorm computation; the only difference is that `prenorm` additionally stores the added result out to global memory. You can use `-fadd=1` to test `pre-add+store` (`prenorm`), or `-fadd=2` to test plain `pre-add` (`postnorm`).
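+
+To make the fused `pre-add(+store)` semantics concrete, here is a minimal NumPy reference sketch (illustration only, not the actual kernel; `add_layernorm_ref` and its parameter names are hypothetical):
+
+```python
+import numpy as np
+
+def add_layernorm_ref(x, shortcut, gamma, beta, eps=1e-5, store_add=True):
+    # pre-add: elementwise-add the shortcut before the layernorm
+    added = x + shortcut
+    # layernorm over the last (hidden) dimension
+    mean = added.mean(axis=-1, keepdims=True)
+    var = added.var(axis=-1, keepdims=True)
+    out = (added - mean) / np.sqrt(var + eps) * gamma + beta
+    # prenorm (-fadd=1) also writes the pre-add result back to global memory;
+    # postnorm (-fadd=2) only consumes it inside the fused kernel
+    return (out, added) if store_add else (out, None)
+```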