Merge branch 'gfx950' of https://github.com/ROCm/composable_kernel-internal into lwpck-2390

175a17f8 · Rostyslav Geyyer · 3e520bbd · 1504c3e8 · 175a17f8 · 175a17f8
Commit 175a17f8 authored Nov 23, 2024 by Rostyslav Geyyer
20 changed files
--- a/example/62_convnd_activ/multi_AB/CMakeLists.txt
+++ b/example/62_convnd_activ/multi_AB/CMakeLists.txt
-list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)

--- a/example/62_convnd_activ/unary/CMakeLists.txt
+++ b/example/62_convnd_activ/unary/CMakeLists.txt
-list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)

--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -54,6 +54,13 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME)
            list(REMOVE_ITEM FILE_NAME "${source}")
        endif()
    endforeach()
+    #Do not build any DPP examples if DL_KERNELS not set
+    foreach(source IN LISTS FILE_NAME)
+        if(NOT DEFINED DL_KERNELS AND source MATCHES "_dpp")
+            message("removing dpp example ${source} ")
+            list(REMOVE_ITEM FILE_NAME "${source}")
+        endif()
+    endforeach()
    #Do not build any XDL examples if gfx9 targets are not on the list
    foreach(source IN LISTS FILE_NAME)
        if(NOT EX_TARGETS MATCHES "gfx9" AND source MATCHES "_xdl")
@@ -85,7 +92,7 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME)
    #only continue if there are some source files left on the list
    if(FILE_NAME)
        if(FILE_NAME MATCHES "_xdl")
-            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
+            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
        elseif(FILE_NAME MATCHES "_wmma")
            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx950)
        endif()
@@ -169,7 +176,7 @@ function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME)
    #only continue if there are some source files left on the list
    if(FILE_NAME)
        if(FILE_NAME MATCHES "_xdl")
-            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
+            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
        elseif(FILE_NAME MATCHES "_wmma")
            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx950)
        endif()

--- a/example/ck_tile/01_fmha/script/smoke_test_fwd.sh
+++ b/example/ck_tile/01_fmha/script/smoke_test_fwd.sh
@@ -29,14 +29,14 @@ while getopts ":sa" opt; do
 done

 run_fp16_bf16_tests() {
-    local NUM_SPLITS=(1)
-    local PAGE_BLOCK_SIZE=(0)
-    local CACHE_BATCH_IDX=(0)
+    local NUM_SPLITS="1"
+    local PAGE_BLOCK_SIZE="0"
+    local CACHE_BATCH_IDX="0"

    if [ $TEST_SPLITKV -eq 1 ] ; then
-        NUM_SPLITS+=(2 3)
-        PAGE_BLOCK_SIZE+=(128)
-        CACHE_BATCH_IDX+=(1)
+        NUM_SPLITS="$NUM_SPLITS 2 3"
+        PAGE_BLOCK_SIZE="$PAGE_BLOCK_SIZE 128"
+        CACHE_BATCH_IDX="$CACHE_BATCH_IDX 1"
    fi

    for prec in "fp16" "bf16" ; do
@@ -47,9 +47,9 @@ run_fp16_bf16_tests() {
    for lse in 0 1 ; do
    for bias in "n" "e" "a" ; do
    for p_drop in 0.0 0.2 ; do
-    for num_splits in "${NUM_SPLITS[@]}" ; do
-    for page_block_size in "${PAGE_BLOCK_SIZE[@]}" ; do
-    for cache_batch_idx in "${CACHE_BATCH_IDX[@]}" ; do
+    for num_splits in $NUM_SPLITS ; do
+    for page_block_size in $PAGE_BLOCK_SIZE ; do
+    for cache_batch_idx in $CACHE_BATCH_IDX ; do

    # $EXE -prec=$prec -mode=$mode -b=1 -h=1 -d=$hdim -s=1024 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -vlayout=$vlayout -num_splits=$num_splits -page_block_size=$page_block_size -kname=$KNAME $COMMON_ARGS  
    $EXE -prec=$prec -mode=$mode -b=2 -h=2 -h_k=1 -d=16, -d_v=$hdim -s=55 -s_k=256 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -vlayout=$vlayout -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
@@ -103,4 +103,4 @@ if [ $TEST_APPENDKV -eq 1 ] ; then
    run_fp16_appendkv_tests
 fi

-set +x
\ No newline at end of file
+set +x
--- a/example/ck_tile/02_layernorm2d/generate.py
+++ b/example/ck_tile/02_layernorm2d/generate.py
@@ -57,6 +57,7 @@ template <typename XDataType_,
          ck_tile::index_t Vector_N_,         // vector size along N
          bool kPadN_,
          bool kSaveMeanInvStd_,
+          bool kFastFDiv_,
          bool kTwoPass_,
          ck_tile::index_t kFusedAdd_ = 0,
          ck_tile::index_t kFusedQuant_ = 0>
@@ -118,6 +119,7 @@ struct layernorm2d_fwd_traits_

    static constexpr bool kPadN           = kPadN_;
    static constexpr bool kSaveMeanInvStd = kSaveMeanInvStd_;
+    static constexpr bool kFastFDiv       = kFastFDiv_;
    static constexpr bool kTwoPass        = kTwoPass_;
    static constexpr ck_tile::index_t kFusedAdd = kFusedAdd_;
    static constexpr ck_tile::index_t kFusedQuant = kFusedQuant_;
@@ -134,6 +136,7 @@ template <typename XDataType_,
          ck_tile::index_t Vector_N_,         // vector size along N
          bool kPadN_,
          bool kSaveMeanInvStd_,
+          bool kFastFDiv_,
          bool kTwoPass_,
          int  kFusedAdd_,
          int  kFusedQuant_>
@@ -148,6 +151,7 @@ using traits_ = layernorm2d_fwd_traits_<XDataType_,
                                       Vector_N_,
                                       kPadN_,
                                       kSaveMeanInvStd_,
+                                       kFastFDiv_,
                                       kTwoPass_,
                                       kFusedAdd_,
                                       kFusedQuant_>;
@@ -179,6 +183,7 @@ float layernorm2d_fwd_(const S& s, A a)

    using PipelineTraits = ck_tile::Layernorm2dFwdTraits<Traits_::kPadN,
        Traits_::kSaveMeanInvStd,
+        Traits_::kFastFDiv,
        Traits_::kTwoPass,
        static_cast<ck_tile::Layernorm2dFusedAddEnum>(Traits_::kFusedAdd),
        static_cast<ck_tile::Layernorm2dFusedQuantEnum>(Traits_::kFusedQuant)>;
@@ -269,7 +274,7 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
 #include "layernorm2d_fwd_api_common.hpp"

 // clang-format off
-//                                      prec_i           prec_o           prec_sy           rm  rn  tm    tn  vn  pd     mv     2p      add  sweep
+//                                      prec_i           prec_o           prec_sy           rm  rn  tm    tn  vn  pd     mv    rpcf    2p      add  sweep
 {F_instance_def}
 // clang-format on

@@ -356,6 +361,7 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
        F_Vector_N : int
        F_kPadN : bool
        F_kSaveMeanInvStd_ : bool
+        F_kFastFDiv_ : bool
        F_kTwoPass_ : bool
        F_kFusedAdd : int
        F_kFusedQuant : int
@@ -363,7 +369,7 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
        @property
        def trait_name(self) ->str:
            t_ = f'{DATA_TYPE_MAP[self.F_XDataType]}, {DATA_TYPE_MAP[self.F_YDataType]}, {DATA_TYPE_MAP[self.F_XScaleDataType]}, {DATA_TYPE_MAP[self.F_YScaleDataType]}, {self.F_Repeat_M:2}, {self.F_Repeat_N:2}, {self.F_ThreadPerBlock_M:2}, {self.F_ThreadPerBlock_N:4}'
-            t_ += f', {self.F_Vector_N:2}, {BOOL_MAP(self.F_kPadN):5}, {BOOL_MAP(self.F_kSaveMeanInvStd_):5}'
+            t_ += f', {self.F_Vector_N:2}, {BOOL_MAP(self.F_kPadN):5}, {BOOL_MAP(self.F_kSaveMeanInvStd_):5}, {BOOL_MAP(self.F_kFastFDiv_):5}'
            t_ += f', {BOOL_MAP(self.F_kTwoPass_):5}, {self.F_kFusedAdd:4}, {self.F_kFusedQuant:4}'
            return t_

@@ -483,52 +489,55 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
        fused_add_list = [0, 1]
        fused_sweep_list = [0, 1] # NOTE: only single pass can use fused dynamic quant

-        #                                                       rm  rn  tm   tn  vn  pd     mv     2p     add    sweep
-        h_trait_dict = {'64'  : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 1,  True,  False, False,   0,    0)],
-                        '128' : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 2,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  4,  64, 1,  True,  False, False,   0,    0)],
-                        '256' : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 4,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  4,  64, 2,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  4,  64, 1,  True,  False, False,   0,    0)],
-                        '512' : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 8,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  4,  64, 4,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  4,  64, 2,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  8,  4,  64, 1,  True,  False, False,   0,    0)],
-                        '768' : [ h_traits('x', 'y', 'xs', 'ys', 1,  3,  4,  64, 4,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  4,  64, 2,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1, 12,  4,  64, 1,  True,  False, False,   0,    0)],
-                        '1024' :[ h_traits('x', 'y', 'xs', 'ys', 1,  1,  2, 128, 8,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  2, 128, 4,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  2, 128, 2,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 1,  True,  False, False,   0,    0)],
-                        '1536' :[ h_traits('x', 'y', 'xs', 'ys', 1,  3,  4,  64, 8,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  2, 128, 4,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 256, 2,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  1, 256, 1,  True,  False, False,   0,    0)],
-                        '2048' :[ h_traits('x', 'y', 'xs', 'ys', 1,  1,  1, 256, 8,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  1, 256, 4,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 2,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  8,  1, 256, 1,  True,  False, False,   0,    0)],
-                        '3072' :[ h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 128, 8,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 256, 4,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  1, 256, 2,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1,1024, 1,  True,  False, False,   0,    0)],
-                        '4096' :[ h_traits('x', 'y', 'xs', 'ys', 1,  2,  1, 256, 8,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 4,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  1,1024, 2,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1,1024, 1,  True,  False, False,   0,    0)],
-                        '6144' :[ h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 256, 8,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 512, 4,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1,1024, 2,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  1,1024, 1,  True,  False, False,   0,    0)],
-                        '8192' :[ h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 8,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 512, 4,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1,1024, 2,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  8,  1,1024, 1,  True,  False, False,   0,    0)],
-                        'big'  :[ h_traits('x', 'y', 'xs', 'ys', 1,  2,  1, 256, 8,  True,  False,  True,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 4,  True,  False,  True,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  1,1024, 2,  True,  False,  True,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1,1024, 1,  True,  False,  True,   0,    0)]}
+        #                                                       rm  rn  tm   tn  vn  pd     mv     fdiv  2p     add    sweep
+        h_trait_dict = {'64'  : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  8,  8,  8,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  16, 4,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 1,  True,  False, True, False,   0,    0)],
+                        '128' : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  16, 8,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 2,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  4,  64, 1,  True,  False, True, False,   0,    0)],
+                        '256' : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 4,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  4,  64, 2,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  4,  64, 1,  True,  False, True, False,   0,    0)],
+                        '512' : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 8,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  4,  64, 4,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  4,  64, 2,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  8,  4,  64, 1,  True,  False, True, False,   0,    0)],
+                        '768' : [ h_traits('x', 'y', 'xs', 'ys', 1,  3,  4,  64, 4,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  4,  64, 2,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 12,  4,  64, 1,  True,  False, True, False,   0,    0)],
+                        '1024' :[ h_traits('x', 'y', 'xs', 'ys', 1,  1,  2, 128, 8,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  2, 128, 4,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  2, 128, 2,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 1,  True,  False, True, False,   0,    0)],
+                        '1536' :[ h_traits('x', 'y', 'xs', 'ys', 1,  3,  4,  64, 8,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  2, 128, 4,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 256, 2,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  1, 256, 1,  True,  False, True, False,   0,    0)],
+                        '2048' :[ h_traits('x', 'y', 'xs', 'ys', 1,  1,  1, 256, 8,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  1, 256, 4,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 2,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  8,  1, 256, 1,  True,  False, True, False,   0,    0)],
+                        '3072' :[ h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 128, 8,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 256, 4,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  1, 256, 2,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1,1024, 1,  True,  False, True, False,   0,    0)],
+                        '4096' :[ h_traits('x', 'y', 'xs', 'ys', 1,  2,  1, 256, 8,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 4,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  1,1024, 2,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1,1024, 1,  True,  False, True, False,   0,    0)],
+                        '6144' :[ h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 256, 8,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 512, 4,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1,1024, 2,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  1,1024, 1,  True,  False, True, False,   0,    0)],
+                        '8192' :[ h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 8,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 512, 4,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1,1024, 2,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  8,  1,1024, 1,  True,  False, True, False,   0,    0)],
+                        'big'  :[ h_traits('x', 'y', 'xs', 'ys', 1,  2,  1, 256, 8,  True,  False, True,  True,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 4,  True,  False, True,  True,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  1,1024, 2,  True,  False, True,  True,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1,1024, 1,  True,  False, True,  True,   0,    0)]}
        total_blob = list()
        for hs_key in h_trait_dict:
            hs = h_trait_dict[hs_key]

--- a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp
+++ b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp
@@ -25,7 +25,10 @@ auto create_args(int argc, char* argv[])
    ck_tile::ArgParser arg_parser;
    arg_parser.insert("m", "3328", "m dimension")
        .insert("n", "4096", "n dimension")
-        .insert("stride", "-1", "stride per row, if -1 then equal to n")
+        .insert("x_stride", "-1", "x row_stride, if -1 then equal to n")
+        .insert("xr_stride", "-1", "x residule row_stride, if -1 then equal to n")
+        .insert("y_stride", "-1", "y row_stride, if -1 then equal to n")
+        .insert("yr_stride", "-1", "y residule row_stride, if -1 then equal to n")
        .insert("e", "1e-5", "epsilon")
        .insert("save_mv", "0", "save mean/variance(invstd) or not. set to 1 in training case")
        .insert("v", "1", "cpu validation or not")
@@ -54,11 +57,20 @@ template <typename InDataType,
          bool SaveMeanVar>
 bool run(const ck_tile::ArgParser& arg_parser)
 {
-    ck_tile::index_t m      = arg_parser.get_int("m");
-    ck_tile::index_t n      = arg_parser.get_int("n");
-    ck_tile::index_t stride = arg_parser.get_int("stride");
-    if(stride < 0)
-        stride = n;
+    ck_tile::index_t m        = arg_parser.get_int("m");
+    ck_tile::index_t n        = arg_parser.get_int("n");
+    ck_tile::index_t x_stride = arg_parser.get_int("x_stride");
+    if(x_stride < 0)
+        x_stride = n;
+    ck_tile::index_t xr_stride = arg_parser.get_int("xr_stride");
+    if(xr_stride < 0)
+        xr_stride = n;
+    ck_tile::index_t y_stride = arg_parser.get_int("y_stride");
+    if(y_stride < 0)
+        y_stride = n;
+    ck_tile::index_t yr_stride = arg_parser.get_int("yr_stride");
+    if(yr_stride < 0)
+        yr_stride = n;
    float epsilon       = arg_parser.get_float("e");
    std::string prec_i  = arg_parser.get_str("prec_i");
    std::string prec_o  = arg_parser.get_str("prec_o");
@@ -89,7 +101,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
        return false;
    }

-    assert(stride >= n);
+    assert(x_stride >= n);

    using TypeConfig = LayerNormTypeConfig<InDataType, OutDataType, XScaleDataType, YScaleDataType>;

@@ -108,15 +120,15 @@ bool run(const ck_tile::ArgParser& arg_parser)
    using ComputeDataType = typename TypeConfig::ComputeDataType;

    // host verify
-    ck_tile::HostTensor<XDataType> x_host({m, n}, {stride, 1});
+    ck_tile::HostTensor<XDataType> x_host({m, n}, {x_stride, 1});
    ck_tile::HostTensor<GammaDataType> gamma_host({n});
    ck_tile::HostTensor<BetaDataType> beta_host({n});

-    ck_tile::HostTensor<XResidualDataType> x_residual_host({m, n}, {stride, 1});
-    ck_tile::HostTensor<YResidualDataType> y_residual_host({m, n}, {stride, 1});
+    ck_tile::HostTensor<XResidualDataType> x_residual_host({m, n}, {xr_stride, 1});
+    ck_tile::HostTensor<YResidualDataType> y_residual_host({m, n}, {yr_stride, 1});

-    ck_tile::HostTensor<YDataType> y_host_ref({m, n}, {stride, 1});
-    ck_tile::HostTensor<YDataType> y_host_dev({m, n}, {stride, 1});
+    ck_tile::HostTensor<YDataType> y_host_ref({m, n}, {y_stride, 1});
+    ck_tile::HostTensor<YDataType> y_host_dev({m, n}, {y_stride, 1});

    ck_tile::HostTensor<MeanDataType> mean_host_ref({m});
    ck_tile::HostTensor<InvStdDataType> invStd_host_ref({m});
@@ -162,7 +174,9 @@ bool run(const ck_tile::ArgParser& arg_parser)
    }();

    std::cout << "[" << prec_str << "]"
-              << " m:" << m << ", n:" << n << ", stride:" << stride << std::flush;
+              << " m:" << m << ", n:" << n << ", x_stride:" << x_stride
+              << ", xr_stride:" << xr_stride << ", y_stride:" << y_stride
+              << ", yr_stride:" << yr_stride << std::flush;

    layernorm2d_fwd_traits traits{
        prec_i, prec_o, prec_sx, prec_sy, SaveMeanVar, fused_add, fused_quant};
@@ -182,7 +196,10 @@ bool run(const ck_tile::ArgParser& arg_parser)
                              epsilon,
                              m,
                              n,
-                              stride};
+                              x_stride,   // x row_stride
+                              xr_stride,  // x residule row stride
+                              y_stride,   // y row stride
+                              yr_stride}; // y residule row stride

    float ave_time = layernorm2d_fwd(
        traits, args, ck_tile::stream_config{nullptr, true, kname ? 1 : 0, warmup, repeat});
@@ -285,7 +302,7 @@ bool run(const ck_tile::ArgParser& arg_parser)

        y_buf.FromDevice(y_host_dev.data());

-        ck_tile::HostTensor<YResidualDataType> y_residual_host_dev({m, n}, {stride, 1});
+        ck_tile::HostTensor<YResidualDataType> y_residual_host_dev({m, n}, {yr_stride, 1});
        if(fused_add == 1)
        {
            y_residual_buf.FromDevice(y_residual_host_dev.data());
@@ -293,7 +310,7 @@ bool run(const ck_tile::ArgParser& arg_parser)

        auto [rtol, atol] = get_elimit<InDataType>();

-        if(stride == n)
+        if(x_stride == n)
        {
            pass = ck_tile::check_err(
                y_host_dev, y_host_ref, std::string("OUT Error: Incorrect results!"), rtol, atol);
@@ -310,10 +327,10 @@ bool run(const ck_tile::ArgParser& arg_parser)
        {
            for(int i_r = 0; i_r < m; i_r++)
            {
-                std::vector<YDataType> y_host_dev_row(y_host_dev.begin() + i_r * stride,
-                                                      y_host_dev.begin() + i_r * stride + n);
-                std::vector<YDataType> y_host_ref_row(y_host_ref.begin() + i_r * stride,
-                                                      y_host_ref.begin() + i_r * stride + n);
+                std::vector<YDataType> y_host_dev_row(y_host_dev.begin() + i_r * y_stride,
+                                                      y_host_dev.begin() + i_r * y_stride + n);
+                std::vector<YDataType> y_host_ref_row(y_host_ref.begin() + i_r * y_stride,
+                                                      y_host_ref.begin() + i_r * y_stride + n);
                pass &= ck_tile::check_err(y_host_dev_row,
                                           y_host_ref_row,
                                           std::string("OUT[") + std::to_string(i_r) +
@@ -323,10 +340,10 @@ bool run(const ck_tile::ArgParser& arg_parser)
                if(fused_add == 1)
                {
                    std::vector<YResidualDataType> y_residual_host_dev_row(
-                        y_residual_host_dev.begin() + i_r * stride,
-                        y_residual_host_dev.begin() + i_r * stride + n);
+                        y_residual_host_dev.begin() + i_r * yr_stride,
+                        y_residual_host_dev.begin() + i_r * yr_stride + n);
                    std::vector<YResidualDataType> y_residual_host_ref_row(
-                        x_host.begin() + i_r * stride, x_host.begin() + i_r * stride + n);
+                        x_host.begin() + i_r * yr_stride, x_host.begin() + i_r * yr_stride + n);
                    pass &= ck_tile::check_err(y_residual_host_dev_row,
                                               y_residual_host_ref_row,
                                               std::string("ADD[") + std::to_string(i_r) +

--- a/example/ck_tile/03_gemm/README.md
+++ b/example/ck_tile/03_gemm/README.md
@@ -8,7 +8,10 @@ This folder contains example for GEMM using ck_tile tile-programming implementat
 mkdir build && cd build
 # you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
 sh ../script/cmake-ck-dev.sh  ../ <arch>
+# The basic pipeline method on the gemm calculation
 make tile_example_gemm_basic -j
+# The memory bound pipeline on the gemm calculation
+make tile_example_gemm_mem_pipeline -j
 ```
 This will result in an executable `build/bin/tile_example_gemm_basic`


--- a/example/ck_tile/03_gemm/gemm_basic.cpp
+++ b/example/ck_tile/03_gemm/gemm_basic.cpp
@@ -17,10 +17,11 @@
 template <typename ALayout, typename BLayout, typename CLayout>
 float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
 {
-    // The kPadA, kPadB, kPadC & kBlockPerCu should also come from the Codegen part.
-    constexpr bool kPadA        = true;
-    constexpr bool kPadB        = true;
-    constexpr bool kPadC        = true;
+    // The kPadM, kPadN, kPadK & kBlockPerCu should also come from the Codegen part.
+    constexpr bool kPadM = false;
+    constexpr bool kPadN = false;
+    constexpr bool kPadK = false;
+
    constexpr bool kTilePermute = false;
    // The rank and permutation will also be generate out by the CodeGen part.
    constexpr ck_tile::index_t kOutputRank = 2;
@@ -56,8 +57,8 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
        CShuffleEpilogue,
        ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<AccDataType,
                                                                   CDataType,
-                                                                   kPadA,
-                                                                   kPadB,
+                                                                   kPadM,
+                                                                   kPadN,
                                                                   kTilePermute,
                                                                   kOutputRank,
                                                                   1,
@@ -65,13 +66,13 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
                                                                   TilePartitioner::kM,
                                                                   TilePartitioner::kN>>,
        ck_tile::Default2DEpilogue<
-            ck_tile::Default2DEpilogueProblem<AccDataType, CDataType, kPadA, kPadB>>>;
+            ck_tile::Default2DEpilogueProblem<AccDataType, CDataType, kPadM, kPadN>>>;

    using CodegenGemmTraits =
-        ck_tile::TileGemmTraits<kPadA, kPadB, kPadC, ALayout, BLayout, CLayout>;
+        ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;
    using CodegenPipelineProblem = ck_tile::
        GemmPipelineProblem<ADataType, BDataType, AccDataType, CodegenGemmShape, CodegenGemmTraits>;
-    using CodegenGemmPolicy = ck_tile::UniversalGemmPipelineAgBgCrPolicy<ALayout, BLayout, CLayout>;
+    using CodegenGemmPolicy = ck_tile::UniversalGemmPipelineAgBgCrPolicy;
    using CodegenGemmPipeline =
        ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem, CodegenGemmPolicy>;
    // ToDo: Will add the codegen part to test different pipeline policies in GEMM.

--- a/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp
+++ b/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp
@@ -31,9 +31,9 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
    constexpr ck_tile::index_t K_Warp_Tile = 8;

    // The kPadA, kPadB, kPadC & kBlockPerCu should also come from the Codegen part.
-    constexpr bool kPadA = true;
-    constexpr bool kPadB = true;
-    constexpr bool kPadC = true;
+    constexpr bool kPadM = true;
+    constexpr bool kPadN = true;
+    constexpr bool kPadK = true;

    constexpr int kBlockPerCu = 1;

@@ -46,9 +46,9 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
    using TilePartitioner = ck_tile::GemmTilePartitioner<GemmShape>;

    using GemmEpilogue = ck_tile::Default2DEpilogue<
-        ck_tile::Default2DEpilogueProblem<AccDataType, CDataType, false, kPadC>>;
+        ck_tile::Default2DEpilogueProblem<AccDataType, CDataType, kPadM, kPadN>>;

-    using Traits = ck_tile::TileGemmTraits<kPadA, kPadB, kPadC, ALayout, BLayout, CLayout>;
+    using Traits = ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;

    using BaseGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrMem<
        ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>>;

--- a/example/ck_tile/13_moe_sorting/CMakeLists.txt
+++ b/example/ck_tile/13_moe_sorting/CMakeLists.txt
+add_executable(tile_example_moe_sorting EXCLUDE_FROM_ALL moe_sorting.cpp moe_sorting_api.cpp)
+target_include_directories(tile_example_moe_sorting PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/)
+
+set(EXAMPLE_MOE_SORTING_COMPILE_OPTIONS)
+# NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations
+list(APPEND EXAMPLE_MOE_SORTING_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal)
+# list(APPEND EXAMPLE_MOE_SORTING_COMPILE_OPTIONS -v --save-temps -Wno-gnu-line-marker)
+target_compile_options(tile_example_moe_sorting PRIVATE ${EXAMPLE_MOE_SORTING_COMPILE_OPTIONS})
--- a/example/ck_tile/13_moe_sorting/README.md
+++ b/example/ck_tile/13_moe_sorting/README.md
+# moe-sorting
+
+This folder contains example for moe-sorting kernel using ck_tile tile-programming implementation. This kernel is often used in Moe model, before launching the fused-moe-gemm block. The input&weight is a `token*topk` 2d matrix. The op rearange the input weight ids into different experts and feed into fuse moe gemm kernel.
+
+## build
+```
+# in the root of ck_tile
+mkdir build && cd build
+sh ../script/cmake-ck-dev.sh  ../ <arch>  # you can replace this <arch> to gfx90a, gfx942...
+make tile_example_moe_sorting -j
+```
+This will result in an executable `build/bin/tile_example_moe_sorting`
+
+## example
+```
+args:
+          -v    weather do CPU validation or not (default:1)
+       -pr_i    index data type. (currently only fp32 supported now) (default:int32)
+       -pr_w    output weight data type(currently only fp32 supported now) (default:fp32)
+          -t    number of input tokens (default:32)
+          -e    number of experts (default:8)
+          -k    topk (default:2)
+       -st_i    row stride of input, -1 means same as experts (default:-1)
+       -seed    seed to be used, -1 means random every time (default:-1)
+      -kname    when set to 1 it will print kernel name (default:0)
+
+```
--- a/example/ck_tile/13_moe_sorting/moe_sorting.cpp
+++ b/example/ck_tile/13_moe_sorting/moe_sorting.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <set>
+#include <vector>
+#include <iostream>
+#include <numeric>
+#include <cassert>
+#include <cstdlib>
+#include <iostream>
+#include <time.h>
+#include <unordered_set>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/reduce.hpp"
+#include "moe_sorting_api.hpp"
+
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("v", "1", "weather do CPU validation or not")
+        .insert("pr_i", "int32", "index data type. (currently only int32 supported now)")
+        .insert("pr_w", "fp32", "output weight data type(currently only fp32 supported now)")
+        .insert("t", "128", "number of input tokens")
+        .insert("e", "8", "number of num_experts")
+        .insert("k", "4", "topk")
+        .insert("unit", "32", "unit_size")
+        .insert("moe_buf_size", "0", "moe_buf_size")
+        .insert("seed", "-1", "seed to be used, -1 means random every time")
+        .insert("kname", "0", "when set to 1 it will print kernel name")
+        .insert("warmup", "5", "number of iterations before benchmark the kernel")
+        .insert("repeat", "20", "number of iterations to benchmark the kernel");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+
+template <typename IndexType>
+void topid_unique_gen(
+    std::vector<IndexType>& host_tensor, int tokens, int topk, int num_expert, int seed)
+{
+    size_t total_size = topk * tokens;
+    std::srand(seed);
+    std::set<IndexType> unique_set;
+    IndexType current_v;
+    for(size_t i = 0; i < total_size; i++)
+    {
+        if(i % topk == 0)
+        {
+            unique_set.clear();
+        }
+        current_v = std::rand() % num_expert;
+        while(unique_set.find(current_v) != unique_set.end())
+        {
+            current_v = std::rand() % num_expert;
+        }
+        unique_set.insert(current_v);
+        host_tensor[i] = current_v;
+    }
+}
+
+template <typename WeightType, typename IndexType = ck_tile::index_t>
+bool test_moe_sorting(ck_tile::ArgParser args)
+{
+    int validate            = args.get_int("v");
+    std::string index_prec  = args.get_str("pr_i");
+    std::string weight_prec = args.get_str("pr_w");
+    int tokens              = args.get_int("t");
+    int num_experts         = args.get_int("e");
+    int topk                = args.get_int("k");
+    int seed                = args.get_int("seed");
+    int unit_size           = args.get_int("unit");
+    int moe_buf_size        = args.get_int("moe_buf_size");
+    int kname               = args.get_int("kname");
+    int warmup              = args.get_int("warmup");
+    int repeat              = args.get_int("repeat");
+    int max_output_ids =
+        ck_tile::integer_least_multiple(topk * tokens + num_experts * unit_size - topk, unit_size);
+
+    if(seed < 0)
+    {
+        seed = std::time(nullptr);
+    }
+
+    if(topk > num_experts)
+    {
+        printf("topk:%d value should be smaller than, or equal to number of num_experts:%d\n",
+               topk,
+               num_experts);
+        return false;
+    }
+
+    // tokens already considered batch size
+    ck_tile::HostTensor<IndexType> topk_ids_host({tokens, topk}, {topk, 1});
+    ck_tile::HostTensor<WeightType> weights_host({tokens, topk}, {topk, 1});
+    ck_tile::HostTensor<IndexType> sorted_ids_host({max_output_ids}, {1});
+    ck_tile::HostTensor<WeightType> sorted_weights_host({max_output_ids}, {1});
+    ck_tile::HostTensor<IndexType> sorted_expert_ids_host({max_output_ids / unit_size}, {1});
+    ck_tile::HostTensor<IndexType> sorted_id_cnt_host({1}, {1});
+    ck_tile::HostTensor<float> moe_buf_host({moe_buf_size});
+
+    ck_tile::FillUniformDistribution<WeightType>{-.5f, .5f}(weights_host);
+    ck_tile::FillUniformDistribution<WeightType>{-.5f, .5f}(moe_buf_host);
+    topid_unique_gen<IndexType>(topk_ids_host.mData, tokens, topk, num_experts, seed);
+
+    ck_tile::DeviceMem topk_ids_dev(topk_ids_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem weights_dev(weights_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem sorted_ids_dev(sorted_ids_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem sorted_weights_dev(sorted_weights_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem sorted_expert_ids_dev(
+        sorted_expert_ids_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem sorted_id_cnt_dev(sorted_id_cnt_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem moe_buf_dev(moe_buf_host.get_element_space_size_in_bytes());
+
+    topk_ids_dev.ToDevice(topk_ids_host.data());
+    weights_dev.ToDevice(weights_host.data());
+    if(moe_buf_size > 0)
+    {
+        moe_buf_dev.ToDevice(moe_buf_host.data());
+    }
+
+    moe_sorting_trait trait{index_prec, weight_prec};
+
+    moe_sorting_args karg{topk_ids_dev.GetDeviceBuffer(),
+                          weights_dev.GetDeviceBuffer(),
+                          sorted_ids_dev.GetDeviceBuffer(),
+                          sorted_weights_dev.GetDeviceBuffer(),
+                          sorted_expert_ids_dev.GetDeviceBuffer(),
+                          sorted_id_cnt_dev.GetDeviceBuffer(),
+                          moe_buf_size > 0 ? moe_buf_dev.GetDeviceBuffer() : nullptr,
+                          tokens,
+                          unit_size,
+                          num_experts,
+                          topk,
+                          static_cast<ck_tile::index_t>(moe_buf_size * sizeof(float))};
+
+    ck_tile::stream_config sc{nullptr,
+                              true,
+                              /* log_level = */ (kname ? 1 : 0),
+                              warmup,
+                              repeat};
+    auto ms = moe_sorting(trait, karg, sc);
+    printf("[%s|%s]tokens:%d, num_experts:%d, topk:%d,  ms:%f , ",
+           index_prec.c_str(),
+           weight_prec.c_str(),
+           tokens,
+           num_experts,
+           topk,
+           ms);
+    if(ms < 0)
+        printf("not supported\n");
+    fflush(stdout);
+    if(ms < 0)
+    {
+        return false;
+    }
+
+    sorted_ids_dev.FromDevice(sorted_ids_host.data());
+    sorted_weights_dev.FromDevice(sorted_weights_host.data());
+    sorted_expert_ids_dev.FromDevice(sorted_expert_ids_host.data());
+    sorted_id_cnt_dev.FromDevice(sorted_id_cnt_host.data());
+    if(moe_buf_size > 0)
+    {
+        moe_buf_dev.FromDevice(moe_buf_host.data());
+    }
+
+    bool rtn = true;
+    if(validate)
+    {
+        ck_tile::HostTensor<IndexType> sorted_ids_ref({max_output_ids}, {1});
+        ck_tile::HostTensor<WeightType> sorted_weights_ref({max_output_ids}, {1});
+        ck_tile::HostTensor<IndexType> sorted_expert_ids_ref({max_output_ids / unit_size}, {1});
+
+        int32_t ref_total_tokens_post_pad = 0;
+        ck_tile::reference_moe_sorting<WeightType, IndexType>(topk_ids_host,
+                                                              weights_host,
+                                                              sorted_ids_ref,
+                                                              sorted_weights_ref,
+                                                              sorted_expert_ids_ref,
+                                                              ref_total_tokens_post_pad,
+                                                              num_experts,
+                                                              unit_size);
+        rtn &= ck_tile::check_err(
+            sorted_ids_host, sorted_ids_ref, std::string("OUT Error: Incorrect ids!"), 1e-6, 1e-6);
+        rtn &= ck_tile::check_err(sorted_weights_host,
+                                  sorted_weights_ref,
+                                  std::string("OUT Error: Incorrect w!"),
+                                  1e-6,
+                                  1e-6);
+        rtn &= ck_tile::check_err(sorted_expert_ids_host,
+                                  sorted_expert_ids_ref,
+                                  std::string("OUT Error: Incorrect eid!"),
+                                  1e-6,
+                                  1e-6);
+        if(moe_buf_size)
+        {
+            ck_tile::HostTensor<WeightType> moe_buf_ref({moe_buf_size});
+            rtn &= ck_tile::check_err(
+                moe_buf_host, moe_buf_ref, std::string("OUT Error: Incorrect zero buf!"), 0, 0);
+        }
+        rtn &= ref_total_tokens_post_pad == sorted_id_cnt_host.mData[0];
+    }
+
+    printf("valid:%s\n", rtn ? "y" : "n");
+    fflush(stdout);
+    return rtn;
+}
+
+int main(int argc, char** argv)
+{
+    auto [result, args] = create_args(argc, argv);
+    if(!result)
+        return -1;
+    std::string index_prec  = args.get_str("pr_i");
+    std::string weight_prec = args.get_str("pr_w");
+
+    bool r = true;
+    if(weight_prec.compare("fp32") == 0 && index_prec.compare("int32") == 0)
+    {
+        r &= test_moe_sorting<float, ck_tile::index_t>(args);
+    }
+    return r ? 0 : -1;
+}
--- a/example/ck_tile/13_moe_sorting/moe_sorting_api.cpp
+++ b/example/ck_tile/13_moe_sorting/moe_sorting_api.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "moe_sorting_api.hpp"
+
+#define MOE_SORTING_DISPATCH(unroll_num_)                                                   \
+    constexpr ck_tile::index_t unroll_num = unroll_num_;                                    \
+    using ms_problem     = ck_tile::MoeSortingProblem<index_t, ms_weight_type, unroll_num>; \
+    using kernel         = ck_tile::MoeSortingKernel<ms_problem>;                           \
+    auto kargs           = kernel::MakeKargs(a);                                            \
+    const dim3 grids     = kernel::GridSize(a);                                             \
+    const dim3 blocks    = kernel::BlockSize(a);                                            \
+    const auto lds_bytes = kernel::GetSmemSize(a);                                          \
+    float ave_time       = ck_tile::launch_kernel(                                          \
+        s, ck_tile::make_kernel(kernel{}, grids, blocks, lds_bytes, kargs));          \
+    return ave_time;
+
+float moe_sorting(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_config s)
+{
+    if(t.weight_type == "fp32" && t.index_type == "int32")
+    {
+        if(a.num_experts > 127)
+        {
+            printf("lds size exceed, only support experts <127 \n");
+            return -1;
+        }
+        if(a.moe_buf_bytes % 16)
+        {
+            printf("buf set size %d unaligned, must be multiple of 16\n", a.moe_buf_bytes);
+            return -1;
+        }
+        using index_t              = ck_tile::index_t;
+        using ms_weight_type       = float;
+        index_t smem_io_unroll_num = ck_tile::integer_divide_ceil(a.tokens * a.topk, 64);
+        switch(smem_io_unroll_num)
+        {
+        case(1): {
+            MOE_SORTING_DISPATCH(1);
+        }
+        case(2): {
+            MOE_SORTING_DISPATCH(2);
+        }
+        case(3): {
+            MOE_SORTING_DISPATCH(3);
+        }
+        case(5): {
+            MOE_SORTING_DISPATCH(5);
+        }
+        case(6): {
+            MOE_SORTING_DISPATCH(6);
+        }
+        case(7): {
+            MOE_SORTING_DISPATCH(7);
+        }
+        case(8): {
+            MOE_SORTING_DISPATCH(8);
+        }
+        case(9): {
+            MOE_SORTING_DISPATCH(9);
+        }
+        case(10): {
+            MOE_SORTING_DISPATCH(10);
+        }
+        case(11): {
+            MOE_SORTING_DISPATCH(11);
+        }
+        default: {
+            MOE_SORTING_DISPATCH(4);
+        }
+        }
+    }
+    return -1;
+}
--- a/example/ck_tile/13_moe_sorting/moe_sorting_api.hpp
+++ b/example/ck_tile/13_moe_sorting/moe_sorting_api.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+#include <string>
+#include "ck_tile/core.hpp"
+#include "ck_tile/host.hpp"
+#include "ck_tile/ops/moe_sorting.hpp"
+
+struct moe_sorting_trait
+{
+    std::string index_type;
+    std::string weight_type; // currently always float
+};
+
+struct moe_sorting_args : public ck_tile::MoeSortingHostArgs
+{
+};
+
+float moe_sorting(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_config s);
--- a/example/ck_tile/13_moe_sorting/script/smoke_test.sh
+++ b/example/ck_tile/13_moe_sorting/script/smoke_test.sh
+# #!/bin/sh
+
+EXE=./build/bin/tile_example_moe_sorting
+
+$EXE -t=80 -e=17 -moe_buf_size=16
+$EXE -t=111 -e=117 -moe_buf_size=4
+$EXE -t=1000 -e=55 -moe_buf_size=1024
+$EXE -t=99 -e=120  -moe_buf_size=10244
+$EXE -t=175 -e=64 -k=8
+$EXE -t=65 -e=8 -k=2
+$EXE -t=1 -e=25
+$EXE -t=31 -e=19 -k=15
+$EXE -t=81 -e=37 -k=7
+$EXE -t=23 -e=1 -k=1
+$EXE -t=127 -e=99 -k=19
+$EXE -t=71 -e=11 -k=11
+$EXE -t=1 -e=1 -k=1
+$EXE -t=99 -e=2 -k=1
+$EXE -t=333 -e=99 -k=13
\ No newline at end of file
--- a/example/ck_tile/CMakeLists.txt
+++ b/example/ck_tile/CMakeLists.txt
@@ -12,3 +12,4 @@ add_subdirectory(09_topk_softmax)
 add_subdirectory(10_rmsnorm2d)
 add_subdirectory(11_add_rmsnorm2d_rdquant)
 add_subdirectory(12_smoothquant)
+add_subdirectory(13_moe_sorting)
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -63,13 +63,15 @@ CK_DECLARE_ENV_VAR_BOOL(CK_LOGGING)
 #define __gfx101__
 #endif
 #if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || \
-    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__)
+    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || \
+    defined(__gfx10_3_generic__)
 #define __gfx103__
 #endif
-#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__)
+#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || \
+    defined(__gfx1103__) || defined(__gfx11_generic__)
 #define __gfx11__
 #endif
-#if defined(__gfx1200__) || defined(__gfx1201__)
+#if defined(__gfx1200__) || defined(__gfx1201__) || defined(__gfx12_generic__)
 #define __gfx12__
 #endif


--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp
@@ -381,10 +381,6 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
                        {
                            tildes = {i_ztilde, i_ytilde, i_xtilde};
                        }
-                        else
-                        {
-                            throw std::runtime_error("wrong! only implemented for 2D and 3D now");
-                        }

                        const auto a_grid_desc_ak0_m_ak1 =
                            transform_conv_to_gemm.template MakeADescriptor_AK0_M_AK1<ALayout>(
@@ -750,6 +746,12 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
            }
        }

+        // check number of dimension, only implemented for 2D and 3D now
+        if(NDimSpatial != 2 && NDimSpatial != 3)
+        {
+            return false;
+        }
+
        return true;
    }


--- a/include/ck/utility/amd_buffer_addressing.hpp
+++ b/include/ck/utility/amd_buffer_addressing.hpp
@@ -549,8 +549,10 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
            (is_same<T, half_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (is_same<T, bhalf_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
-            (is_same<T, f8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
-            (is_same<T, bf8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (is_same<T, f8_fnuz_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (is_same<T, bf8_fnuz_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (is_same<T, fp8_storage_t>::value &&
+             (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
        "wrong! not implemented");

@@ -843,8 +845,8 @@ amd_buffer_load_invalid_element_return_zero(const T* p_src_wave,

 #else

-    vector_t tmp = amd_buffer_load_impl<scalar_t, vector_size, coherence>(
-        src_wave_buffer_resource, src_thread_addr_offset, 0);
+    vector_t tmp{amd_buffer_load_impl<scalar_t, vector_size, coherence>(
+        src_wave_buffer_resource, src_thread_addr_offset, 0)};
    return src_thread_element_valid ? tmp : vector_t(0);
 #endif
 }
@@ -873,8 +875,8 @@ amd_buffer_load_invalid_element_return_customized_value(const T* p_src_wave,

    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;

-    vector_t tmp = amd_buffer_load_impl<scalar_t, vector_size, coherence>(
-        src_wave_buffer_resource, src_thread_addr_offset, 0);
+    vector_t tmp{amd_buffer_load_impl<scalar_t, vector_size, coherence>(
+        src_wave_buffer_resource, src_thread_addr_offset, 0)};

    return src_thread_element_valid ? tmp : vector_t(customized_value);
 }

--- a/include/ck/utility/amd_ck_fp8.hpp
+++ b/include/ck/utility/amd_ck_fp8.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/utility/random_gen.hpp"
+#include "ck/utility/type.hpp"
+
+#ifdef CK_USE_FNUZ_FP8
+#define CK_USE_FNUZ_FP8 1
+#else
+#define CK_USE_FNUZ_FP8 0
+#endif
+
+#ifdef CK_USE_OCP_FP8
+#define CK_USE_OCP_FP8 1
+#else
+#define CK_USE_OCP_FP8 0
+#endif
+
+namespace ck {
+
+using f8_fnuz_t  = _BitInt(8);
+using bf8_fnuz_t = unsigned _BitInt(8);
+
+#if(defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx1200__) || \
+    defined(__gfx1201__) || defined(__gfx950__)) &&                                              \
+    __HIP_DEVICE_COMPILE__
+#define CK_FP8_CVT_FAST_PATH 1
+#else
+#define CK_FP8_CVT_FAST_PATH 0
+#endif
+
+#if(defined(__gfx1200__) || defined(__gfx1201__) || defined(__gfx950__)) && __HIP_DEVICE_COMPILE__
+#define CK_OCP_FP8_CVT_FAST_PATH 1
+#else
+#define CK_OCP_FP8_CVT_FAST_PATH 0
+#endif
+
+typedef unsigned char fp8_storage_t;
+
+/**
+ * \brief Describes FP8 interpretation
+ */
+enum class ck_fp8_interpretation_t
+{
+    CK_E4M3_OCP  = 0, // OCP E4M3
+    CK_E5M2_OCP  = 1, // OCP E5M2
+    CK_E4M3_FNUZ = 2, // FP8
+    CK_E5M2_FNUZ = 3, // BF8
+};
+
+/**
+ * \brief Describes saturation behavior
+ */
+enum class ck_saturation_t
+{
+    CK_NOSAT     = 0, // No saturation - replace with NaN or Inf
+    CK_SATFINITE = 1, // Saturate to finite
+};
+
+namespace fp8_impl {
+
+typedef fp8_storage_t fp8x2_storage_t __attribute__((ext_vector_type(2)));
+typedef float float2_t __attribute__((ext_vector_type(2)));
+
+__host__ __device__ static inline constexpr bool fnuz_f8_is_nan(f8_fnuz_t a)
+{
+    return static_cast<unsigned char>(a) == 0x80;
+}
+__host__ __device__ static inline constexpr bool fnuz_bf8_is_nan(bf8_fnuz_t a)
+{
+    return static_cast<unsigned char>(a) == 0x80;
+}
+
+__host__ __device__ static inline constexpr bool ocp_f8_is_nan(fp8_storage_t a)
+{
+    return (a & 0x7f) == 0x7f;
+}
+__host__ __device__ static inline constexpr bool ocp_bf8_is_nan(fp8_storage_t a)
+{
+    return (a & 0x7f) > 0x7c;
+}
+
+// The conversion function is from rocblas
+// https://github.com/ROCm/rocBLAS/blob/9b7f692abe3c54b88d1e77e045a7db7f1f188b69/library/include/internal/rocblas_hip_f8_impl.h#L220
+// This has been modified to handle double types as well
+template <typename T, int wm, int we, bool is_fnuz, bool clip = false>
+__host__ __device__ static inline T cast_from_f8(fp8_storage_t x)
+{
+    constexpr bool is_half   = __hip_internal::is_same<T, _Float16>::value;
+    constexpr bool is_float  = __hip_internal::is_same<T, float>::value;
+    constexpr bool is_double = __hip_internal::is_same<T, double>::value;
+    static_assert(is_half || is_float || is_double, "only half, float and double are supported");
+
+    constexpr int weo = is_half ? 5 : (is_float ? 8 : 11);
+    constexpr int wmo = is_half ? 10 : (is_float ? 23 : 52);
+
+    T fInf, fNegInf, fNaN, fNeg0, fmax, fmin;
+    if constexpr(is_half)
+    {
+        const unsigned short int ihInf    = 0x7C00;
+        const unsigned short int ihNegInf = 0xFC00;
+        const unsigned short int ihNaN    = 0x7C01;
+        const unsigned short int ihNeg0   = 0x8000;
+        /* Max number in e5m2 57344*/
+        const unsigned short int ifmax = 0x7B00;
+        const unsigned short int ifmin = 0xFB00;
+
+        fInf    = bit_cast<_Float16>(ihInf);
+        fNegInf = bit_cast<_Float16>(ihNegInf);
+        fNaN    = bit_cast<_Float16>(ihNaN);
+        fNeg0   = bit_cast<_Float16>(ihNeg0);
+        fmax    = bit_cast<_Float16>(ifmax);
+        fmin    = bit_cast<_Float16>(ifmin);
+    }
+    else if constexpr(is_float)
+    {
+        const unsigned int ifInf    = 0x7F800000;
+        const unsigned int ifNegInf = 0xFF800000;
+        const unsigned int ifNaN    = 0x7F800001;
+        const unsigned int ifNeg0   = 0x80000000;
+        /* Max number in e5m2 57344*/
+        const unsigned int ifmax = 0x47600000;
+        const unsigned int ifmin = 0xC7600000;
+
+        fInf    = bit_cast<float>(ifInf);
+        fNegInf = bit_cast<float>(ifNegInf);
+        fNaN    = bit_cast<float>(ifNaN);
+        fNeg0   = bit_cast<float>(ifNeg0);
+        fmax    = bit_cast<float>(ifmax);
+        fmin    = bit_cast<float>(ifmin);
+    }
+    else if constexpr(is_double)
+    {
+        const unsigned long long ifInf    = 0x7FF0000000000000ull;
+        const unsigned long long ifNegInf = 0xFFF0000000000000ull;
+        const unsigned long long ifNaN    = 0x7FF0000000000001ull;
+        const unsigned long long ifNeg0   = 0x8000000000000000ull;
+        /* Max number in e5m2 57344*/
+        const unsigned long long ifmax = 0x40EC000000000000ull;
+        const unsigned long long ifmin = 0xC0EC000000000000ull;
+
+        fInf    = bit_cast<double>(ifInf);
+        fNegInf = bit_cast<double>(ifNegInf);
+        fNaN    = bit_cast<double>(ifNaN);
+        fNeg0   = bit_cast<double>(ifNeg0);
+        fmax    = bit_cast<double>(ifmax);
+        fmin    = bit_cast<double>(ifmin);
+    }
+
+    if(x == 0)
+    {
+        return 0;
+    }
+
+    unsigned long long sign     = x >> 7;
+    unsigned long long mantissa = x & ((1 << wm) - 1);
+    int exponent                = (x & 0x7F) >> wm;
+    if constexpr(is_fnuz)
+    {
+        if(x == 0x80)
+        {
+            return fNaN;
+        }
+    }
+    else
+    {
+        if(x == 0x80)
+        {
+            return fNeg0;
+        }
+        if constexpr(we == 4)
+        { // e4m3
+            if((x & 0x7F) == 0x7F)
+            {
+                return fNaN;
+            }
+        }
+        else if((x & 0x7C) == 0x7C)
+        { // e5m2
+            if((x & 0x3) == 0)
+            {
+                if constexpr(clip)
+                {
+                    return sign ? fmin : fmax;
+                }
+                return sign ? fNegInf : fInf;
+            }
+            return fNaN;
+        }
+    }
+
+    typename __hip_internal::conditional<
+        sizeof(T) == 2,
+        unsigned short int,
+        typename __hip_internal::conditional<sizeof(T) == 4, unsigned int, unsigned long long>::
+            type>::type retval;
+
+    if constexpr(we == 5 && is_half && !is_fnuz)
+    {
+        retval = x << 8;
+        return bit_cast<T>(retval);
+    }
+
+    const int exp_low_cutoff = (1 << (weo - 1)) - (1 << (we - 1)) + 1 - (is_fnuz ? 1 : 0);
+
+    // subnormal input
+    if(exponent == 0)
+    {
+#if defined(__HIP_DEVICE_COMPILE__) && __HIP_DEVICE_COMPILE__
+        // guaranteed mantissa!=0 since cases 0x0 and 0x80 are handled above
+        int sh = 1 + __clz(mantissa) - (32 - wm);
+#else
+        int sh = 1 + __builtin_clz(mantissa) - (32 - wm);
+#endif
+        mantissa <<= sh;
+        exponent += 1 - sh;
+        mantissa &= ((1ull << wm) - 1);
+    }
+    exponent += exp_low_cutoff - 1;
+    mantissa <<= wmo - wm;
+
+    // subnormal output (occurs when T=half, we=5, negative_zero_nan=true)
+    if(exponent <= 0)
+    {
+        mantissa |= 1 << wmo;
+        mantissa >>= 1 - exponent;
+        exponent = 0;
+    }
+
+    if constexpr(sizeof(T) == 2)
+        retval = (sign << 15) | (exponent << 10) | mantissa;
+    else if constexpr(sizeof(T) == 4)
+        retval = (sign << 31) | (exponent << 23) | mantissa;
+    else
+        retval = (sign << 63) | (static_cast<unsigned long long>(exponent) << 52) | mantissa;
+
+    return bit_cast<T>(retval);
+}
+
+#if CK_FP8_CVT_FAST_PATH
+template <ck_fp8_interpretation_t interpret>
+static __device__ float cast_to_f32_from_f8(fp8_storage_t v)
+{
+    union
+    {
+        unsigned int i32val;
+        unsigned char i8val[4];
+    } val;
+    val.i8val[0] = v;
+
+    static_assert(interpret == ck_fp8_interpretation_t::CK_E4M3_FNUZ ||
+                      interpret == ck_fp8_interpretation_t::CK_E4M3_OCP ||
+                      interpret == ck_fp8_interpretation_t::CK_E5M2_FNUZ ||
+                      interpret == ck_fp8_interpretation_t::CK_E5M2_OCP,
+                  "Only FNUZ and OCP interpretations are supported");
+
+    if constexpr((interpret == ck_fp8_interpretation_t::CK_E4M3_FNUZ) ||
+                 (interpret == ck_fp8_interpretation_t::CK_E4M3_OCP))
+    {
+        return __builtin_amdgcn_cvt_f32_fp8(val.i32val, 0);
+    }
+    else
+    {
+        return __builtin_amdgcn_cvt_f32_bf8(val.i32val, 0);
+    }
+}
+
+template <ck_fp8_interpretation_t interpret>
+static __device__ float2_t cast_to_f32x2_from_f8x2(fp8x2_storage_t v)
+{
+    const auto i16val = bit_cast<uint16_t>(v);
+
+    static_assert(interpret == ck_fp8_interpretation_t::CK_E4M3_FNUZ ||
+                      interpret == ck_fp8_interpretation_t::CK_E4M3_OCP ||
+                      interpret == ck_fp8_interpretation_t::CK_E5M2_FNUZ ||
+                      interpret == ck_fp8_interpretation_t::CK_E5M2_OCP,
+                  "Only FNUZ and OCP interpretations are supported");
+
+    if constexpr((interpret == ck_fp8_interpretation_t::CK_E4M3_FNUZ) ||
+                 (interpret == ck_fp8_interpretation_t::CK_E4M3_OCP))
+    {
+        return __builtin_amdgcn_cvt_pk_f32_fp8(i16val, false);
+    }
+    else
+    {
+        return __builtin_amdgcn_cvt_pk_f32_bf8(i16val, false);
+    }
+}
+
+#endif
+
+} // namespace fp8_impl
+
+template <typename T, index_t N>
+struct non_native_vector_base;
+
+struct f8_ocp_t
+{
+    using data_type = fp8_storage_t;
+    data_type data;
+
+    static constexpr ck_saturation_t default_saturation = ck_saturation_t::CK_SATFINITE;
+    static constexpr ck_fp8_interpretation_t default_interpret =
+        ck_fp8_interpretation_t::CK_E4M3_OCP;
+
+    static constexpr unsigned int we = 4; // exponent width
+    static constexpr unsigned int wm = 3; // mantissa width
+
+    __host__ __device__ constexpr bool operator==(const f8_ocp_t& other) const
+    {
+        return (data == other.data) && (fp8_impl::ocp_f8_is_nan(data) == false); // NaN != NaN
+    }
+
+#if CK_USE_OCP_FP8
+    __host__ __device__ explicit operator float() const
+#else
+    __host__ explicit operator float() const
+#endif
+    {
+#if CK_OCP_FP8_CVT_FAST_PATH
+        return fp8_impl::cast_to_f32_from_f8<default_interpret>(this->data);
+#else
+        return fp8_impl::cast_from_f8<float, wm, we, false>(
+            this->data); // XXX: clip==false must be consistent with operator _Float16
+#endif
+    }
+
+#if CK_USE_OCP_FP8
+    __host__ __device__ explicit operator _Float16() const
+#else
+    __host__ explicit operator _Float16() const
+#endif
+    {
+#if CK_OCP_FP8_CVT_FAST_PATH
+        return static_cast<_Float16>(fp8_impl::cast_to_f32_from_f8<default_interpret>(this->data));
+#else
+        return fp8_impl::cast_from_f8<_Float16, wm, we, false>(
+            this->data); // XXX: clip==false must be consistent with operator float
+#endif
+    }
+};
+
+struct bf8_ocp_t
+{
+    using data_type = fp8_storage_t;
+    data_type data;
+
+    static constexpr ck_saturation_t default_saturation = ck_saturation_t::CK_SATFINITE;
+    static constexpr ck_fp8_interpretation_t default_interpret =
+        ck_fp8_interpretation_t::CK_E5M2_OCP;
+
+    static constexpr unsigned int we = 5; // exponent width
+    static constexpr unsigned int wm = 2; // mantissa width
+
+    __host__ __device__ constexpr bool operator==(const bf8_ocp_t& other) const
+    {
+        return (data == other.data) && (fp8_impl::ocp_bf8_is_nan(data) == false); // NaN != NaN
+    }
+
+#if CK_USE_OCP_FP8
+    __host__ __device__ explicit operator float() const
+
+#else
+    __host__ explicit operator float() const
+#endif
+    {
+#if defined(__gfx950__) || defined(__gfx1200__) || defined(__gfx1201__)
+        return fp8_impl::cast_to_f32_from_f8<default_interpret>(this->data);
+#else
+        return fp8_impl::cast_from_f8<float, wm, we, false>(
+            this->data); // XXX: clip==false must be consistent with operator _Float16
+#endif
+    }
+
+#if CK_USE_OCP_FP8
+    __host__ __device__ explicit operator _Float16() const
+#else
+    __host__ explicit operator _Float16() const
+#endif
+    {
+#if defined(__gfx950__) || defined(__gfx1200__) || defined(__gfx1201__)
+        return static_cast<_Float16>(fp8_impl::cast_to_f32_from_f8<default_interpret>(this->data));
+#else
+        return fp8_impl::cast_from_f8<_Float16, wm, we, false>(
+            this->data); // XXX: clip==false must be consistent with operator float
+#endif
+    }
+};
+
+template <index_t N>
+struct non_native_vector_base<f8_ocp_t, N>
+{
+    using data_t = f8_ocp_t::data_type;
+    static_assert(sizeof(f8_ocp_t) == sizeof(data_t),
+                  "non_native_vector_base storage size mismatch");
+    using data_v = data_t __attribute__((ext_vector_type(sizeof(data_t) * N)));
+    using type   = non_native_vector_base<f8_ocp_t, N>;
+
+    data_v d; // storage vector
+
+    __host__ __device__ non_native_vector_base() = default;
+    __host__ __device__ non_native_vector_base(data_t a) : d{a} {}
+    __host__ __device__ non_native_vector_base(f8_ocp_t f) : non_native_vector_base(f.data) {}
+    __host__ __device__ non_native_vector_base(data_v v) : d{v} {}
+
+    __host__ __device__ operator data_v() const { return d; }
+};
+
+template <>
+struct non_native_vector_base<f8_ocp_t, 1>
+{
+    using data_t = f8_ocp_t::data_type;
+    using data_v = data_t __attribute__((ext_vector_type(sizeof(data_t))));
+    using type   = non_native_vector_base<f8_ocp_t, 1>;
+
+    data_v d; // storage vector
+
+    __host__ __device__ non_native_vector_base() = default;
+    __host__ __device__ non_native_vector_base(data_t a) : d{a} {}
+    __host__ __device__ non_native_vector_base(f8_ocp_t f) : non_native_vector_base(f.data) {}
+    __host__ __device__ non_native_vector_base(data_v v) : d{v} {}
+
+    __host__ __device__ operator data_v() const { return d; }
+    __host__ __device__ operator data_t() const { return d[0]; }
+    __host__ __device__ operator f8_ocp_t() const { return f8_ocp_t{d[0]}; }
+};
+
+template <>
+struct non_native_vector_base<f8_ocp_t, 2>
+{
+    using data_t = f8_ocp_t::data_type;
+    using type   = non_native_vector_base<f8_ocp_t, 2>;
+    using data_v = fp8_impl::fp8x2_storage_t; // type of storage vector
+    data_v d;                                 // storage vector
+
+    __host__ __device__ non_native_vector_base() = default;
+    __host__ __device__ non_native_vector_base(data_t a) : d{a} {}
+    __host__ __device__ non_native_vector_base(f8_ocp_t f) : non_native_vector_base(f.data) {}
+    __host__ __device__ non_native_vector_base(data_v v) : d{v} {}
+
+    __host__ __device__ operator data_v() const { return d; }
+
+    using float2_t = fp8_impl::float2_t;
+
+#if CK_USE_OCP_FP8
+    __host__ __device__ explicit operator float2_t() const
+#else
+    __host__ explicit operator float2_t() const
+#endif
+    {
+#if CK_OCP_FP8_CVT_FAST_PATH
+        return fp8_impl::cast_to_f32x2_from_f8x2<f8_ocp_t::default_interpret>(d);
+#else
+        return float2_t{fp8_impl::cast_from_f8<float, f8_ocp_t::wm, f8_ocp_t::we, false>(d[0]),
+                        fp8_impl::cast_from_f8<float, f8_ocp_t::wm, f8_ocp_t::we, false>(d[1])};
+#endif
+    }
+};
+
+template <index_t N>
+struct non_native_vector_base<bf8_ocp_t, N>
+{
+    using data_t = bf8_ocp_t::data_type;
+    using data_v = data_t __attribute__((ext_vector_type(sizeof(data_t) * N)));
+    using type   = non_native_vector_base<bf8_ocp_t, N>;
+
+    data_v d; // storage vector
+
+    __host__ __device__ non_native_vector_base() = default;
+    __host__ __device__ non_native_vector_base(data_t a) : d{a} {}
+    __host__ __device__ non_native_vector_base(data_v v) : d{v} {}
+
+    __host__ __device__ operator data_v() const { return d; }
+};
+
+template <>
+struct non_native_vector_base<bf8_ocp_t, 1>
+{
+    using data_t = bf8_ocp_t::data_type;
+    using data_v = data_t __attribute__((ext_vector_type(sizeof(data_t))));
+    using type   = non_native_vector_base<bf8_ocp_t, 1>;
+
+    data_v d; // storage vector
+
+    __host__ __device__ non_native_vector_base() = default;
+    __host__ __device__ non_native_vector_base(data_t a) : d{a} {}
+    __host__ __device__ non_native_vector_base(bf8_ocp_t f) : non_native_vector_base(f.data) {}
+    __host__ __device__ non_native_vector_base(data_v v) : d{v} {}
+
+    __host__ __device__ operator data_v() const { return d; }
+    __host__ __device__ operator data_t() const { return d[0]; }
+    __host__ __device__ operator bf8_ocp_t() const { return bf8_ocp_t{d[0]}; }
+};
+
+template <typename T>
+__host__ __device__ static inline constexpr bool fp8_is_nan(T);
+
+template <>
+__host__ __device__ inline constexpr bool fp8_is_nan(f8_ocp_t a)
+{
+    return fp8_impl::ocp_f8_is_nan(a.data);
+}
+template <>
+__host__ __device__ inline constexpr bool fp8_is_nan(bf8_ocp_t a)
+{
+    return fp8_impl::ocp_bf8_is_nan(a.data);
+}
+template <>
+__host__ __device__ inline constexpr bool fp8_is_nan(f8_fnuz_t a)
+{
+    return fp8_impl::fnuz_f8_is_nan(a);
+}
+template <>
+__host__ __device__ inline constexpr bool fp8_is_nan(bf8_fnuz_t a)
+{
+    return fp8_impl::fnuz_bf8_is_nan(a);
+}
+
+template <typename T,
+          std::enable_if_t<std::is_same_v<T, bf8_ocp_t> || std::is_same_v<T, f8_ocp_t> ||
+                               std::is_same_v<T, bf8_fnuz_t> || std::is_same_v<T, f8_fnuz_t>,
+                           bool> = true>
+__host__ __device__ static inline constexpr bool fp8_is_inf(T)
+{
+    return false;
+}
+template <>
+__host__ __device__ inline constexpr bool fp8_is_inf(bf8_ocp_t a)
+{
+    return (a.data & 0x7f) == 0x7c;
+}
+
+namespace fp8_impl {
+
+// Assertions to check for supported conversion types
+#define __assert_ocp_support(interp)                                               \
+    {                                                                              \
+        if(interp != ck_fp8_interpretation_t::CK_E4M3_OCP &&                       \
+           interp != ck_fp8_interpretation_t::CK_E5M2_OCP)                         \
+        {                                                                          \
+            __hip_assert(false && "type is unsupported by current target device"); \
+        }                                                                          \
+    }
+#define __assert_fnuz_support(interp)                                              \
+    {                                                                              \
+        if(interp != ck_fp8_interpretation_t::CK_E4M3_FNUZ &&                      \
+           interp != ck_fp8_interpretation_t::CK_E5M2_FNUZ)                        \
+        {                                                                          \
+            __hip_assert(false && "type is unsupported by current target device"); \
+        }                                                                          \
+    }
+
+__host__ __device__ static inline void
+__is_interpret_supported([[maybe_unused]] ck_fp8_interpretation_t interp)
+{
+#if defined(__HIP_DEVICE_COMPILE__) && __HIP_DEVICE_COMPILE__
+#if CK_USE_OCP_FP8
+    __assert_ocp_support(interp);
+#endif
+#if CK_USE_FNUZ_FP8
+    __assert_fnuz_support(interp);
+#endif
+#endif
+}
+
+#if CK_FP8_CVT_FAST_PATH
+// The conversion function is from rocblas
+// https://github.com/ROCm/rocBLAS/blob/9b7f692abe3c54b88d1e77e045a7db7f1f188b69/library/include/internal/rocblas_float8.h#L79
+template <ck_fp8_interpretation_t interpret, bool saturate, bool stochastic_rounding = false>
+static __device__ fp8_storage_t cast_to_f8_from_f32(float v, unsigned int rng = 0)
+{
+    fp8_storage_t i8data;
+    union
+    {
+        float fval;
+        unsigned int i32val;
+        unsigned char i8val[4]; // NOTE: not endian independent
+    } val;
+
+    unsigned int ival = 0;
+    val.fval          = v;
+
+    if constexpr(saturate)
+    {
+        if constexpr(interpret == ck_fp8_interpretation_t::CK_E4M3_FNUZ)
+        {
+            if((val.i32val & 0x7F800000) != 0x7F800000)
+            { /// propagate NAN/INF, no clipping
+                val.fval = __builtin_amdgcn_fmed3f(val.fval, 240.0, -240.0);
+            }
+        }
+        else if constexpr(interpret == ck_fp8_interpretation_t::CK_E4M3_OCP)
+        { // OCP type
+            if((val.i32val & 0x7F800000) != 0x7F800000)
+            { /// propagate NAN/INF, no clipping
+                val.fval = __builtin_amdgcn_fmed3f(val.fval, 448.0, -448.0);
+            }
+        }
+        else
+        {
+            if((val.i32val & 0x7F800000) != 0x7F800000)
+            { /// propagate NAN/INF, no clipping
+                val.fval = __builtin_amdgcn_fmed3f(val.fval, 57344.0, -57344.0);
+            }
+        }
+    }
+
+    if constexpr(stochastic_rounding)
+    {
+        ival       = (interpret == ck_fp8_interpretation_t::CK_E4M3_FNUZ) ||
+                       (interpret == ck_fp8_interpretation_t::CK_E4M3_OCP)
+                         ? __builtin_amdgcn_cvt_sr_fp8_f32(val.fval, rng, ival, 0)
+                         : __builtin_amdgcn_cvt_sr_bf8_f32(val.fval, rng, ival, 0); // 0 pos
+        val.i32val = ival;
+        i8data     = val.i8val[0]; // little endian
+    }
+    else
+    { // RNE CVT
+        ival       = (interpret == ck_fp8_interpretation_t::CK_E4M3_FNUZ) ||
+                       (interpret == ck_fp8_interpretation_t::CK_E4M3_OCP)
+                         ? __builtin_amdgcn_cvt_pk_fp8_f32(val.fval, val.fval, ival, false)
+                         : __builtin_amdgcn_cvt_pk_bf8_f32(val.fval,
+                                                     val.fval,
+                                                     ival,
+                                                     false); // false -> WORD0
+        val.i32val = ival;
+        i8data     = val.i8val[0];
+    }
+    return i8data;
+}
+#endif // CK_FP8_CVT_FAST_PATH
+
+// The conversion function is from rocblas
+// https://github.com/ROCm/rocBLAS/blob/9b7f692abe3c54b88d1e77e045a7db7f1f188b69/library/include/internal/rocblas_hip_f8_impl.h#L39
+// This has been modified to add double types conversion as well
+template <typename T, int wm, int we, bool is_fnuz, bool clip = false, bool stoch = false>
+__host__ __device__ static inline fp8_storage_t cast_to_f8(T _x, unsigned int rng = 0)
+{
+    constexpr bool is_half   = __hip_internal::is_same<T, _Float16>::value;
+    constexpr bool is_float  = __hip_internal::is_same<T, float>::value;
+    constexpr bool is_double = __hip_internal::is_same<T, double>::value;
+    static_assert(is_half || is_float || is_double,
+                  "Only half, float and double can be cast to f8");
+
+    constexpr int mfmt = (sizeof(T) == 8) ? 52 : ((sizeof(T) == 4) ? 23 : 10);
+
+    using T_bitwise = typename __hip_internal::conditional<
+        sizeof(T) == 2,
+        unsigned short int,
+        typename __hip_internal::conditional<sizeof(T) == 4, unsigned int, unsigned long long>::
+            type>::type;
+    T_bitwise x_bitwise = bit_cast<T_bitwise>(_x);
+
+    unsigned long long x{x_bitwise};
+
+    unsigned long long head, mantissa;
+    int exponent, bias;
+    unsigned int sign;
+    unsigned long long fInf, mask;
+
+    if constexpr(sizeof(T) == 8)
+    {
+        head     = x & 0xFFF0000000000000ull;
+        mantissa = x & 0xFFFFFFFFFFFFFull;
+        exponent = (head >> 52) & 0x7FF;
+        sign     = head >> 63;
+        bias     = 1023;
+        fInf     = 0x7FF0000000000000ull;
+        mask     = 0x7FFFFFFFFFFFFFFFull;
+    }
+    else if constexpr(sizeof(T) == 4)
+    {
+        head     = x & 0xFF800000;
+        mantissa = x & 0x7FFFFF;
+        exponent = (head >> 23) & 0xFF;
+        sign     = head >> 31;
+        bias     = 127;
+        fInf     = 0x7F800000;
+        mask     = 0x7FFFFFFF;
+    }
+    else
+    {
+        head     = x & 0xFC00;
+        mantissa = x & 0x3FF;
+        exponent = (head >> 10) & 0x1F;
+        sign     = head >> 15;
+        bias     = 15;
+        fInf     = 0x7C00;
+        mask     = 0x7FFF;
+    }
+    unsigned int signed_inf = 0;
+    unsigned int nan        = 0;
+    if constexpr(is_fnuz)
+    {
+        signed_inf = clip ? ((sign << 7) + 0x7f) : 0x80;
+        nan        = 0x80;
+    }
+    else
+    {
+        if constexpr(we == 4)
+        { // e4m3
+            signed_inf = (sign << 7) + (clip ? 0x7e : 0x7f);
+        }
+        else
+        { // e5m2
+            signed_inf = (sign << 7) + (clip ? 0x7b : 0x7c);
+        }
+        nan = (sign << 7) + 0x7f;
+    }
+    // Max values
+    unsigned long long ifmax = 0;
+    if constexpr(sizeof(T) == 8)
+    {
+        if constexpr(we == 5)
+        { // 57344
+            ifmax = 0x40EC000000000000ull;
+        }
+        else
+        {
+            if constexpr(is_fnuz)
+            { // 240
+                ifmax = 0x406E000000000000ull;
+            }
+            else
+            { // 448
+                ifmax = 0x407C000000000000ull;
+            }
+        }
+    }
+    else if(sizeof(T) == 4)
+    {
+        if constexpr(we == 5)
+        {
+            ifmax = 0x47600000;
+        }
+        else
+        {
+            if constexpr(is_fnuz)
+            {
+                ifmax = 0x43700000;
+            }
+            else
+            {
+                ifmax = 0x43E00000;
+            }
+        }
+    }
+    else
+    {
+        if constexpr(we == 5)
+        {
+            ifmax = 0x7B00;
+        }
+        else
+        {
+            if constexpr(is_fnuz)
+            {
+                ifmax = 0x5B80;
+            }
+            else
+            {
+                ifmax = 0x5F00;
+            }
+        }
+    }
+    // Deal with inf and NaNs
+    if((x & fInf) == fInf)
+    {
+        if constexpr(is_fnuz)
+            return signed_inf;
+
+        return mantissa != 0 ? nan : signed_inf;
+    }
+
+    if((x & mask) > ifmax)
+    {
+        return signed_inf;
+    }
+
+    if(x == 0)
+    {
+        return 0;
+    }
+
+    // First need to check if it is normal or denorm as there is a difference of
+    // implicit 1 Then need to adjust the exponent to align with the F8 exponent,
+    // in the meanwhile, shift The mantissa. Then for stochastic rounding, add rng
+    // to mantissa and truncate. And for RNE, no need to add rng. Then probably
+    // need to check whether there is carry and adjust exponent and mantissa again
+
+    // For IEEE bias mode, the bias is 2^(k-1) -1 where k is the width of exponent
+    // bits
+    const int f8_bias                  = (1 << (we - 1)) - 1 + (is_fnuz ? 1 : 0);
+    const int f8_denormal_act_exponent = 1 - f8_bias; // actual exponent of f8 denormal
+    // act_exponent is the actual exponent of fp32/fp16 (after subtracting bias)
+    // f8_exponent is the converted f8 exponent with bias encoding
+    // exponent_diff is the diff between fp32/fp16 exponent and f8 exponent,
+    // the difference needs to be adjusted and mantissa shifted
+    int act_exponent, f8_exponent, exponent_diff;
+
+    if(exponent == 0)
+    { // fp32/fp16 is in denormal.
+        /* fp32 denormal is below 2^-127 so it is usually not a concern here, we
+    mostly concern fp16 here. In this case, f8 is usually in denormal. But there
+    could be exceptions. fp16 denormal has exponent bias 15 while bf8 with NANOO has
+    exponent bias 16. It means that there are some numbers in fp16 denormal but they
+    are bf8 (NANOO) normals - smallest bf8 (NANOO) normal is 2^-15. fp16 numbers
+    where exponent==0 (actual exponent -14) and highest bit of mantissa is 1 are bf8
+    (NANOO) normal. In this case, the fp16 mantissa should be shift left by 1  */
+        act_exponent  = exponent - bias + 1;
+        exponent_diff = f8_denormal_act_exponent -
+                        act_exponent; // actual exponent is exponent-bias+1 as it is denormal
+    }
+    else
+    { // fp32/fp16 is normal with implicit 1
+        act_exponent = exponent - bias;
+        if(act_exponent <= f8_denormal_act_exponent)
+        {
+            /* This is the case where fp32/fp16 is normal but it is in f8 denormal
+      range. For example fp8 nanoo mode, denormal exponent is -7, but if the fp32/fp16
+      actual exponent is -7, it is actually larger due to the implicit 1,
+      Therefore it needs to be adjust to -6 and mantissa shift right by 1.
+      So for fp32/fp16, exponent -8 is the cut point to convert to fp8 nanoo */
+            exponent_diff = f8_denormal_act_exponent - act_exponent;
+        }
+        else
+        {                      // both fp32/fp16 and f8 are in normal range
+            exponent_diff = 0; // exponent_diff=0 does not mean there is no difference
+                               // for this case, act_exponent could be larger. Just
+                               // that it does not need shift mantissa
+        }
+        mantissa += (1ull << mfmt); // Add the implicit 1 into mantissa
+    }
+
+    bool midpoint = (mantissa & ((1ull << (mfmt - wm + exponent_diff)) - 1)) ==
+                    (1ull << (mfmt - wm + exponent_diff - 1));
+    /* This part is a bit tricky. The judgment of whether it is a tie needs to be
+  done before we shift right as shift right could rip off some residual part and
+  make something not midpoint look like midpoint. For example, the fp16 number
+  0x1002 (0 00100 0000000010), it is larger than midpoint, but after shift right
+  by 4 bits, it would look like midpoint.
+  */
+
+    if(exponent_diff > 0)
+        mantissa >>= exponent_diff;
+    else if(exponent_diff == -1)
+        mantissa <<= -exponent_diff;
+    bool implicit_one = mantissa & (1ull << mfmt);
+    // if there is no implicit 1, it  means the f8 is denormal and need to adjust
+    // to denorm exponent
+    f8_exponent =
+        (act_exponent + exponent_diff) /*actual f8 exponent*/ + f8_bias - (implicit_one ? 0 : 1);
+
+    // Now we have the exponent and mantissa adjusted
+    unsigned long long drop_mask = (1ull << (mfmt - wm)) - 1;
+    bool odd =
+        mantissa & (1ull << (mfmt - wm)); // if the least significant bit that is not truncated is 1
+    mantissa +=
+        (stoch ? rng : (midpoint ? (odd ? mantissa : mantissa - 1ull) : mantissa)) & drop_mask;
+
+    // Now we deal with overflow
+    if(f8_exponent == 0)
+    {
+        if((1ull << mfmt) & mantissa)
+        {
+            f8_exponent = 1; // denormal overflow to become normal, promote exponent
+        }
+    }
+    else
+    {
+        if((1ull << (mfmt + 1)) & mantissa)
+        {
+            mantissa >>= 1;
+            f8_exponent++;
+        }
+    }
+
+    mantissa >>= (mfmt - wm);
+
+    // above range: quantize to maximum possible float of the same sign
+    const int max_exp = (1 << we) - 1;
+    if(f8_exponent > max_exp)
+    {
+        if constexpr(clip)
+        {
+            mantissa    = (1 << wm) - 1;
+            f8_exponent = max_exp;
+        }
+        else
+        {
+            return signed_inf;
+        }
+    }
+
+    if(f8_exponent == 0 && mantissa == 0)
+        return is_fnuz ? 0 : (sign << 7);
+    mantissa &= (1 << wm) - 1;
+    return (sign << 7) | (f8_exponent << wm) | mantissa;
+}
+
+/**
+ * \brief convert float to @p fp8_storage_t
+ *
+ * \tparam interp interpretation of fp8
+ * \tparam sat saturation of fp8
+ * \param f float number
+ * \return fp8_storage_t
+ */
+template <ck_fp8_interpretation_t interp,
+          ck_saturation_t sat      = ck_saturation_t::CK_SATFINITE,
+          bool stochastic_rounding = false>
+#if CK_FP8_CVT_FAST_PATH
+__host__ __device__ static inline fp8_storage_t cvt_float_to_fp8(const float f)
+{
+    __is_interpret_supported(interp);
+    uint32_t rng = 0;
+    if constexpr(stochastic_rounding)
+    {
+        constexpr int seed = 1254739;
+        rng                = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&f), f);
+    }
+    return cast_to_f8_from_f32<interp, sat == ck_saturation_t::CK_SATFINITE, stochastic_rounding>(
+        f, rng);
+#else
+#if CK_USE_OCP_FP8
+__host__ __device__ static inline fp8_storage_t cvt_float_to_fp8(const float f)
+{
+#else
+__host__ static inline fp8_storage_t cvt_float_to_fp8(const float f)
+{
+#endif
+    uint32_t rng = 0;
+    if constexpr(stochastic_rounding)
+    {
+        constexpr int seed = 1254739;
+        rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&f), f);
+    }
+
+    if constexpr(interp == ck_fp8_interpretation_t::CK_E4M3_FNUZ)
+    {
+        return cast_to_f8<float,
+                          3,
+                          4,
+                          true,
+                          sat == ck_saturation_t::CK_SATFINITE,
+                          stochastic_rounding>(f, rng);
+    }
+    else if constexpr(interp == ck_fp8_interpretation_t::CK_E5M2_FNUZ)
+    {
+        return cast_to_f8<float,
+                          2,
+                          5,
+                          true,
+                          sat == ck_saturation_t::CK_SATFINITE,
+                          stochastic_rounding>(f, rng);
+    }
+    else if constexpr(interp == ck_fp8_interpretation_t::CK_E4M3_OCP)
+    {
+        return cast_to_f8<float,
+                          3,
+                          4,
+                          false,
+                          sat == ck_saturation_t::CK_SATFINITE,
+                          stochastic_rounding>(f, rng);
+    }
+    else if constexpr(interp == ck_fp8_interpretation_t::CK_E5M2_OCP)
+    {
+        return cast_to_f8<float,
+                          2,
+                          5,
+                          false,
+                          sat == ck_saturation_t::CK_SATFINITE,
+                          stochastic_rounding>(f, rng);
+    }
+    else
+    {
+        __hip_assert(false && "FP8 type is not supported by current target device");
+        return 0;
+    }
+#endif // CK_FP8_CVT_FAST_PATH
+}
+
+/**
+ * \brief convert _Float16 to @p fp8_storage_t
+ *
+ * \tparam sat saturation of fp8
+ * \tparam interp interpretation of fp8
+ * \tparam stochastic_rounding switch between RNE and SR
+ * \param x _Float16 value
+ * \return fp8_storage_t
+ */
+template <ck_fp8_interpretation_t interp,
+          ck_saturation_t sat      = ck_saturation_t::CK_SATFINITE,
+          bool stochastic_rounding = false>
+#if CK_FP8_CVT_FAST_PATH || CK_USE_OCP_FP8
+__host__ __device__ static inline fp8_storage_t cvt_half_t_to_fp8(const _Float16 x)
+#else
+__host__ static inline fp8_storage_t cvt_half_t_to_fp8(const _Float16 x)
+#endif
+{
+    return cvt_float_to_fp8<interp, sat, stochastic_rounding>(static_cast<float>(x));
+}
+
+} // namespace fp8_impl
+
+// Declare a template function for fp8 conversion using RNE
+template <typename Y, typename X>
+__host__ __device__ constexpr Y f8_convert_rne(X x);
+
+// convert fp32 to fp8 with rounding to nearest even
+template <>
+inline __host__ __device__ f8_ocp_t f8_convert_rne<f8_ocp_t, float>(float x)
+{
+    return f8_ocp_t{
+        fp8_impl::cvt_float_to_fp8<f8_ocp_t::default_interpret, f8_ocp_t::default_saturation>(x)};
+}
+
+// convert fp32 to bf8 with rounding to nearest even
+template <>
+inline __host__ __device__ bf8_ocp_t f8_convert_rne<bf8_ocp_t, float>(float x)
+{
+    return bf8_ocp_t{
+        fp8_impl::cvt_float_to_fp8<bf8_ocp_t::default_interpret, bf8_ocp_t::default_saturation>(x)};
+}
+
+// convert _Float16 to fp8 with rounding to nearest even
+template <>
+inline __host__ __device__ f8_ocp_t f8_convert_rne<f8_ocp_t, _Float16>(_Float16 x)
+{
+    return f8_ocp_t{
+        fp8_impl::cvt_half_t_to_fp8<f8_ocp_t::default_interpret, f8_ocp_t::default_saturation>(x)};
+}
+
+template <>
+inline __host__ __device__ bf8_ocp_t f8_convert_rne<bf8_ocp_t, _Float16>(_Float16 x)
+{
+    return bf8_ocp_t{
+        fp8_impl::cvt_half_t_to_fp8<bf8_ocp_t::default_interpret, bf8_ocp_t::default_saturation>(
+            x)};
+}
+
+// Declare a template function for fp8 conversion using RNE
+template <typename Y, typename X>
+__host__ __device__ constexpr Y f8_convert_sr(X x);
+
+// convert fp32 to fp8 with stochastic rounding
+template <>
+inline __host__ __device__ f8_ocp_t f8_convert_sr<f8_ocp_t, float>(float x)
+{
+    return f8_ocp_t{
+        fp8_impl::cvt_float_to_fp8<f8_ocp_t::default_interpret, f8_ocp_t::default_saturation, true>(
+            x)};
+}
+
+// convert fp32 to bf8 with stochastic rounding
+template <>
+inline __host__ __device__ bf8_ocp_t f8_convert_sr<bf8_ocp_t, float>(float x)
+{
+    return bf8_ocp_t{fp8_impl::cvt_float_to_fp8<bf8_ocp_t::default_interpret,
+                                                bf8_ocp_t::default_saturation,
+                                                true>(x)};
+}
+
+// convert _Float16 to fp8 with stochastic rounding
+template <>
+inline __host__ __device__ f8_ocp_t f8_convert_sr<f8_ocp_t, _Float16>(_Float16 x)
+{
+    return f8_ocp_t{fp8_impl::cvt_half_t_to_fp8<f8_ocp_t::default_interpret,
+                                                f8_ocp_t::default_saturation,
+                                                true>(x)};
+}
+
+// convert _Float16 to bf8 with stochastic rounding
+template <>
+inline __host__ __device__ bf8_ocp_t f8_convert_sr<bf8_ocp_t, _Float16>(_Float16 x)
+{
+    return bf8_ocp_t{fp8_impl::cvt_half_t_to_fp8<bf8_ocp_t::default_interpret,
+                                                 bf8_ocp_t::default_saturation,
+                                                 true>(x)};
+}
+
+#if CK_USE_OCP_FP8
+using f8_t  = f8_ocp_t;
+using bf8_t = bf8_ocp_t;
+#define CK_FP8_TYPE_FNUZ 0
+#define CK_FP8_TYPE_OCP 1
+#else
+using f8_t = f8_fnuz_t;
+using bf8_t = bf8_fnuz_t;
+#define CK_FP8_TYPE_FNUZ 1
+#define CK_FP8_TYPE_OCP 0
+#endif
+
+} // namespace ck