Merge branch 'gemm_bf16_sk_muozturk' of...

Merge branch 'gemm_bf16_sk_muozturk' of https://github.com/ROCm/composable_kernel into gemm_bf16_sk_muozturk

Merge branch 'gemm_bf16_sk_muozturk' of...
Merge branch 'gemm_bf16_sk_muozturk' of https://github.com/ROCm/composable_kernel into gemm_bf16_sk_muozturk
eca84f93 · root · 6f210155 · c256f018 · eca84f93 · eca84f93
Commit eca84f93 authored Dec 19, 2024 by root
20 changed files
--- a/codegen/test/common.hpp
+++ b/codegen/test/common.hpp
--- a/codegen/test/rtc/CMakeLists.txt
+++ b/codegen/test/rtc/CMakeLists.txt
+find_package(hip)
 file(GLOB RTC_SOURCES CONFIGURE_DEPENDS src/*.cpp)
 add_library(ck_rtc ${RTC_SOURCES})
 target_include_directories(ck_rtc PUBLIC include)
 target_link_libraries(ck_rtc PUBLIC hip::host)
+target_link_libraries(ck_rtc PUBLIC -lstdc++fs)
--- a/codegen/test/rtc/include/rtc/compile_kernel.hpp
+++ b/codegen/test/rtc/include/rtc/compile_kernel.hpp
@@ -2,14 +2,14 @@
 #define GUARD_HOST_TEST_RTC_INCLUDE_RTC_COMPILE_KERNEL
 #include <rtc/kernel.hpp>
-#include <ck/filesystem.hpp>
+#include <rtc/filesystem.hpp>
 #include <string>
 namespace rtc {
 struct src_file
 {
-    CK::fs::path path;
+    fs::path path;
    std::string_view content;
 };

--- a/codegen/test/rtc/include/rtc/filesystem.hpp
+++ b/codegen/test/rtc/include/rtc/filesystem.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+#ifndef GUARD_TEST_HOST_RTC_FILESYSTEM_HPP
+#define GUARD_TEST_HOST_RTC_FILESYSTEM_HPP // Picks <filesystem> or <experimental/filesystem> and exposes the result as rtc::fs.
+#include <string>
+#include <string_view>
+// clang-format off
+#if defined(CPPCHECK) // when CPPCHECK is defined, report both variants as available
+  #define RTC_HAS_FILESYSTEM 1
+  #define RTC_HAS_FILESYSTEM_TS 1
+#elif defined(_WIN32) // Windows: decide from the MSVC compiler version
+  #if _MSC_VER >= 1920 // 1920 is the first Visual Studio 2019 toolset: use <filesystem>
+    #define RTC_HAS_FILESYSTEM 1
+    #define RTC_HAS_FILESYSTEM_TS 0
+  #elif _MSC_VER >= 1900 // 1900..1919 (Visual Studio 2015/2017): use <experimental/filesystem>
+    #define RTC_HAS_FILESYSTEM 0
+    #define RTC_HAS_FILESYSTEM_TS 1
+  #else // older MSVC: neither header is used, the #error below fires
+    #define RTC_HAS_FILESYSTEM 0
+    #define RTC_HAS_FILESYSTEM_TS 0
+  #endif
+#elif defined(__has_include) // other compilers: probe for the headers directly
+  #if __has_include(<filesystem>) && __cplusplus >= 201703L // standard header needs C++17
+    #define RTC_HAS_FILESYSTEM 1
+  #else
+    #define RTC_HAS_FILESYSTEM 0
+  #endif
+  #if __has_include(<experimental/filesystem>) && __cplusplus >= 201103L // TS header accepted from C++11 on
+    #define RTC_HAS_FILESYSTEM_TS 1
+  #else
+    #define RTC_HAS_FILESYSTEM_TS 0
+  #endif
+#else // no __has_include: no way to probe, so report nothing available
+  #define RTC_HAS_FILESYSTEM 0
+  #define RTC_HAS_FILESYSTEM_TS 0
+#endif
+// clang-format on
+#if RTC_HAS_FILESYSTEM // both flags may be 1; the standard header is preferred
+#include <filesystem>
+#elif RTC_HAS_FILESYSTEM_TS
+#include <experimental/filesystem>
+#else
+#error "No filesystem include available"
+#endif
+namespace rtc {
+#if RTC_HAS_FILESYSTEM // rtc::fs aliases the namespace of whichever header was included above
+namespace fs = ::std::filesystem;
+#elif RTC_HAS_FILESYSTEM_TS
+namespace fs = ::std::experimental::filesystem;
+#endif
+} // namespace rtc
+#endif // GUARD_TEST_HOST_RTC_FILESYSTEM_HPP
--- a/codegen/test/rtc/include/rtc/hip.hpp
+++ b/codegen/test/rtc/include/rtc/hip.hpp
@@ -4,6 +4,7 @@
 #include <hip/hip_runtime_api.h>
 #include <memory>
 #include <string>
+#include <stdexcept>
 namespace rtc {

--- a/codegen/test/rtc/include/rtc/tmp_dir.hpp
+++ b/codegen/test/rtc/include/rtc/tmp_dir.hpp
@@ -2,13 +2,13 @@
 #define GUARD_HOST_TEST_RTC_INCLUDE_RTC_TMP_DIR
 #include <string>
-#include <ck/filesystem.hpp>
+#include <rtc/filesystem.hpp>
 namespace rtc {
 struct tmp_dir
 {
-    CK::fs::path path;
+    fs::path path;
    tmp_dir(const std::string& prefix = "");
    void execute(const std::string& cmd) const;

--- a/codegen/test/rtc/src/compile_kernel.cpp
+++ b/codegen/test/rtc/src/compile_kernel.cpp
-#include "rtc/hip.hpp"
+#include <rtc/hip.hpp>
 #include <rtc/compile_kernel.hpp>
 #include <rtc/tmp_dir.hpp>
 #include <stdexcept>
@@ -70,9 +70,9 @@ kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options
    for(const auto& src : srcs)
    {
-        CK::fs::path full_path   = td.path / src.path;
+        fs::path full_path   = td.path / src.path;
-        CK::fs::path parent_path = full_path.parent_path();
+        fs::path parent_path = full_path.parent_path();
-        CK::fs::create_directories(parent_path);
+        fs::create_directories(parent_path);
        write_string(full_path.string(), src.content);
        if(src.path.extension().string() == ".cpp")
        {
@@ -86,7 +86,7 @@ kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options
    td.execute(compiler() + options.flags);
    auto out_path = td.path / out;
-    if(not CK::fs::exists(out_path))
+    if(not fs::exists(out_path))
        throw std::runtime_error("Output file missing: " + out);
    auto obj = read_buffer(out_path.string());

--- a/codegen/test/rtc/src/tmp_dir.cpp
+++ b/codegen/test/rtc/src/tmp_dir.cpp
@@ -31,10 +31,10 @@ std::string unique_string(const std::string& prefix)
 }
 tmp_dir::tmp_dir(const std::string& prefix)
-    : path(CK::fs::temp_directory_path() /
+    : path(fs::temp_directory_path() /
           unique_string(prefix.empty() ? "ck-rtc" : "ck-rtc-" + prefix))
 {
-    CK::fs::create_directories(this->path);
+    fs::create_directories(this->path);
 }
 void tmp_dir::execute(const std::string& cmd) const
@@ -43,6 +43,6 @@ void tmp_dir::execute(const std::string& cmd) const
    std::system(s.c_str());
 }
-tmp_dir::~tmp_dir() { CK::fs::remove_all(this->path); }
+tmp_dir::~tmp_dir() { fs::remove_all(this->path); }
 } // namespace rtc
--- a/docs/sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
-rocm-docs-core==1.8.2
+rocm-docs-core==1.12.0
 sphinxcontrib-bibtex==2.6.3
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -103,7 +103,7 @@ requests==2.32.3
    # via
    #   pygithub
    #   sphinx
-rocm-docs-core==1.8.2
+rocm-docs-core==1.12.0
    # via -r requirements.in
 six==1.16.0
    # via pybtex

--- a/example/01_gemm/CMakeLists.txt
+++ b/example/01_gemm/CMakeLists.txt
@@ -80,9 +80,16 @@ add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8)
 add_example_executable(example_gemm_xdl_fp8_bf8 gemm_xdl_fp8_bf8.cpp)
 add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8_bf8)
+add_example_executable(example_gemm_xdl_fp8_streamk_v3 gemm_xdl_fp8_streamk_v3.cpp)
+add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8_streamk_v3)
 add_example_executable(example_gemm_xdl_fp16_fp8 gemm_xdl_fp16_fp8.cpp)
 add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_fp8)
 add_custom_target(example_gemm_wmma)
 add_example_executable(example_gemm_wmma_fp16 gemm_wmma_fp16.cpp)
 add_example_dependencies(example_gemm_wmma example_gemm_wmma_fp16)
+add_example_executable(example_gemm_wmma_bf16 gemm_wmma_bf16.cpp)
+add_example_dependencies(example_gemm_wmma example_gemm_wmma_bf16)
+add_example_executable(example_gemm_wmma_int8 gemm_wmma_int8.cpp)
+add_example_dependencies(example_gemm_wmma example_gemm_wmma_int8)
--- a/example/01_gemm/common.hpp
+++ b/example/01_gemm/common.hpp
@@ -29,9 +29,9 @@ struct ProblemSize final
    ck::index_t N = 4096;
    ck::index_t K = 4096;
-    ck::index_t StrideA = 0;
+    ck::index_t StrideA = -1;
-    ck::index_t StrideB = 0;
+    ck::index_t StrideB = -1;
-    ck::index_t StrideC = 0;
+    ck::index_t StrideC = -1;
 };
 struct ProblemSizeStreamK final
@@ -40,11 +40,11 @@ struct ProblemSizeStreamK final
    ck::index_t N = 4096;
    ck::index_t K = 4096;
-    ck::index_t StrideA = 0;
+    ck::index_t StrideA = -1;
-    ck::index_t StrideB = 0;
+    ck::index_t StrideB = -1;
-    ck::index_t StrideC = 0;
+    ck::index_t StrideC = -1;
-    ck::index_t NumSKBlocks = -1;
+    ck::index_t NumSKBlocks = -1; // number of stream-k blocks
 };
 struct ProblemSizeStreamK_universal final
 {
@@ -52,9 +52,9 @@ struct ProblemSizeStreamK_universal final
    ck::index_t N = 4096;
    ck::index_t K = 4096;
-    ck::index_t StrideA = 0;
+    ck::index_t StrideA = -1;
-    ck::index_t StrideB = 0;
+    ck::index_t StrideB = -1;
-    ck::index_t StrideC = 0;
+    ck::index_t StrideC = -1;
    ck::index_t Grid_size   = -1; // defaults to max occupancy
    ck::index_t Streamk_sel = 1;  // defaults to 1-tile SK
@@ -66,18 +66,19 @@ struct ProblemSizeSplitK final
    ck::index_t N = 4096;
    ck::index_t K = 4096;
-    ck::index_t StrideA = 0;
+    ck::index_t StrideA = -1;
-    ck::index_t StrideB = 0;
+    ck::index_t StrideB = -1;
-    ck::index_t StrideC = 0;
+    ck::index_t StrideC = -1;
    ck::index_t KBatch = 1;
 };
 struct ExecutionConfig final
 {
-    bool do_verification = true;
+    // 0 - no verification, 1 - CPU, 2 - GPU, 3 - CPU + GPU
-    int init_method      = 2;
+    int do_verification = 1;
-    bool time_kernel     = false;
+    int init_method     = 2;
+    bool time_kernel    = false;
 };
 template <ck::index_t... Is>
@@ -126,7 +127,7 @@ bool parse_cmd_args<ProblemSize>(int argc,
    }
    else
    {
-        std::cerr << "arg1: verification (0=no, 1=CPU and GPU)" << std::endl
+        std::cerr << "arg1: verification (0=no, 1=CPU, 2=GPU, 3=CPU and GPU)" << std::endl
                  << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)"
                  << std::endl
                  << "arg3: time kernel (0=no, 1=yes)" << std::endl
@@ -176,7 +177,7 @@ bool parse_cmd_args<ProblemSizeStreamK_universal>(int argc,
    else
    {
        std::cerr
-            << "arg1: verification (0=no, 1=CPU and GPU)" << std::endl
+            << "arg1: verification (0=no, 1=CPU, 2=GPU, 3=CPU and GPU)" << std::endl
            << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)" << std::endl
            << "arg3: time kernel (0=no, 1=yes)" << std::endl
            << "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC" << std::endl
@@ -225,7 +226,7 @@ bool parse_cmd_args<ProblemSizeStreamK>(int argc,
    }
    else
    {
-        std::cerr << "arg1: verification (0=no, 1=CPU and GPU)" << std::endl
+        std::cerr << "arg1: verification (0=no, 1=CPU, 2=GPU, 3=CPU and GPU)" << std::endl
                  << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)"
                  << std::endl
                  << "arg3: time kernel (0=no, 1=yes)" << std::endl
@@ -275,7 +276,7 @@ bool parse_cmd_args<ProblemSizeSplitK>(int argc,
    }
    else
    {
-        std::cerr << "arg1: verification (0=no, 1=CPU and GPU)" << std::endl
+        std::cerr << "arg1: verification (0=no, 1=CPU, 2=GPU, 3=CPU and GPU)" << std::endl
                  << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)"
                  << std::endl
                  << "arg3: time kernel (0=no, 1=yes)" << std::endl

--- a/example/01_gemm/gemm_wmma_bf16.cpp
+++ b/example/01_gemm/gemm_wmma_bf16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+#include "common.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp"
+using ADataType        = ck::bhalf_t;
+using BDataType        = ck::bhalf_t;
+using AccDataType      = float;
+using CShuffleDataType = float;
+using CDataType        = ck::bhalf_t;
+using ALayout = Row;
+using BLayout = Col;
+using CLayout = Row;
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CElementOp = PassThrough;
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+// clang-format off
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle // WMMA GEMM instance: bhalf_t A/B/C, float accumulator and shuffle type (see aliases above)
+         < ALayout,
+           BLayout,
+           CLayout,
+           ADataType,
+           BDataType,
+           CDataType,
+           AccDataType,
+           CShuffleDataType,
+           AElementOp,
+           BElementOp,
+           CElementOp,
+           GemmDefault,
+           1,           // Prefetch stage
+           128,         // BlockSize
+           64,          // MPerBlock
+           128,         // NPerBlock
+           64,          // KPerBlock
+           2,           // K1
+           16,          // MPerWmma
+           16,          // NPerWmma
+           2,           // M-Repeat; MPerBlock / (MPerWmma * M-Repeat) = 64 / (16 * 2) = 2, presumably the M-wave count - TODO confirm
+           4,           // N-Repeat; NPerBlock / (NPerWmma * N-Repeat) = 128 / (16 * 4) = 2, presumably the N-wave count - TODO confirm
+           S<4, 32, 1>, // NOTE(review): this and the next six arguments look like the A block-transfer settings - confirm against DeviceGemmWmma_CShuffle
+           S<1, 0, 2>,
+           S<1, 0, 2>,
+           2,
+           2,
+           2,
+           true,
+           S<4, 32, 1>, // NOTE(review): same seven-argument group repeated, presumably for the B block transfer - confirm
+           S<1, 0, 2>,
+           S<1, 0, 2>,
+           2,
+           2,
+           2,
+           true,
+           1,           // C shuffle (M Repeat) Per store
+           1,           // C shuffle (N Repeat) Per store
+           S<1, 32, 1,  4>, // NOTE(review): presumably the C block-transfer cluster lengths - confirm
+           8>;          // NOTE(review): presumably the C block-transfer scalars per vector - confirm
+// clang-format on
+using ReferenceGemmInstance = ck::tensor_operation::host::
+    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
+using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
+                                                                             BLayout,
+                                                                             CLayout,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             AccDataType,
+                                                                             AElementOp,
+                                                                             BElementOp,
+                                                                             CElementOp>;
+#include "run_gemm_example.inc"
+int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
--- a/example/01_gemm/gemm_wmma_int8.cpp
+++ b/example/01_gemm/gemm_wmma_int8.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+#include "common.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp"
+using ADataType        = int8_t;
+using BDataType        = int8_t;
+using AccDataType      = int32_t;
+using CShuffleDataType = int32_t;
+using CDataType        = int8_t;
+using ALayout = Row;
+using BLayout = Col;
+using CLayout = Row;
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CElementOp = PassThrough;
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+// clang-format off
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle // WMMA GEMM instance: int8_t A/B/C, int32_t accumulator and shuffle type (see aliases above)
+         < ALayout,
+           BLayout,
+           CLayout,
+           ADataType,
+           BDataType,
+           CDataType,
+           AccDataType,
+           CShuffleDataType,
+           AElementOp,
+           BElementOp,
+           CElementOp,
+           GemmDefault,
+           1,           // Prefetch stage
+           128,         // BlockSize
+           64,          // MPerBlock
+           128,         // NPerBlock
+           64,          // KPerBlock
+           2,           // K1
+           16,          // MPerWmma
+           16,          // NPerWmma
+           2,           // M-Repeat; MPerBlock / (MPerWmma * M-Repeat) = 64 / (16 * 2) = 2, presumably the M-wave count - TODO confirm
+           4,           // N-Repeat; NPerBlock / (NPerWmma * N-Repeat) = 128 / (16 * 4) = 2, presumably the N-wave count - TODO confirm
+           S<4, 32, 1>, // NOTE(review): this and the next six arguments look like the A block-transfer settings - confirm against DeviceGemmWmma_CShuffle
+           S<1, 0, 2>,
+           S<1, 0, 2>,
+           2,
+           2,
+           2,
+           true,
+           S<4, 32, 1>, // NOTE(review): same seven-argument group repeated, presumably for the B block transfer - confirm
+           S<1, 0, 2>,
+           S<1, 0, 2>,
+           2,
+           2,
+           2,
+           true,
+           1,           // C shuffle (M Repeat) Per store
+           1,           // C shuffle (N Repeat) Per store
+           S<1, 32, 1,  4>, // NOTE(review): presumably the C block-transfer cluster lengths - confirm
+           8>;          // NOTE(review): presumably the C block-transfer scalars per vector - confirm
+// clang-format on
+using ReferenceGemmInstance = ck::tensor_operation::host::
+    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
+using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
+                                                                             BLayout,
+                                                                             CLayout,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             AccDataType,
+                                                                             AElementOp,
+                                                                             BElementOp,
+                                                                             CElementOp>;
+#include "run_gemm_example.inc"
+int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
--- a/example/01_gemm/gemm_xdl_bf16_streamk_v3.cpp
+++ b/example/01_gemm/gemm_xdl_bf16_streamk_v3.cpp
@@ -23,14 +23,36 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa
 // // clang-format off
 // using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
-// // ######| ALayout| BLayout| CLayout|     AData|     BData|     CData|     AccData|         CShuffle|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+// // ######| ALayout| BLayout| CLayout|     AData|     BData|     CData|     AccData| CShuffle| A|
-// // ######|        |        |        |      Type|      Type|      Type|        Type|         DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+// B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl|
-// // ######|        |        |        |          |          |          |            |                 |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+// NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer|
-// // ######|        |        |        |          |          |          |            |                 |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+// ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer|
-//          < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
+// BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|
+// CBlockTransferClusterLengths|  CBlockTransfer|
+// // ######|        |        |        |      Type|      Type|      Type|        Type| DataType|
+// Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |
+// |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|
+// SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|
+// SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|
+// _MBlock_MWaveMPerXdl| ScalarPerVector|
+// // ######|        |        |        |          |          |          |            | | Operation|
+// Operation|   Operation|               |    Stage|      |      |      |      |    |    |     | |
+// Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|
+// PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |
+// PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|
+// _NWaveNPerXdl|
+// // ######|        |        |        |          |          |          |            | | | | | | |
+// |      |      |      |    |    |     |     |     |     |                |               | | | |
+// |          |                |               |               |              |               | | |
+// |            |                             |                |
+//          < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType,
+//          CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1, 256,
+//          256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,
+//          S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,
+//          S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1, 1,
+//          1,               S<1, 32, 1, 8>,               8>;
 // // clang-format on
 // clang-format off
 using DeviceGemmV2_Streamk_Instance = 
    ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle_Streamk_V3<
@@ -50,10 +72,20 @@ using DeviceGemmV2_Streamk_Instance =
        ck::BlockGemmPipelineScheduler::Intrawave,ck::BlockGemmPipelineVersion::v3>;
 // clang-format on
 using ReferenceGemmInstance = ck::tensor_operation::host::
    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
+using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
+                                                                             BLayout,
+                                                                             CLayout,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             AccDataType,
+                                                                             AElementOp,
+                                                                             BElementOp,
+                                                                             CElementOp>;
 #include "run_gemm_example_streamk_v2.inc"
 int main(int argc, char* argv[]) { return !run_gemm_universal_streamk_example(argc, argv); }
--- a/example/01_gemm/gemm_xdl_fp16_streamk_v3.cpp
+++ b/example/01_gemm/gemm_xdl_fp16_streamk_v3.cpp
@@ -8,7 +8,7 @@
 using ADataType        = ck::half_t;
 using BDataType        = ck::half_t;
 using AccDataType      = float;
-using CShuffleDataType = ck::half_t;
+using CShuffleDataType = float;
 using CDataType        = ck::half_t;
 using ALayout = Row;
@@ -43,6 +43,17 @@ using DeviceGemmV2_Streamk_Instance =
 using ReferenceGemmInstance = ck::tensor_operation::host::
    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
+using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
+                                                                             BLayout,
+                                                                             CLayout,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             AccDataType,
+                                                                             AElementOp,
+                                                                             BElementOp,
+                                                                             CElementOp>;
 #include "run_gemm_example_streamk_v2.inc"
 int main(int argc, char* argv[]) { return !run_gemm_universal_streamk_example(argc, argv); }
--- a/example/01_gemm/gemm_xdl_fp8_streamk_v3.cpp
+++ b/example/01_gemm/gemm_xdl_fp8_streamk_v3.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include "common.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp"
+using ADataType        = ck::f8_t;
+using BDataType        = ck::f8_t;
+using AccDataType      = float;
+using CShuffleDataType = ck::half_t;
+using CDataType        = ck::half_t;
+using ALayout = Row;
+using BLayout = Col;
+using CLayout = Row;
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CElementOp = PassThrough;
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+// clang-format off
+using DeviceGemmV2_Streamk_Instance = 
+    ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle_Streamk_V3<
+        ALayout,   BLayout,  CLayout,   
+        ADataType,   BDataType,  CDataType,  AccDataType,  CShuffleDataType, 
+        PassThrough, PassThrough, PassThrough, GemmDefault, 
+          256,
+        128, 256, 
+        128, 16, 16,
+        16,   16,
+        4,    8, 
+        S<8, 32, 1>,  S<1, 0, 2>,  S<1, 0, 2>, 
+        2, 16, 16, 1,
+        S<8, 32, 1>,  S<1, 0, 2>,  S<1, 0, 2>, 
+        2, 16, 16, 1,
+        1, 2, S<1, 32, 1, 8>, 8,
+        ck::BlockGemmPipelineScheduler::Intrawave,ck::BlockGemmPipelineVersion::v3, ck::f8_t>;
+// clang-format on
+using ReferenceGemmInstance = ck::tensor_operation::host::
+    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
+using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
+                                                                             BLayout,
+                                                                             CLayout,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             AccDataType,
+                                                                             AElementOp,
+                                                                             BElementOp,
+                                                                             CElementOp>;
+#include "run_gemm_example_streamk_v2.inc"
+int main(int argc, char* argv[]) { return !run_gemm_universal_streamk_example(argc, argv); }
--- a/example/01_gemm/run_gemm_example.inc
+++ b/example/01_gemm/run_gemm_example.inc
@@ -116,21 +116,21 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
        };
    auto f_get_default_stride =
-        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+        [](std::size_t row, std::size_t col, ck::index_t stride, auto layout) {
-            if(stride == 0)
+            if(stride == -1)
            {
-                // give a chance if stride is zero, return a default packed stride
+                // give a chance if stride is -1, return a default packed stride
                if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
                {
-                    return col;
+                    return static_cast<std::size_t>(col);
                }
                else
                {
-                    return row;
+                    return static_cast<std::size_t>(row);
                }
            }
            else
-                return stride;
+                return static_cast<std::size_t>(stride);
        };
    StrideA = f_get_default_stride(M, K, StrideA, ALayout{});
@@ -143,8 +143,8 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
    switch(config.init_method)
    {
    case 0:
-        ck::utils::FillConstant<ADataType>{static_cast<ADataType>(1.f)}(a_m_k);
+        ck::utils::FillConstant<ADataType>{ck::type_convert<ADataType>(1.f)}(a_m_k);
-        ck::utils::FillConstant<BDataType>{static_cast<BDataType>(1.f)}(b_k_n);
+        ck::utils::FillConstant<BDataType>{ck::type_convert<BDataType>(1.f)}(b_k_n);
        break;
    case 1:
        ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_m_k);
@@ -330,7 +330,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
    bool pass = true;
-    if(config.do_verification)
+    if((config.do_verification == 1) || (config.do_verification == 3))
    {
        // CPU verification
        auto ref_gemm    = ReferenceGemmInstance{};
@@ -353,13 +353,16 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
 #else
        c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
-        pass &= !ck::utils::check_err(c_m_n_device_result,
+        pass &= ck::utils::check_err(c_m_n_device_result,
-                                      c_m_n_host_result,
+                                     c_m_n_host_result,
-                                      "Error: Incorrect results!",
+                                     "Error: Incorrect results!",
-                                      get_rtol<CDataType>(),
+                                     get_rtol<CDataType>(),
-                                      get_atol<CDataType>());
+                                     get_atol<CDataType>());
 #endif
+    }
+    if((config.do_verification == 2) || (config.do_verification == 3))
+    {
        // GPU verification
        auto ref_gemm_gpu    = ReferenceGemmInstanceGPU{};
        auto ref_invoker_gpu = ref_gemm_gpu.MakeInvoker();
@@ -381,14 +384,14 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
        c_m_n_device_ref_buf.FromDevice(c_m_n_device_ref_result.mData.data());
        c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
-        pass &= !ck::utils::check_err(c_m_n_device_result,
+        pass &= ck::utils::check_err(c_m_n_device_result,
-                                      c_m_n_device_ref_result,
+                                     c_m_n_device_ref_result,
-                                      "Error: Incorrect results!",
+                                     "Error: Incorrect results!",
-                                      get_rtol<CDataType>(),
+                                     get_rtol<CDataType>(),
-                                      get_atol<CDataType>());
+                                     get_atol<CDataType>());
    }
-    return !pass;
+    return pass == true;
 }
 bool run_gemm_example(int argc, char* argv[])

--- a/example/01_gemm/run_gemm_example_streamk_v2.inc
+++ b/example/01_gemm/run_gemm_example_streamk_v2.inc
@@ -117,9 +117,9 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
    auto f_get_default_stride =
        [](std::size_t row, std::size_t col, ck::index_t stride, auto layout) {
-            if(stride == 0)
+            if(stride == -1)
            {
-                // give a chance if stride is 0, return a default packed stride
+                // give a chance if stride is -1, return a default packed stride
                if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
                {
                    return static_cast<std::size_t>(col);
@@ -176,13 +176,13 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
    Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
    Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+    Tensor<CDataType> c_m_n_device_ref_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
    std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
-    //Added By Emin
+    // Added By Emin
    // Added By Emin
@@ -201,6 +201,8 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
+    DeviceMem c_m_n_device_ref_buf(sizeof(CDataType) *
+                                   c_m_n_device_ref_result.mDesc.GetElementSpaceSize());
    a_m_k_device_buf.ToDevice(a_m_k.mData.data());
    b_k_n_device_buf.ToDevice(b_k_n.mData.data());
@@ -245,8 +247,15 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
        return true;
    }
+    std::size_t workspace_size = gemm.GetWorkSpaceSize(&argument);
+    if(workspace_size != 0)
+    {
+        workspace.Realloc(workspace_size);
+        gemm.SetWorkSpacePointer(&argument, workspace.GetDeviceBuffer());
+    }
    bool pass = true;
-    if(config.do_verification)
+    if((config.do_verification == 1) || (config.do_verification == 3))
    {
        auto ref_gemm    = ReferenceGemmInstance{};
        auto ref_invoker = ref_gemm.MakeInvoker();
@@ -276,6 +285,36 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
 #endif
    }
+    if((config.do_verification == 2) || (config.do_verification == 3))
+    {
+        // GPU verification
+        auto ref_gemm_gpu    = ReferenceGemmInstanceGPU{};
+        auto ref_invoker_gpu = ref_gemm_gpu.MakeInvoker();
+        auto ref_argument_gpu = ref_gemm_gpu.MakeArgument(
+            static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
+            static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
+            static_cast<CDataType*>(c_m_n_device_ref_buf.GetDeviceBuffer()),
+            M,
+            N,
+            K,
+            a_element_op,
+            b_element_op,
+            c_element_op);
+        std::cout << "Running verification on GPU." << std::endl;
+        ref_invoker_gpu.Run(ref_argument_gpu, StreamConfig{});
+        c_m_n_device_ref_buf.FromDevice(c_m_n_device_ref_result.mData.data());
+        c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
+        pass &= ck::utils::check_err(c_m_n_device_result,
+                                     c_m_n_device_ref_result,
+                                     "Error: Incorrect results!",
+                                     get_rtol<CDataType>(),
+                                     get_atol<CDataType>());
+    }
    if(config.time_kernel)
    {
        ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});

--- a/example/01_gemm/run_gemm_example_v2.inc
+++ b/example/01_gemm/run_gemm_example_v2.inc
@@ -115,21 +115,21 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
        };
    auto f_get_default_stride =
-        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+        [](std::size_t row, std::size_t col, ck::index_t stride, auto layout) {
-            if(stride == 0)
+            if(stride == -1)
            {
-                // give a chance if stride is zero, return a default packed stride
+                // give a chance if stride is -1, return a default packed stride
                if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
                {
-                    return col;
+                    return static_cast<std::size_t>(col);
                }
                else
                {
-                    return row;
+                    return static_cast<std::size_t>(row);
                }
            }
            else
-                return stride;
+                return static_cast<std::size_t>(stride);
        };
    StrideA = f_get_default_stride(M, K, StrideA, ALayout{});
@@ -228,7 +228,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
    }
    bool pass = true;
-    if(config.do_verification)
+    if((config.do_verification == 1) || (config.do_verification == 3))
    {
        auto ref_gemm    = ReferenceGemmInstance{};
        auto ref_invoker = ref_gemm.MakeInvoker();
@@ -261,7 +261,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
    if(config.time_kernel)
    {
        ave_time =
-            invoker.Run(argument, StreamConfig{nullptr, config.time_kernel, 0, 5, 10, true, 4});
+            invoker.Run(argument, StreamConfig{nullptr, config.time_kernel, 0, 50, 100, true, 4});
        std::size_t flop = 2_uz * M * N * K;
        std::size_t num_btype =