Merge remote-tracking branch 'origin/develop' into ck_tile/fav3_fwd_sept

6975cb8f · carlushuang · 33aff2ef · 6834e5ee · 6975cb8f · 6975cb8f
Commit 6975cb8f authored Sep 16, 2024 by carlushuang
20 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -26,11 +26,15 @@ set(version 1.1.0)
 project(composable_kernel VERSION ${version} LANGUAGES CXX HIP)
 include(CTest)
+# Usage: for customized Python location cmake -DCK_USE_ALTERNATIVE_PYTHON="/opt/Python-3.8.13/bin/python3.8"
+# CK Codegen requires dataclass which is added in Python 3.7
+# Python version 3.8 is required for general good practice as it is default for Ubuntu 20.04
 if(NOT CK_USE_ALTERNATIVE_PYTHON)
-   find_package(Python3 3.6 COMPONENTS Interpreter REQUIRED)
+   find_package(Python3 3.8 COMPONENTS Interpreter REQUIRED)
 else()
   message("Using alternative python version")
   set(EXTRA_PYTHON_PATH)
+   # this is overly restrictive, we may need to be more flexible on the following
   string(REPLACE "/bin/python3.8" "" EXTRA_PYTHON_PATH "${CK_USE_ALTERNATIVE_PYTHON}")
   message("alternative python path is: ${EXTRA_PYTHON_PATH}")
   find_package(Python3 3.6 COMPONENTS Interpreter REQUIRED)

--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -703,7 +703,7 @@ def process_results(Map conf=[:]){
 }
 //launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
-CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.2; RUN_CK_TILE_TESTS=true
+CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.2;RUN_CK_TILE_FMHA_TESTS=;RUN_CK_TILE_GEMM_TESTS=true
                                              0 21 * * * % ROCMVERSION=6.2;hipTensor_test=true
                                              0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
                                              0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline-open;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
@@ -775,9 +775,13 @@ pipeline {
            defaultValue: false,
            description: "Run the grouped conv large cases tests (default: OFF)")
        booleanParam(
-            name: "RUN_CK_TILE_TESTS",
+            name: "RUN_CK_TILE_FMHA_TESTS",
            defaultValue: false,
-            description: "Run the ck_tile tests (default: OFF)")
+            description: "Run the ck_tile FMHA tests (default: OFF)")
+        booleanParam(
+            name: "RUN_CK_TILE_GEMM_TESTS",
+            defaultValue: false,
+            description: "Run the ck_tile GEMM tests (default: OFF)")
        booleanParam(
            name: "BUILD_INSTANCES_ONLY",
            defaultValue: false,
@@ -894,15 +898,15 @@ pipeline {
                }
            }
        }
-        stage("Run CK_TILE Tests")
+        stage("Run CK_TILE_FMHA Tests")
        {
            parallel
            {
-                stage("Run CK_TILE Tests on gfx90a")
+                stage("Run CK_TILE_FMHA Tests on gfx90a")
                {
                    when {
                        beforeAgent true
-                        expression { params.RUN_CK_TILE_TESTS.toBoolean() }
+                        expression { params.RUN_CK_TILE_FMHA_TESTS.toBoolean() }
                    }
                    agent{ label rocmnode("gfx90a") }
                    environment{
@@ -917,11 +921,11 @@ pipeline {
                        cleanWs()
                    }
                }
-                stage("Run CK_TILE Tests on gfx942")
+                stage("Run CK_TILE_FMHA Tests on gfx942")
                {
                    when {
                        beforeAgent true
-                        expression { params.RUN_CK_TILE_TESTS.toBoolean() }
+                        expression { params.RUN_CK_TILE_FMHA_TESTS.toBoolean() }
                    }
                    agent{ label rocmnode("gfx942") }
                    environment{
@@ -937,6 +941,52 @@ pipeline {
                    }
                }
            }
+        }
+        stage("Run CK_TILE_GEMM Tests")
+        {
+            parallel
+            {
+                stage("Run CK_TILE_GEMM Tests on gfx90a")
+                {
+                    when {
+                        beforeAgent true
+                        expression { params.RUN_CK_TILE_GEMM_TESTS.toBoolean() }
+                    }
+                    agent{ label rocmnode("gfx90a") }
+                    environment{
+                        setup_args = "NO_CK_BUILD"
+                        execute_args = """ ../script/cmake-ck-dev.sh  ../ gfx90a && \
+                                           make -j64 tile_example_gemm_basic && \
+                                           cd ../ &&
+                                           example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """
+                   }
+                    steps{
+                        buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
+                        cleanWs()
+                    }
+                }
+                stage("Run CK_TILE_GEMM Tests on gfx942")
+                {
+                    when {
+                        beforeAgent true
+                        expression { params.RUN_CK_TILE_GEMM_TESTS.toBoolean() }
+                    }
+                    agent{ label rocmnode("gfx942") }
+                    environment{
+                        setup_args = "NO_CK_BUILD"
+                        execute_args = """ ../script/cmake-ck-dev.sh  ../ gfx942 && \
+                                           make -j64 tile_example_gemm_basic && \
+                                           cd ../ &&
+                                           example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """
+                   }
+                    steps{
+                        buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
+                        cleanWs()
+                    }
+                }
+            }
        }
 		stage("Build CK and run Tests")
        {

--- a/codegen/test/CMakeLists.txt
+++ b/codegen/test/CMakeLists.txt
@@ -6,6 +6,9 @@ if(NOT INSTANCES_ONLY)
    set_source_files_properties(${TEST_SRC} PROPERTIES LANGUAGE HIP)
    get_filename_component(BASE_NAME ${TEST_SRC} NAME_WE)
    add_executable(codegen_test_${BASE_NAME} ${TEST_SRC})
+    if(CK_USE_ALTERNATIVE_PYTHON)
+      target_link_options(codegen_test_${BASE_NAME} PRIVATE -lstdc++fs)
+    endif()
    add_dependencies(codegen codegen_test_${BASE_NAME})
    add_dependencies(tests codegen_test_${BASE_NAME})
    add_dependencies(check codegen_test_${BASE_NAME})

--- a/codegen/test/rtc/include/rtc/compile_kernel.hpp
+++ b/codegen/test/rtc/include/rtc/compile_kernel.hpp
@@ -2,14 +2,14 @@
 #define GUARD_HOST_TEST_RTC_INCLUDE_RTC_COMPILE_KERNEL
 #include <rtc/kernel.hpp>
-#include <filesystem>
+#include <ck/filesystem.hpp>
 #include <string>
 namespace rtc {
 struct src_file
 {
-    std::filesystem::path path;
+    CK::fs::path path;
    std::string_view content;
 };

--- a/codegen/test/rtc/include/rtc/tmp_dir.hpp
+++ b/codegen/test/rtc/include/rtc/tmp_dir.hpp
@@ -2,13 +2,13 @@
 #define GUARD_HOST_TEST_RTC_INCLUDE_RTC_TMP_DIR
 #include <string>
-#include <filesystem>
+#include <ck/filesystem.hpp>
 namespace rtc {
 struct tmp_dir
 {
-    std::filesystem::path path;
+    CK::fs::path path;
    tmp_dir(const std::string& prefix = "");
    void execute(const std::string& cmd) const;

--- a/codegen/test/rtc/src/compile_kernel.cpp
+++ b/codegen/test/rtc/src/compile_kernel.cpp
@@ -70,9 +70,9 @@ kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options
    for(const auto& src : srcs)
    {
-        std::filesystem::path full_path   = td.path / src.path;
+        CK::fs::path full_path   = td.path / src.path;
-        std::filesystem::path parent_path = full_path.parent_path();
+        CK::fs::path parent_path = full_path.parent_path();
-        std::filesystem::create_directories(parent_path);
+        CK::fs::create_directories(parent_path);
        write_string(full_path.string(), src.content);
        if(src.path.extension().string() == ".cpp")
        {
@@ -86,7 +86,7 @@ kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options
    td.execute(compiler() + options.flags);
    auto out_path = td.path / out;
-    if(not std::filesystem::exists(out_path))
+    if(not CK::fs::exists(out_path))
        throw std::runtime_error("Output file missing: " + out);
    auto obj = read_buffer(out_path.string());

--- a/codegen/test/rtc/src/tmp_dir.cpp
+++ b/codegen/test/rtc/src/tmp_dir.cpp
@@ -31,10 +31,10 @@ std::string unique_string(const std::string& prefix)
 }
 tmp_dir::tmp_dir(const std::string& prefix)
-    : path(std::filesystem::temp_directory_path() /
+    : path(CK::fs::temp_directory_path() /
           unique_string(prefix.empty() ? "ck-rtc" : "ck-rtc-" + prefix))
 {
-    std::filesystem::create_directories(this->path);
+    CK::fs::create_directories(this->path);
 }
 void tmp_dir::execute(const std::string& cmd) const
@@ -43,6 +43,6 @@ void tmp_dir::execute(const std::string& cmd) const
    std::system(s.c_str());
 }
-tmp_dir::~tmp_dir() { std::filesystem::remove_all(this->path); }
+tmp_dir::~tmp_dir() { CK::fs::remove_all(this->path); }
 } // namespace rtc
--- a/docs/sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
 rocm-docs-core==1.7.2
-sphinxcontrib-bibtex==2.6.2
+sphinxcontrib-bibtex==2.6.3
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -137,7 +137,7 @@ sphinx-notfound-page==1.0.3
    # via rocm-docs-core
 sphinxcontrib-applehelp==2.0.0
    # via sphinx
-sphinxcontrib-bibtex==2.6.2
+sphinxcontrib-bibtex==2.6.3
    # via -r requirements.in
 sphinxcontrib-devhelp==2.0.0
    # via sphinx

--- a/example/ck_tile/01_fmha/CMakeLists.txt
+++ b/example/ck_tile/01_fmha/CMakeLists.txt
@@ -22,12 +22,20 @@ string(REPLACE ";" "," FMHA_FWD_APIS "${FMHA_FWD_ENABLE_APIS}")
 execute_process(
  COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py
  --api ${FMHA_FWD_APIS} --list_blobs ${CMAKE_CURRENT_BINARY_DIR}/fwd_blob_list.txt
+  RESULT_VARIABLE ret
 )
+if(ret AND NOT ret EQUAL 0)
+  message( FATAL_ERROR "CK Tile FMHA FAILED to genrate a list of FWD kernels via Python.")
+endif()
 execute_process(
  COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py
  --api bwd --list_blobs ${CMAKE_CURRENT_BINARY_DIR}/bwd_blob_list.txt --receipt 3
+  RESULT_VARIABLE ret
 )
+if(ret AND NOT ret EQUAL 0)
+  message( FATAL_ERROR "CK Tile FMHA FAILED to genrate a list of BWD kernels via Python.")
+endif()
 # NOTE: for cmake, the FMHA_FWD_GEN_BLOBS/FMHA_BWD_GEN_BLOBS files must be in the same directory
 #       as current cmake list, otherwise will not figure out the dependency properly

--- a/example/ck_tile/03_gemm/gemm_basic.cpp
+++ b/example/ck_tile/03_gemm/gemm_basic.cpp
@@ -3,7 +3,7 @@
 // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
 #include "gemm_basic.hpp"
-#include "ck_tile/host.hpp"
+#include <hip/hip_runtime.h>
 #include <cstring>
 #include <iostream>
@@ -21,7 +21,7 @@ auto create_args(int argc, char* argv[])
        .insert("stride_a", "0", "Tensor A stride")
        .insert("stride_b", "0", "Tensor B stride")
        .insert("stride_c", "0", "Tensor C stride")
-        .insert("v", "1", "cpu validation or not")
+        .insert("v", "2", "0. No validation, 1. Validation on CPU, 2. Validation on GPU")
        .insert("e", "1e-5", "Absolute error tolerance")
        .insert("prec", "fp16", "data type. fp16/bf16/fp8/bf8")
        .insert("warmup", "10", "number of iterations before benchmark the kernel")
@@ -32,41 +32,22 @@ auto create_args(int argc, char* argv[])
    return std::make_tuple(result, arg_parser);
 }
-template <typename LayoutA, typename LayoutB, typename LayoutC>
+template <typename LayoutA,
+          typename LayoutB,
+          typename LayoutC,
+          typename PipelineProblem,
+          typename GemmPipeline,
+          typename GemmShape>
 float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
 {
-    // ToDo: This will be modified by the codegen code later.
-    constexpr ck_tile::index_t M_Tile = 128;
-    constexpr ck_tile::index_t N_Tile = 128;
-    constexpr ck_tile::index_t K_Tile = 32;
-    constexpr ck_tile::index_t M_Warp = 2;
-    constexpr ck_tile::index_t N_Warp = 2;
-    constexpr ck_tile::index_t K_Warp = 1;
-    constexpr ck_tile::index_t M_Warp_Tile = 32;
-    constexpr ck_tile::index_t N_Warp_Tile = 32;
-    constexpr ck_tile::index_t K_Warp_Tile = 8;
    // The kPadA, kPadB, kPadC & kBlockPerCu should also come from the Codegen part.
    constexpr bool kPadA = true;
    constexpr bool kPadB = true;
-    constexpr bool kPadC = false;
    constexpr int kBlockPerCu = 1;
-    // ===============================================
-    using GemmShape =
-        ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
-                               ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
-                               ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
    using TilePartitioner = ck_tile::GemmTilePartitioner<GemmShape>;
-    using PipelineProblem = ck_tile::
+    using GemmEpilogue    = ck_tile::Default2DEpilogue<
-        BlockGemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, kPadA, kPadB, kPadC>;
-    // The GemmPipeline should also come from the Codegen.
-    using GemmPipeline = ck_tile::BlockGemmPipelineAGmemBGmemCRegV1<PipelineProblem>;
-    using GemmEpilogue = ck_tile::Default2DEpilogue<
        ck_tile::Default2DEpilogueProblem<AccDataType, CDataType, kPadA, kPadB>>;
    // ToDo: Will add the codegen part to test different pipeline policies in GEMM.
    // Now we only use the BlockGemmASmemBSmemCRegV1DefaultPolicy.
@@ -93,7 +74,13 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
    return ave_time;
 }
-template <typename DataType, typename LayoutA, typename LayoutB, typename LayoutC>
+template <typename DataType,
+          typename LayoutA,
+          typename LayoutB,
+          typename LayoutC,
+          typename PipelineProblem,
+          typename GemmPipeline,
+          typename GemmShape>
 float invoke_gemm(ck_tile::DeviceMem& a_buf,
                  ck_tile::DeviceMem& b_buf,
                  ck_tile::DeviceMem& c_buf,
@@ -155,7 +142,7 @@ float invoke_gemm(ck_tile::DeviceMem& a_buf,
    else
    {
        args.stride_B = [&]() {
-            if constexpr(std::is_same_v<LayoutB, ck_tile::tensor_layout::gemm::ColumnMajor>)
+            if constexpr(std::is_same_v<LayoutB, ck_tile::tensor_layout::gemm::RowMajor>)
            {
                return N;
            }
@@ -184,8 +171,8 @@ float invoke_gemm(ck_tile::DeviceMem& a_buf,
        }();
    }
-    float ave_time =
+    float ave_time = gemm_calc<LayoutA, LayoutB, LayoutC, PipelineProblem, GemmPipeline, GemmShape>(
-        gemm_calc<LayoutA, LayoutB, LayoutC>(args, ck_tile::stream_config{nullptr, true});
+        args, ck_tile::stream_config{nullptr, true});
    std::size_t num_byte =
        sizeof(ADataType) * M * K + sizeof(BDataType) * N * K + sizeof(CDataType) * M * N;
    float gb_per_sec = num_byte / 1.E6 / ave_time;
@@ -212,7 +199,7 @@ int main(int argc, char* argv[])
    // The Matrix Multiplication goes with Matrix A (M, K), Matrix B (N, K) = Matrix C (M, N).
    using matrix_a_layout = ck_tile::tensor_layout::gemm::RowMajor;
-    using matrix_b_layout = ck_tile::tensor_layout::gemm::RowMajor;
+    using matrix_b_layout = ck_tile::tensor_layout::gemm::ColumnMajor;
    using matrix_c_layout = ck_tile::tensor_layout::gemm::RowMajor;
    // host verify
@@ -221,7 +208,7 @@ int main(int argc, char* argv[])
            ? std::vector<int>{M, K}
            : std::vector<int>{K, M};
    std::vector<int> b_dimensions =
-        (std::is_same_v<matrix_b_layout, ck_tile::tensor_layout::gemm::RowMajor>)
+        (std::is_same_v<matrix_b_layout, ck_tile::tensor_layout::gemm::ColumnMajor>)
            ? std::vector<int>{N, K}
            : std::vector<int>{K, N};
    std::vector<int> c_dimensions =
@@ -245,12 +232,52 @@ int main(int argc, char* argv[])
    a_buf.ToDevice(a_host.data());
    b_buf.ToDevice(b_host.data());
-    invoke_gemm<ck_tile::half_t, matrix_a_layout, matrix_b_layout, matrix_c_layout>(
+    // The kPadA, kPadB, kPadC & kBlockPerCu should also come from the Codegen part.
-        a_buf, b_buf, c_buf, arg_parser);
+    constexpr bool kPadA = true;
+    constexpr bool kPadB = true;
+    constexpr bool kPadC = false;
-    bool pass = true;
+    // This part comes from the Codegen
+    constexpr ck_tile::index_t M_Tile = 128;
+    constexpr ck_tile::index_t N_Tile = 128;
+    constexpr ck_tile::index_t K_Tile = 32;
-    if(arg_parser.get_bool("v"))
+    constexpr ck_tile::index_t M_Warp = 2;
+    constexpr ck_tile::index_t N_Warp = 2;
+    constexpr ck_tile::index_t K_Warp = 1;
+    constexpr ck_tile::index_t M_Warp_Tile = 32;
+    constexpr ck_tile::index_t N_Warp_Tile = 32;
+    constexpr ck_tile::index_t K_Warp_Tile = 8;
+    using CodegenGemmShape =
+        ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
+                               ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
+                               ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
+    using CodegenPipelineProblem = ck_tile::BlockGemmPipelineProblem<ADataType,
+                                                                     BDataType,
+                                                                     AccDataType,
+                                                                     CodegenGemmShape,
+                                                                     kPadA,
+                                                                     kPadB,
+                                                                     kPadC>;
+    using CodegenGemmPipeline = ck_tile::BlockGemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>;
+    invoke_gemm<ck_tile::half_t,
+                matrix_a_layout,
+                matrix_b_layout,
+                matrix_c_layout,
+                CodegenPipelineProblem,
+                CodegenGemmPipeline,
+                CodegenGemmShape>(a_buf, b_buf, c_buf, arg_parser);
+    c_buf.FromDevice(c_host_dev.data());
+    bool pass_cpu = true;
+    if(arg_parser.get_int("v") == 1)
    {
        // ToDo: Will Add the Element Op (bias) verification in the future.
        ck_tile::reference_gemm<ADataType,
@@ -261,14 +288,71 @@ int main(int argc, char* argv[])
                                matrix_b_layout,
                                matrix_c_layout>(a_host, b_host, c_host_ref);
-        c_buf.FromDevice(c_host_dev.data());
+        pass_cpu = ck_tile::check_err(c_host_dev, c_host_ref);
+        std::cout << "The CPU veification result is:" << (pass_cpu ? "correct" : "fail")
+                  << std::flush;
+    }
+    bool pass_gpu = true;
+    if(arg_parser.get_int("v") == 2)
+    {
+        ck_tile::index_t stride_a = arg_parser.get_int("stride_a");
+        ck_tile::index_t stride_b = arg_parser.get_int("stride_b");
+        ck_tile::index_t stride_c = arg_parser.get_int("stride_c");
+        if(stride_a == 0)
+        {
+            if constexpr(std::is_same_v<matrix_a_layout, ck_tile::tensor_layout::gemm::ColumnMajor>)
+            {
+                stride_a = M;
+            }
+            else
+            {
+                stride_a = K;
+            }
+        }
+        if(stride_b == 0)
+        {
+            if constexpr(std::is_same_v<matrix_b_layout, ck_tile::tensor_layout::gemm::RowMajor>)
+            {
+                stride_b = N;
+            }
+            else
+            {
+                stride_b = K;
+            }
+        }
+        if(stride_c == 0)
+        {
+            if constexpr(std::is_same_v<matrix_c_layout, ck_tile::tensor_layout::gemm::ColumnMajor>)
+            {
+                stride_c = M;
+            }
+            else
+            {
+                stride_c = N;
+            }
+        }
+        ck_tile::HostTensor<CDataType> c_host_gpu_ref(c_dimensions);
+        ck_tile::DeviceMem c_gpu_buf(c_host_gpu_ref.get_element_space_size_in_bytes());
+        ck_tile::reference_gemm_gpu<ADataType, BDataType, AccDataType, CDataType>(
+            a_buf, b_buf, c_gpu_buf, M, N, K, stride_a, stride_b, stride_c);
+        c_buf.FromDevice(c_host_gpu_ref.data());
-        pass = ck_tile::check_err(c_host_dev, c_host_ref);
+        pass_gpu = ck_tile::check_err(c_host_dev, c_host_gpu_ref);
-        std::cout << "The veification result is:" << (pass ? "correct" : "fail") << std::flush;
+        std::cout << "The GPU veification result is:" << (pass_gpu ? "correct" : "fail")
+                  << std::flush;
    }
    std::cout << std::endl << std::flush;
-    return !pass;
+    return !pass_gpu;
 }
--- a/example/ck_tile/03_gemm/gemm_basic.hpp
+++ b/example/ck_tile/03_gemm/gemm_basic.hpp
@@ -8,6 +8,7 @@
 #include "ck_tile/host/kernel_launch.hpp"
 #include "ck_tile/ops/epilogue.hpp"
 #include "ck_tile/ops/gemm.hpp"
+#include "ck_tile/host.hpp"
 #include <string>
 template <typename DataType>

--- a/example/ck_tile/03_gemm/script/run_full_test.sh
+++ b/example/ck_tile/03_gemm/script/run_full_test.sh
+#!/bin/bash 
+#
+# in order to run this script you'd first need to build the tile_example_gemm executables in ../build/bin/
+#
+# run the script as "./run_full_test.sh <tag for your test environment> <branch name> <host name> <gpu_arch>
+# input arguments: 
+# environment tag  : a string describing the specifics of your test environment
+# branch name      : name of the branch in git repo (git status | grep -e 'On branch')
+# host name        : $hostname
+# gpu architecture: e.g., gfx90a, or gfx942, etc.
+# get the command line arguments:
+export env_type=$1
+echo 'Environment type: ' $env_type
+export branch=$2
+echo 'Branch name: ' $branch
+export host_name=$3
+echo 'Host name: ' $host_name
+export GPU_arch=$4
+echo 'GPU_arch: ' $GPU_arch
+# run verification tests
+example/ck_tile/03_gemm/script/smoke_test.sh
+# We do not have a performance benchmark for gemm yet. Will add it in the future.
\ No newline at end of file
--- a/example/ck_tile/03_gemm/script/smoke_test.sh
+++ b/example/ck_tile/03_gemm/script/smoke_test.sh
+#!/bin/bash
+EXE="$(find . -name tile_example_gemm_basic -type f | head -n 1)"
+KNAME=1
+export CK_WARMUP=0
+export CK_REPEAT=1
+COMMON_ARGS='-v=2 -warmup=0 -repeat=1'
+run_fp16_tests() {
+    for batch in 1 2; do
+        for m in 128 1024; do
+            for n in 128 2048; do
+                for k in 32 64; do
+                    $EXE -b=$batch -m=$m -n=$n -k=$k -stride_a=0 -stride_b=0 -stride_c=0 -e=1e-5 -prec=fp16 $COMMON_ARGS
+                    if [ $? -eq 0 ]; then
+                        echo "Success: Test with batch=$batch, m=$m, n=$n, k=$k executed successfully."
+                    else
+                        echo "Error: Test with batch=$batch, m=$m, n=$n, k=$k failed to execute properly."
+                        # Optionally, exit or break if you need to halt further execution
+                        # exit 1
+                    fi
+                done
+            done
+        done
+    done
+}
+set -x
+run_fp16_tests
+set +x
\ No newline at end of file
--- a/include/ck/filesystem.hpp
+++ b/include/ck/filesystem.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+#ifndef GUARD_CK_FILESYSTEM_HPP_
+#define GUARD_CK_FILESYSTEM_HPP_
+#include <string>
+#include <string_view>
+// clang-format off
+#if defined(CPPCHECK)
+  #define CK_HAS_FILESYSTEM 1
+  #define CK_HAS_FILESYSTEM_TS 1
+#elif defined(_WIN32)
+  #if _MSC_VER >= 1920
+    #define CK_HAS_FILESYSTEM 1
+    #define CK_HAS_FILESYSTEM_TS 0
+  #elif _MSC_VER >= 1900
+    #define CK_HAS_FILESYSTEM 0
+    #define CK_HAS_FILESYSTEM_TS 1
+  #else
+    #define CK_HAS_FILESYSTEM 0
+    #define CK_HAS_FILESYSTEM_TS 0
+  #endif
+#elif defined(__has_include)
+  #if __has_include(<filesystem>) && __cplusplus >= 201703L
+    #define CK_HAS_FILESYSTEM 1
+  #else
+    #define CK_HAS_FILESYSTEM 0
+  #endif
+  #if __has_include(<experimental/filesystem>) && __cplusplus >= 201103L
+    #define CK_HAS_FILESYSTEM_TS 1
+  #else
+    #define CK_HAS_FILESYSTEM_TS 0
+  #endif
+#else
+  #define CK_HAS_FILESYSTEM 0
+  #define CK_HAS_FILESYSTEM_TS 0
+#endif
+// clang-format on
+#if CK_HAS_FILESYSTEM
+#include <filesystem>
+#elif CK_HAS_FILESYSTEM_TS
+#include <experimental/filesystem>
+#else
+#error "No filesystem include available"
+#endif
+namespace CK {
+#if CK_HAS_FILESYSTEM
+namespace fs = ::std::filesystem;
+#elif CK_HAS_FILESYSTEM_TS
+namespace fs = ::std::experimental::filesystem;
+#endif
+} // namespace CK
+inline std::string operator+(const std::string_view s, const CK::fs::path& path)
+{
+    return path.string().insert(0, s);
+}
+inline std::string operator+(const CK::fs::path& path, const std::string_view s)
+{
+    return path.string().append(s);
+}
+#define FS_ENUM_PERMS_ALL fs::perms::all
+#if CK_HAS_FILESYSTEM_TS
+#ifdef __linux__
+#include <linux/limits.h>
+namespace CK {
+inline fs::path weakly_canonical(const fs::path& path)
+{
+    std::string result(PATH_MAX, '\0');
+    std::string p{path.is_relative() ? (fs::current_path() / path).string() : path.string()};
+    char* retval = realpath(p.c_str(), &result[0]);
+    return (retval == nullptr) ? path : fs::path{result};
+}
+} // namespace CK
+#else
+#error "Not implmeneted!"
+#endif
+#else
+namespace CK {
+inline fs::path weakly_canonical(const fs::path& path) { return fs::weakly_canonical(path); }
+} // namespace CK
+#endif
+namespace CK {
+#ifdef _WIN32
+constexpr std::string_view executable_postfix{".exe"};
+constexpr std::string_view library_prefix{""};
+constexpr std::string_view dynamic_library_postfix{".dll"};
+constexpr std::string_view static_library_postfix{".lib"};
+constexpr std::string_view object_file_postfix{".obj"};
+#else
+constexpr std::string_view executable_postfix{""};
+constexpr std::string_view library_prefix{"lib"};
+constexpr std::string_view dynamic_library_postfix{".so"};
+constexpr std::string_view static_library_postfix{".a"};
+constexpr std::string_view object_file_postfix{".o"};
+#endif
+inline fs::path make_executable_name(const fs::path& path)
+{
+    return path.parent_path() / (path.filename() + executable_postfix);
+}
+inline fs::path make_dynamic_library_name(const fs::path& path)
+{
+    return path.parent_path() / (library_prefix + path.filename() + dynamic_library_postfix);
+}
+inline fs::path make_object_file_name(const fs::path& path)
+{
+    return path.parent_path() / (path.filename() + object_file_postfix);
+}
+inline fs::path make_static_library_name(const fs::path& path)
+{
+    return path.parent_path() / (library_prefix + path.filename() + static_library_postfix);
+}
+struct FsPathHash
+{
+    std::size_t operator()(const fs::path& path) const { return fs::hash_value(path); }
+};
+} // namespace CK
+#endif // GUARD_CK_FILESYSTEM_HPP_
--- a/include/ck_tile/host/reference/reference_gemm.hpp
+++ b/include/ck_tile/host/reference/reference_gemm.hpp
@@ -5,7 +5,7 @@
 #include "ck_tile/core.hpp"
 #include "ck_tile/host/host_tensor.hpp"
-#include "ck_tile/ops/common.hpp"
+#include "ck_tile/ops/common/tensor_layout.hpp"
 #include <thread>
 namespace ck_tile {
@@ -57,4 +57,121 @@ CK_TILE_HOST void reference_gemm(const HostTensor<ADataType>& a_m_k,
    make_ParallelTensorFunctor(f, M)(std::thread::hardware_concurrency());
 }
+template <typename ADataType, typename BDataType, typename AccDataType, typename CDataType>
+__global__ void naive_gemm_kernel(ADataType* A,
+                                  BDataType* B,
+                                  CDataType* C,
+                                  ck_tile::index_t M,
+                                  ck_tile::index_t N,
+                                  ck_tile::index_t K,
+                                  ck_tile::index_t strideA,
+                                  ck_tile::index_t strideB,
+                                  ck_tile::index_t strideC)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int row = idx / N; // Compute row index
+    int col = idx % N; // Compute column index
+    if(row < M && col < N)
+    {
+        AccDataType acc = 0.0;
+        for(int k = 0; k < K; ++k)
+        {
+            acc += static_cast<AccDataType>(A[row * strideA + k]) *
+                   static_cast<AccDataType>(B[col * strideB + k]);
+        }
+        C[row * strideC + col] = acc; // Store as AccDataType
+    }
+}
+template <typename ADataType, typename BDataType, typename AccDataType, typename CDataType>
+void reference_gemm_gpu(DeviceMem& a_device,
+                        DeviceMem& b_device,
+                        DeviceMem& c_device,
+                        index_t M,
+                        index_t N,
+                        index_t K,
+                        index_t stride_a,
+                        index_t stride_b,
+                        index_t stride_c)
+{
+    ADataType* d_A;
+    BDataType* d_B;
+    CDataType* d_C;
+    hipError_t errA = hipMalloc(&d_A, M * K * sizeof(ADataType));
+    hipError_t errB = hipMalloc(&d_B, N * K * sizeof(BDataType));
+    hipError_t errC = hipMalloc(&d_C, M * N * sizeof(CDataType));
+    if(errA != hipSuccess)
+    {
+        std::cerr << "Error allocating device memory for A: " << hipGetErrorString(errA)
+                  << std::endl;
+        return; // Early exit on error
+    }
+    if(errB != hipSuccess)
+    {
+        std::cerr << "Error allocating device memory for B: " << hipGetErrorString(errB)
+                  << std::endl;
+        return; // Early exit on error
+    }
+    if(errC != hipSuccess)
+    {
+        std::cerr << "Error allocating device memory for C: " << hipGetErrorString(errC)
+                  << std::endl;
+        return; // Early exit on error
+    }
+    errA = hipMemcpy(
+        d_A, a_device.GetDeviceBuffer(), M * K * sizeof(ADataType), hipMemcpyHostToDevice);
+    if(errA != hipSuccess)
+    {
+        std::cerr << "Error copying A to device: " << hipGetErrorString(errA) << std::endl;
+    }
+    errB = hipMemcpy(
+        d_B, b_device.GetDeviceBuffer(), N * K * sizeof(BDataType), hipMemcpyHostToDevice);
+    if(errB != hipSuccess)
+    {
+        std::cerr << "Error copying B to device: " << hipGetErrorString(errB) << std::endl;
+    }
+    int totalElements      = M * N;
+    int numThreadsPerBlock = 256; // Common choice for threads per block
+    int numBlocks          = (totalElements + numThreadsPerBlock - 1) / numThreadsPerBlock;
+    naive_gemm_kernel<ADataType, BDataType, AccDataType, CDataType>
+        <<<numBlocks, numThreadsPerBlock>>>(d_A, d_B, d_C, M, N, K, stride_a, stride_b, stride_c);
+    errC = hipMemcpy(
+        c_device.GetDeviceBuffer(), d_C, M * N * sizeof(CDataType), hipMemcpyDeviceToHost);
+    if(errC != hipSuccess)
+    {
+        std::cerr << "Error copying C to device: " << hipGetErrorString(errC) << std::endl;
+    }
+    errA = hipFree(d_A);
+    if(errA != hipSuccess)
+    {
+        std::cerr << "Error free the A memory: " << hipGetErrorString(errA) << std::endl;
+    }
+    errB = hipFree(d_B);
+    if(errB != hipSuccess)
+    {
+        std::cerr << "Error free the B memory: " << hipGetErrorString(errB) << std::endl;
+    }
+    errC = hipFree(d_C);
+    if(errC != hipSuccess)
+    {
+        std::cerr << "Error free the C memory: " << hipGetErrorString(errC) << std::endl;
+    }
+    return;
+}
 } // namespace ck_tile
--- a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp
@@ -76,8 +76,7 @@ struct GemmKernel
    CK_TILE_DEVICE void operator()(GemmCommonKargs kargs) const
    {
-        const index_t i_m = TilePartitioner::iM;
+        const auto [i_m, i_n] = TilePartitioner{}();
-        const index_t i_n = TilePartitioner::iN;
        // options
        const ADataType* a_start = static_cast<const ADataType*>(kargs.a_ptr);
        const BDataType* b_start = static_cast<const BDataType*>(kargs.b_ptr);
@@ -104,7 +103,7 @@ struct GemmKernel
        }();
        auto b_tensor_view = [&]() {
-            if constexpr(std::is_same_v<LayoutB, tensor_layout::gemm::ColumnMajor>)
+            if constexpr(std::is_same_v<LayoutB, tensor_layout::gemm::RowMajor>)
            {
                return make_naive_tensor_view<address_space_enum::global>(
                    b_start,

--- a/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp
+++ b/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp
@@ -15,9 +15,6 @@ struct GemmTilePartitioner
    static constexpr ck_tile::index_t kN = BlockGemmShape::kN;
    static constexpr ck_tile::index_t kK = BlockGemmShape::kK;
-    const index_t iM = __builtin_amdgcn_readfirstlane(i_tile_m * kM);
-    const index_t iN = __builtin_amdgcn_readfirstlane(i_tile_n * kN);
    CK_TILE_HOST static constexpr auto
    GridSize(ck_tile::index_t M, ck_tile::index_t N, ck_tile::index_t batch_size)
    {
@@ -29,10 +26,9 @@ struct GemmTilePartitioner
    CK_TILE_DEVICE auto operator()()
    {
-        const index_t i_GridDimX = blockIdx.x;
+        const index_t iM = __builtin_amdgcn_readfirstlane(blockIdx.x * kM);
-        const index_t i_GridDimY = blockIdx.y;
+        const index_t iN = __builtin_amdgcn_readfirstlane(blockIdx.y * kN);
-        const index_t i_GridDimZ = blockIdx.z;
+        return ck_tile::make_tuple(iM, iN);
-        return ck_tile::make_tuple(i_GridDimX, i_GridDimY, i_GridDimZ);
    }
 };
 } // namespace ck_tile
--- a/library/include/ck/library/tensor_operation_instance/gpu/max_pool_bwd.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/max_pool_bwd.hpp
@@ -23,6 +23,10 @@ void add_device_maxpool_bwd_bf16_instances(
 void add_device_maxpool_bwd_f32_instances(
    std::vector<std::unique_ptr<DeviceMaxPoolBwd<F32, I32, F32>>>&);
 #endif
+#ifdef CK_ENABLE_FP8
+void add_device_maxpool_bwd_f8_instances(
+    std::vector<std::unique_ptr<DeviceMaxPoolBwd<F8, I32, F8>>>&);
+#endif
 #ifdef CK_ENABLE_INT8
 void add_device_maxpool_bwd_int8_instances(
    std::vector<std::unique_ptr<DeviceMaxPoolBwd<I8, I32, I8>>>&);
@@ -53,6 +57,11 @@ struct DeviceOperationInstanceFactory<
                          is_same_v<IndexDataType, I32>)
            add_device_maxpool_bwd_f32_instances(op_ptrs);
 #endif
+#ifdef CK_ENABLE_FP8
+        else if constexpr(is_same_v<DOutDataType, F8> && is_same_v<DInDataType, F8> &&
+                          is_same_v<IndexDataType, I32>)
+            add_device_maxpool_bwd_f8_instances(op_ptrs);
+#endif
 #ifdef CK_ENABLE_INT8
        else if constexpr(is_same_v<DOutDataType, I8> && is_same_v<DInDataType, I8> &&
                          is_same_v<IndexDataType, I32>)

--- a/library/include/ck/library/tensor_operation_instance/gpu/pool2d_fwd.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/pool2d_fwd.hpp
@@ -67,6 +67,36 @@ void add_device_pool2d_fwd_nhwc_index_f32_instances(
    std::vector<std::unique_ptr<
        DevicePoolFwd<InOutRank, WindowRank, F32, F32, I32, NHWC, NHWC, MaxOp, true>>>&);
 #endif
+#ifdef CK_ENABLE_INT8
+// I8
+void add_device_pool2d_fwd_nhwc_i8_instances(
+    std::vector<std::unique_ptr<
+        DevicePoolFwd<InOutRank, WindowRank, I8, I8, I32, NHWC, NHWC, MaxOp, false>>>&);
+void add_device_pool2d_fwd_nhwc_i8_instances(
+    std::vector<std::unique_ptr<
+        DevicePoolFwd<InOutRank, WindowRank, I8, I8, I32, NHWC, NHWC, AvgOp, false>>>&);
+// I8 - return index
+void add_device_pool2d_fwd_nhwc_index_i8_instances(
+    std::vector<std::unique_ptr<
+        DevicePoolFwd<InOutRank, WindowRank, I8, I8, I32, NHWC, NHWC, MaxOp, true>>>&);
+#endif
+#ifdef CK_ENABLE_FP8
+// F8
+void add_device_pool2d_fwd_nhwc_f8_instances(
+    std::vector<std::unique_ptr<
+        DevicePoolFwd<InOutRank, WindowRank, F8, F8, I32, NHWC, NHWC, MaxOp, false>>>&);
+void add_device_pool2d_fwd_nhwc_f8_instances(
+    std::vector<std::unique_ptr<
+        DevicePoolFwd<InOutRank, WindowRank, F8, F8, I32, NHWC, NHWC, AvgOp, false>>>&);
+// F8 - return index
+void add_device_pool2d_fwd_nhwc_index_f8_instances(
+    std::vector<std::unique_ptr<
+        DevicePoolFwd<InOutRank, WindowRank, F8, F8, I32, NHWC, NHWC, MaxOp, true>>>&);
+#endif
 template <typename InDataType,
          typename OutDataType,
          typename IndexDataType,
@@ -140,6 +170,34 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DevicePoolFw
                    add_device_pool2d_fwd_nhwc_f32_instances(op_ptrs);
                }
            }
+#endif
+#ifdef CK_ENABLE_INT8
+            else if constexpr(is_same_v<InDataType, I8> && is_same_v<OutDataType, I8> &&
+                              is_same_v<IndexDataType, I32>)
+            {
+                if constexpr(OutputIndex && ReduceOpId == MaxOp)
+                {
+                    add_device_pool2d_fwd_nhwc_index_i8_instances(op_ptrs);
+                }
+                else
+                {
+                    add_device_pool2d_fwd_nhwc_i8_instances(op_ptrs);
+                }
+            }
+#endif
+#ifdef CK_ENABLE_FP8
+            else if constexpr(is_same_v<InDataType, F8> && is_same_v<OutDataType, F8> &&
+                              is_same_v<IndexDataType, I32>)
+            {
+                if constexpr(OutputIndex && ReduceOpId == MaxOp)
+                {
+                    add_device_pool2d_fwd_nhwc_index_f8_instances(op_ptrs);
+                }
+                else
+                {
+                    add_device_pool2d_fwd_nhwc_f8_instances(op_ptrs);
+                }
+            }
 #endif
        }