Merge branch 'develop' into transpose_5d

e1a5137e · arai713 · GitHub · eb57178d · 718065eb · e1a5137e
Unverified Commit e1a5137e authored Sep 19, 2023 by arai713 Committed by GitHub Sep 19, 2023
20 changed files
--- a/profiler/src/profile_gemm_bilinear.cpp
+++ b/profiler/src/profile_gemm_bilinear.cpp
@@ -71,6 +71,9 @@ int profile_gemm_bilinear(int argc, char* argv[])
    using F16 = ck::half_t;
    using F32 = float;

+    using I8  = std::int8_t;
+    using I32 = std::int32_t;
+
    using Row = ck::tensor_layout::gemm::RowMajor;
    using Col = ck::tensor_layout::gemm::ColumnMajor;

@@ -141,6 +144,22 @@ int profile_gemm_bilinear(int argc, char* argv[])
    {
        return profile(F16{}, F16{}, F32{}, F16{}, F16{}, Col{}, Col{}, Row{}, Row{});
    }
+    else if(data_type == MatrixDataType::INT8_INT8_INT8_INT8 && layout == MatrixLayout::MK_KN_MN_MN)
+    {
+        return profile(I8{}, I8{}, I32{}, I8{}, I8{}, Row{}, Row{}, Row{}, Row{});
+    }
+    else if(data_type == MatrixDataType::INT8_INT8_INT8_INT8 && layout == MatrixLayout::MK_NK_MN_MN)
+    {
+        return profile(I8{}, I8{}, I32{}, I8{}, I8{}, Row{}, Col{}, Row{}, Row{});
+    }
+    else if(data_type == MatrixDataType::INT8_INT8_INT8_INT8 && layout == MatrixLayout::KM_KN_MN_MN)
+    {
+        return profile(I8{}, I8{}, I32{}, I8{}, I8{}, Col{}, Row{}, Row{}, Row{});
+    }
+    else if(data_type == MatrixDataType::INT8_INT8_INT8_INT8 && layout == MatrixLayout::KM_NK_MN_MN)
+    {
+        return profile(I8{}, I8{}, I32{}, I8{}, I8{}, Col{}, Col{}, Row{}, Row{});
+    }
    else
    {
        std::cout << "this data_type & layout is not implemented" << std::endl;

--- a/profiler/src/profile_gemm_multiply_add.cpp
+++ b/profiler/src/profile_gemm_multiply_add.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "profiler/profile_gemm_multiply_add_impl.hpp"
+#include "profiler_operation_registry.hpp"
+
+#define OP_NAME "gemm_multiply_add"
+#define OP_DESC "GEMM+MULTIPLY+ADD"
+
+// Profiler entry point for the GEMM + Multiply + Add operation.
+//
+// Requires exactly 16 arguments (see the usage text printed below); otherwise the usage is
+// printed and the process exits with status 1. Returns 0 when the selected profiling run
+// reports success, and 1 when it reports failure or when the requested data type / layout
+// combination is not handled here.
+int profile_gemm_multiply_add(int argc, char* argv[])
+{
+    enum struct MatrixLayout
+    {
+        MK_KN_MN_MN_MN, // 0
+        MK_NK_MN_MN_MN, // 1
+    };
+
+    enum struct MatrixDataType
+    {
+        F16_F16_F16_F16_F16, // 0
+        F16_F8_F32_F32_F16,  // 1
+    };
+
+    constexpr int expected_argc = 16;
+
+    if(argc != expected_argc)
+    {
+        // clang-format off
+        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
+        printf("arg2: data type (0: fp16; 1: fp16Afp8B)\n");
+        printf("arg3: matrix layout (0: E[m, n] = Multiply_Add((A[m, k] * B[k, n]) x D1[m, n] + D0[m, n]);\n");
+        printf("                     1: E[m, n] = Multiply_Add((A[m, k] * B[n, k]) x D1[m, n] + D0[m, n]);\n");
+        printf("arg4: verification (0: no; 1: yes)\n");
+        printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
+        printf("arg6: print tensor value (0: no; 1: yes)\n");
+        printf("arg7: time kernel (0=no, 1=yes)\n");
+        printf("arg8 to 15: M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE\n");
+        // clang-format on
+        exit(1);
+    }
+
+    // Selection and control arguments.
+    const auto data_type       = static_cast<MatrixDataType>(std::stoi(argv[2]));
+    const auto layout          = static_cast<MatrixLayout>(std::stoi(argv[3]));
+    const bool do_verification = std::stoi(argv[4]);
+    const int init_method      = std::stoi(argv[5]);
+    const bool do_log          = std::stoi(argv[6]);
+    const bool time_kernel     = std::stoi(argv[7]);
+
+    // Problem size.
+    const int M = std::stoi(argv[8]);
+    const int N = std::stoi(argv[9]);
+    const int K = std::stoi(argv[10]);
+
+    // Strides; a negative value requests the layout-derived default computed in `profile`.
+    const int StrideA  = std::stoi(argv[11]);
+    const int StrideB  = std::stoi(argv[12]);
+    const int StrideD0 = std::stoi(argv[13]);
+    const int StrideD1 = std::stoi(argv[14]);
+    const int StrideE  = std::stoi(argv[15]);
+
+    using F16 = ck::half_t;
+    using F32 = float;
+#if defined CK_ENABLE_FP8
+    using F8 = ck::f8_t;
+#endif
+
+    using Row = ck::tensor_layout::gemm::RowMajor;
+    using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+    // Runs one profiling configuration. The arguments are value-less tags: only their types
+    // are used, to pick the template parameters of the implementation.
+    auto profile = [&](auto a_tag,
+                       auto b_tag,
+                       auto acc_tag,
+                       auto d0_tag,
+                       auto d1_tag,
+                       auto e_tag,
+                       auto a_layout_tag,
+                       auto b_layout_tag,
+                       auto d0_layout_tag,
+                       auto d1_layout_tag,
+                       auto e_layout_tag) {
+        using ALayout  = decltype(a_layout_tag);
+        using BLayout  = decltype(b_layout_tag);
+        using D0Layout = decltype(d0_layout_tag);
+        using D1Layout = decltype(d1_layout_tag);
+        using ELayout  = decltype(e_layout_tag);
+
+        // Effective strides: the user-supplied value, or the default for the tensor's layout
+        // when the supplied value is negative.
+        const int stride_a  = (StrideA < 0) ? (ck::is_same_v<ALayout, Row> ? K : M) : StrideA;
+        const int stride_b  = (StrideB < 0) ? (ck::is_same_v<BLayout, Row> ? N : K) : StrideB;
+        const int stride_d0 = (StrideD0 < 0) ? (ck::is_same_v<D0Layout, Row> ? N : M) : StrideD0;
+        const int stride_d1 = (StrideD1 < 0) ? (ck::is_same_v<D1Layout, Row> ? N : M) : StrideD1;
+        const int stride_e  = (StrideE < 0) ? (ck::is_same_v<ELayout, Row> ? N : M) : StrideE;
+
+        const bool pass = ck::profiler::profile_gemm_multiply_add_impl<decltype(a_tag),
+                                                                       decltype(b_tag),
+                                                                       decltype(acc_tag),
+                                                                       decltype(d0_tag),
+                                                                       decltype(d1_tag),
+                                                                       decltype(e_tag),
+                                                                       ALayout,
+                                                                       BLayout,
+                                                                       D0Layout,
+                                                                       D1Layout,
+                                                                       ELayout>(do_verification,
+                                                                                init_method,
+                                                                                do_log,
+                                                                                time_kernel,
+                                                                                M,
+                                                                                N,
+                                                                                K,
+                                                                                stride_a,
+                                                                                stride_b,
+                                                                                stride_d0,
+                                                                                stride_d1,
+                                                                                stride_e);
+
+        return pass ? 0 : 1;
+    };
+
+    const bool b_is_kn = (layout == MatrixLayout::MK_KN_MN_MN_MN);
+    const bool b_is_nk = (layout == MatrixLayout::MK_NK_MN_MN_MN);
+
+    // Every supported combination returns directly; anything else reaches the message below.
+    const bool all_f16 = (data_type == MatrixDataType::F16_F16_F16_F16_F16);
+    if(all_f16 && b_is_kn)
+    {
+        return profile(F16{}, F16{}, F32{}, F16{}, F16{}, F16{}, Row{}, Row{}, Row{}, Row{}, Row{});
+    }
+    if(all_f16 && b_is_nk)
+    {
+        return profile(F16{}, F16{}, F32{}, F16{}, F16{}, F16{}, Row{}, Col{}, Row{}, Row{}, Row{});
+    }
+#if defined CK_ENABLE_FP8
+    const bool f16_with_f8 = (data_type == MatrixDataType::F16_F8_F32_F32_F16);
+    if(f16_with_f8 && b_is_kn)
+    {
+        return profile(F16{}, F8{}, F32{}, F32{}, F32{}, F16{}, Row{}, Row{}, Row{}, Row{}, Row{});
+    }
+    if(f16_with_f8 && b_is_nk)
+    {
+        return profile(F16{}, F8{}, F32{}, F32{}, F32{}, F16{}, Row{}, Col{}, Row{}, Row{}, Row{});
+    }
+#endif
+
+    std::cout << "this data_type & layout is not implemented" << std::endl;
+
+    return 1;
+}
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_multiply_add);
--- a/profiler/src/profile_gemm_splitk.cpp
+++ b/profiler/src/profile_gemm_splitk.cpp
@@ -23,6 +23,8 @@ enum struct GemmDataType
    F16_F16_F16,    // 1
    BF16_BF16_BF16, // 2
    INT8_INT8_INT8, // 3
+    F8_F16_F16,     // 4
+    F16_F8_F16,     // 5
 };

 #define OP_NAME "gemm_splitk"
@@ -33,7 +35,7 @@ int profile_gemm_splitk(int argc, char* argv[])
    if(argc != 15)
    {
        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
-        printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n");
+        printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: f16@f8)\n");
        printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
        printf("                     1: A[m, k] * B[n, k] = C[m, n];\n");
        printf("                     2: A[k, m] * B[k, n] = C[m, n];\n");
@@ -65,6 +67,9 @@ int profile_gemm_splitk(int argc, char* argv[])

    using F32 = float;
    using F16 = ck::half_t;
+#if defined CK_ENABLE_FP8
+    using F8 = ck::f8_t;
+#endif

    using Row = ck::tensor_layout::gemm::RowMajor;
    using Col = ck::tensor_layout::gemm::ColumnMajor;
@@ -143,6 +148,40 @@ int profile_gemm_splitk(int argc, char* argv[])
    {
        return profile(F16{}, F16{}, F32{}, F16{}, Col{}, Col{}, Row{});
    }
+#if defined CK_ENABLE_FP8
+    else if(data_type == GemmDataType::F8_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
+    {
+        return profile(F8{}, F16{}, F32{}, F16{}, Row{}, Row{}, Row{});
+    }
+    else if(data_type == GemmDataType::F8_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
+    {
+        return profile(F8{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{});
+    }
+    else if(data_type == GemmDataType::F8_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN)
+    {
+        return profile(F8{}, F16{}, F32{}, F16{}, Col{}, Row{}, Row{});
+    }
+    else if(data_type == GemmDataType::F8_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN)
+    {
+        return profile(F8{}, F16{}, F32{}, F16{}, Col{}, Col{}, Row{});
+    }
+    else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_KN_MN)
+    {
+        return profile(F16{}, F8{}, F32{}, F16{}, Row{}, Row{}, Row{});
+    }
+    else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_NK_MN)
+    {
+        return profile(F16{}, F8{}, F32{}, F16{}, Row{}, Col{}, Row{});
+    }
+    else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::KM_KN_MN)
+    {
+        return profile(F16{}, F8{}, F32{}, F16{}, Col{}, Row{}, Row{});
+    }
+    else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::KM_NK_MN)
+    {
+        return profile(F16{}, F8{}, F32{}, F16{}, Col{}, Col{}, Row{});
+    }
+#endif
    else
    {
        std::cout << "this data_type & layout is not implemented" << std::endl;

--- a/profiler/src/profile_grouped_gemm.cpp
+++ b/profiler/src/profile_grouped_gemm.cpp
@@ -88,7 +88,7 @@ int profile_grouped_gemm(int argc, char* argv[])
    const auto StrideBs = argToIntArray(argv[12]);
    const auto StrideCs = argToIntArray(argv[13]);
    const int kbatch    = argc == 15 ? std::stoi(argv[14]) : 1;
-#ifdef __fp16__
+#ifdef CK_ENABLE_FP16
    if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
    {
        ck::profiler::profile_grouped_gemm_impl<ck::half_t,

--- a/profiler/src/profile_image_to_column.cpp
+++ b/profiler/src/profile_image_to_column.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "profiler/profile_image_to_column_impl.hpp"
+#include "profiler_operation_registry.hpp"
+
+namespace {
+
+enum struct ConvLayout
+{
+    NHWC, // 0
+};
+
+enum struct DataType
+{
+    F32_F32,   // 0
+    F16_F16,   // 1
+    BF16_BF16, // 2
+    INT8_INT8, // 3
+};
+
+#define OP_NAME "image_to_column"
+#define OP_DESC "Image To Column"
+
+// Writes the usage text for this operation to std::cout: the fixed control arguments first,
+// then the help text returned by ck::utils::conv::get_conv_param_parser_helper_msg().
+static void print_helper_msg()
+{
+    // clang-format off
+    std::cout << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
+                 "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n"
+                 "                 1: Input fp16, Weight fp16, Output fp16\n"
+                 "                 2: Input bf16, Weight bf16, Output bf16\n"
+                 "                 3: Input int8, Weight int8, Output int8)\n"
+                 "arg3: tensor layout (0: Input[N, Hi, Wi, C], Output[N * Ho * Wo, Y * X * C])\n"
+                 "arg4: verification (0: no, 1: yes)\n"
+                 "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n"
+                 "arg6: print tensor value (0: no; 1: yes)\n"
+                 "arg7: time kernel (0: no, 1: yes)\n";
+    // clang-format on
+
+    std::cout << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
+}
+
+} // namespace
+
+// Profiler entry point for the image-to-column operation.
+//
+// Prints the usage text and returns 1 when the argument count does not fit the requested
+// number of spatial dimensions. Otherwise dispatches on layout, spatial dimension count and
+// data type; returns 0 when the profiling run reports success and 1 when it reports failure
+// or when the combination is not handled here.
+int profile_image_to_column(int argc, char* argv[])
+{
+    // 8 for control, 1 for num_dim_spatial: the minimum needed before anything can be parsed.
+    constexpr int min_argc = 9;
+    if(argc < min_argc)
+    {
+        print_helper_msg();
+        return 1;
+    }
+
+    const auto data_type       = static_cast<DataType>(std::stoi(argv[2]));
+    const auto layout          = static_cast<ConvLayout>(std::stoi(argv[3]));
+    const bool do_verification = std::stoi(argv[4]);
+    const int init_method      = std::stoi(argv[5]);
+    const bool do_log          = std::stoi(argv[6]);
+    const bool time_kernel     = std::stoi(argv[7]);
+    const int num_dim_spatial  = std::stoi(argv[8]);
+
+    // 8 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial
+    const int expected_argc = 8 + 1 + 4 + 6 * num_dim_spatial;
+    if(argc != expected_argc)
+    {
+        print_helper_msg();
+        return 1;
+    }
+
+    // The convolution parameters start at argv[9].
+    const auto params = ck::utils::conv::parse_conv_param(num_dim_spatial, 9, argv);
+
+    using F32  = float;
+    using F16  = ck::half_t;
+    using BF16 = ck::bhalf_t;
+    using INT8 = int8_t;
+
+    using namespace ck::tensor_layout::convolution;
+
+    constexpr auto I1 = ck::Number<1>{};
+    constexpr auto I2 = ck::Number<2>{};
+    constexpr auto I3 = ck::Number<3>{};
+
+    // Runs one configuration. The arguments are tags: the first carries the compile-time
+    // spatial dimension count, the others only contribute their types.
+    auto profile = [&](auto ndim_tag, auto in_layout_tag, auto in_type_tag, auto out_type_tag) {
+        constexpr ck::index_t NDimSpatial = ndim_tag.value;
+
+        const bool pass =
+            ck::profiler::profile_image_to_column_impl<NDimSpatial,
+                                                       decltype(in_layout_tag),
+                                                       decltype(in_type_tag),
+                                                       decltype(out_type_tag)>(
+                do_verification, init_method, do_log, time_kernel, params);
+
+        return pass ? 0 : 1;
+    };
+
+    // NHWC
+    if(layout == ConvLayout::NHWC)
+    {
+        // Each supported case returns directly; unsupported ones break out of both switches
+        // and reach the "not implemented" message at the end.
+        switch(num_dim_spatial)
+        {
+        case 1:
+            switch(data_type)
+            {
+            case DataType::F32_F32: return profile(I1, GNWC{}, F32{}, F32{});
+            case DataType::F16_F16: return profile(I1, GNWC{}, F16{}, F16{});
+            case DataType::BF16_BF16: return profile(I1, GNWC{}, BF16{}, BF16{});
+            case DataType::INT8_INT8: return profile(I1, GNWC{}, INT8{}, INT8{});
+            default: break;
+            }
+            break;
+        case 2:
+            switch(data_type)
+            {
+            case DataType::F32_F32: return profile(I2, GNHWC{}, F32{}, F32{});
+            case DataType::F16_F16: return profile(I2, GNHWC{}, F16{}, F16{});
+            case DataType::BF16_BF16: return profile(I2, GNHWC{}, BF16{}, BF16{});
+            case DataType::INT8_INT8: return profile(I2, GNHWC{}, INT8{}, INT8{});
+            default: break;
+            }
+            break;
+        case 3:
+            switch(data_type)
+            {
+            case DataType::F32_F32: return profile(I3, GNDHWC{}, F32{}, F32{});
+            case DataType::F16_F16: return profile(I3, GNDHWC{}, F16{}, F16{});
+            case DataType::BF16_BF16: return profile(I3, GNDHWC{}, BF16{}, BF16{});
+            case DataType::INT8_INT8: return profile(I3, GNDHWC{}, INT8{}, INT8{});
+            default: break;
+            }
+            break;
+        default: break;
+        }
+    }
+
+    std::cout << "this data_type & layout is not implemented" << std::endl;
+
+    return 1;
+}
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_image_to_column);
--- a/profiler/src/profile_max_pool3d_bwd.cpp
+++ b/profiler/src/profile_max_pool3d_bwd.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <vector>
+#include <unordered_map>
+
+#include "profiler/data_type_enum.hpp"
+#include "profiler/profile_max_pool3d_bwd_impl.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "profiler_operation_registry.hpp"
+
+using ck::index_t;
+
+// Collects the integer values that follow each recognised "--<name>" option on the command
+// line. Results are stored in `long_opts`, keyed by the option name without the dashes.
+struct maxPoolbwdArgParser
+{
+    std::unordered_map<std::string, std::vector<int>> long_opts = {{"length", {}},
+                                                                   {"wsize", {}},
+                                                                   {"wstride", {}},
+                                                                   {"wdilation", {}},
+                                                                   {"pad1", {}},
+                                                                   {"pad2", {}}};
+
+    // Returns false unless argv[i] is exactly "--<key>". On a match, every following
+    // argument up to (not including) the next one that starts with '-' is converted with
+    // std::stoi and appended to long_opts[key]; then true is returned.
+    bool parse_opt(int argc, char* argv[], const std::string& key, int i)
+    {
+        const std::string flag = std::string("--") + key;
+        if(!(flag == argv[i]))
+        {
+            return false;
+        }
+
+        for(int j = i + 1; j < argc && argv[j][0] != '-'; ++j)
+        {
+            long_opts[key].push_back(std::stoi(argv[j]));
+        }
+        return true;
+    }
+
+    // For every known option, scans argv[1..argc) and parses the first occurrence only.
+    void operator()(int argc, char* argv[])
+    {
+        for(auto& entry : long_opts)
+        {
+            const std::string& name = entry.first;
+
+            int i = 1;
+            while(i < argc && !parse_opt(argc, argv, name, i))
+            {
+                ++i;
+            }
+        }
+    }
+};
+
+// Writes the usage text for the max_pool3d_bwd profiler operation to std::cout.
+void print_help_max_pool3d_bwd()
+{
+    // Positional control arguments.
+    std::cout << "arg1: data type (0: fp16; 1: fp32; 5: bf16)\n"
+                 "arg2: verification (0: no; 1: yes)\n"
+                 "arg3: initialization (0: no init; 1: integer value; 2: decimal value)\n"
+                 "arg4: print tensor value (0: no; 1: yes)\n"
+                 "arg5: time kernel (0=no, 1=yes)\n";
+
+    // Long options describing the problem.
+    std::cout << "--length: input tensor length for NCDHW(e.g, --length 2 32 30 30 30) \n"
+                 "--wsize: window size for ZYX (e.g, --wsize 2 2 2) \n"
+                 "--wstride: window stride for DHW (e.g, --wstride 2 2 2) \n"
+                 "--wdilation: window dilation for DHW (e.g, --wdilation 1 1 1) \n"
+                 "--pad1: left side of padding in DHW (e.g, --pad1 1 1 1) \n"
+                 "--pad2: right side of padding in DHW (e.g, --pad2 1 1 1) \n";
+
+    // Complete example invocation.
+    std::cout << "eg: ckProfiler max_pool3d_bwd 0 1 2 0 1 --length 2 32 30 30 30 --wsize 2 2 2 "
+                 "--wstride 2 2 2 --wdilation 1 1 1 --pad1 1 1 1 --pad2 1 1 1"
+              << std::endl;
+}
+
+// Profiler entry point for the 3-D max-pooling backward operation.
+//
+// Accepted invocations:
+//   * argc == 2  : run with the built-in default problem defined below.
+//   * argc == 33 : five control arguments (argv[2..6]) followed by the six long options
+//                  described in print_help_max_pool3d_bwd().
+// Any other argument count prints the help text and returns 0.
+//
+// Returns 1 (after printing the help text) when the parsed long options do not have the
+// expected number of values, and 0 after a profiling run. Throws std::runtime_error when the
+// requested data type is not handled (or its support is compiled out).
+// NOTE(review): any result of profile_max_pool3d_bwd_impl is discarded here, so a failed
+// verification is not reflected in the return value - confirm whether that is intended.
+int profile_max_pool3d_bwd(int argc, char* argv[])
+{
+    // Defaults used when no options are supplied.
+    ck::DataTypeEnum data_type = ck::DataTypeEnum::Half;
+    bool do_verification       = true;
+    int init_method            = 0;
+    bool do_log                = false;
+    bool time_kernel           = true;
+
+    std::vector<index_t> in_length = {2, 32, 30, 30, 30};
+    std::vector<index_t> wsize     = {2, 2, 2};
+    std::vector<index_t> wstride   = {2, 2, 2};
+    std::vector<index_t> wdilation = {1, 1, 1};
+    std::vector<index_t> pad1      = {1, 1, 1};
+    std::vector<index_t> pad2      = {1, 1, 1};
+
+    if(argc != 2 && argc != 33)
+    {
+        print_help_max_pool3d_bwd();
+        return 0;
+    }
+    else if(argc == 33)
+    {
+        data_type       = static_cast<ck::DataTypeEnum>(std::stoi(argv[2]));
+        do_verification = std::stoi(argv[3]);
+        init_method     = std::stoi(argv[4]);
+        do_log          = std::stoi(argv[5]);
+        time_kernel     = std::stoi(argv[6]);
+
+        // parse the long options
+        maxPoolbwdArgParser arg_parser;
+        arg_parser(argc, argv);
+        in_length = arg_parser.long_opts["length"];
+        wsize     = arg_parser.long_opts["wsize"];
+        wstride   = arg_parser.long_opts["wstride"];
+        wdilation = arg_parser.long_opts["wdilation"];
+        pad1      = arg_parser.long_opts["pad1"];
+        pad2      = arg_parser.long_opts["pad2"];
+    }
+
+    // The total argument count alone does not guarantee that every option was present and
+    // well-formed: a missing or misspelled option leaves its vector empty, and values can be
+    // distributed unevenly between options. Reject such input instead of handing vectors of
+    // the wrong length to the kernel profiler. The expected sizes match the help text:
+    // --length is N, C, D, H, W (5 values); every other option is D, H, W (3 values).
+    const bool shapes_ok = in_length.size() == 5 && wsize.size() == 3 && wstride.size() == 3 &&
+                           wdilation.size() == 3 && pad1.size() == 3 && pad2.size() == 3;
+    if(!shapes_ok)
+    {
+        std::cout << "invalid arguments: --length needs 5 values; --wsize, --wstride, "
+                     "--wdilation, --pad1 and --pad2 need 3 values each"
+                  << std::endl;
+        print_help_max_pool3d_bwd();
+        return 1;
+    }
+
+#ifdef CK_ENABLE_FP16
+    using F16 = ck::half_t;
+#endif
+#ifdef CK_ENABLE_BF16
+    using BF16 = ck::bhalf_t;
+#endif
+#ifdef CK_ENABLE_FP32
+    using F32 = float;
+#endif
+    using I32 = int32_t;
+
+    // The empty `if(false)` lets every data-type branch below be an `else if`, so any subset
+    // of them can be compiled out by the CK_ENABLE_* macros without breaking the chain.
+    if(false)
+        ;
+#ifdef CK_ENABLE_FP16
+    else if(data_type == ck::DataTypeEnum::Half)
+    {
+        ck::profiler::profile_max_pool3d_bwd_impl<F16, F16, I32, F16, F16, false>(do_verification,
+                                                                                  init_method,
+                                                                                  do_log,
+                                                                                  time_kernel,
+                                                                                  in_length,
+                                                                                  wsize,
+                                                                                  wstride,
+                                                                                  wdilation,
+                                                                                  pad1,
+                                                                                  pad2);
+    }
+#endif
+#ifdef CK_ENABLE_BF16
+    else if(data_type == ck::DataTypeEnum::BFloat16)
+    {
+        ck::profiler::profile_max_pool3d_bwd_impl<BF16, BF16, I32, BF16, BF16, false>(
+            do_verification,
+            init_method,
+            do_log,
+            time_kernel,
+            in_length,
+            wsize,
+            wstride,
+            wdilation,
+            pad1,
+            pad2);
+    }
+#endif
+#ifdef CK_ENABLE_FP32
+    else if(data_type == ck::DataTypeEnum::Float)
+    {
+        ck::profiler::profile_max_pool3d_bwd_impl<F32, F32, I32, F32, F32, false>(do_verification,
+                                                                                  init_method,
+                                                                                  do_log,
+                                                                                  time_kernel,
+                                                                                  in_length,
+                                                                                  wsize,
+                                                                                  wstride,
+                                                                                  wdilation,
+                                                                                  pad1,
+                                                                                  pad2);
+    }
+#endif
+    else
+    {
+        throw std::runtime_error("not implemented yet");
+    }
+
+    return 0;
+}
+
+REGISTER_PROFILER_OPERATION("max_pool3d_bwd", "max_pool3d bwd", profile_max_pool3d_bwd);
--- a/profiler/src/profile_max_pool3d_fwd.cpp
+++ b/profiler/src/profile_max_pool3d_fwd.cpp
@@ -51,7 +51,7 @@ struct maxPoolFwdArgParser

 void print_help_max_pool3d_fwd()
 {
-    std::cout << "arg1: data type (0: fp16; 1: fp32)\n"
+    std::cout << "arg1: data type (0: fp16; 1: fp32; 5: bf16)\n"
              << "arg2: verification (0: no; 1: yes)\n"
              << "arg3: initialization (0: no init; 1: integer value; 2: decimal value)\n"
              << "arg4: print tensor value (0: no; 1: yes)\n"
@@ -109,8 +109,15 @@ int profile_max_pool3d_fwd(int argc, char* argv[])
        pad2      = arg_parser.long_opts["pad2"];
    }

-    using F16   = ck::half_t;
-    using F32   = float;
+#ifdef CK_ENABLE_FP16
+    using F16 = ck::half_t;
+#endif
+#ifdef CK_ENABLE_BF16
+    using BF16 = ck::bhalf_t;
+#endif
+#ifdef CK_ENABLE_FP32
+    using F32 = float;
+#endif
    using I32   = int32_t;
    using NDHWC = ck::tensor_layout::convolution::NDHWC;

@@ -120,7 +127,10 @@ int profile_max_pool3d_fwd(int argc, char* argv[])
    constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
 #endif

-    if(data_type == ck::DataTypeEnum::Half)
+    if(false)
+        ;
+#ifdef CK_ENABLE_FP16
+    else if(data_type == ck::DataTypeEnum::Half)
    {
        if(return_index)
            ck::profiler::
@@ -149,6 +159,51 @@ int profile_max_pool3d_fwd(int argc, char* argv[])
                    pad1,
                    pad2);
    }
+#endif
+#ifdef CK_ENABLE_BF16
+    else if(data_type == ck::DataTypeEnum::BFloat16)
+    {
+        if(return_index)
+            ck::profiler::profile_pool3d_fwd_impl<BF16,
+                                                  BF16,
+                                                  BF16,
+                                                  I32,
+                                                  NDHWC,
+                                                  NDHWC,
+                                                  ReduceOpId,
+                                                  false,
+                                                  true>(do_verification,
+                                                        init_method,
+                                                        do_log,
+                                                        time_kernel,
+                                                        in_length,
+                                                        wsize,
+                                                        wstride,
+                                                        wdilation,
+                                                        pad1,
+                                                        pad2);
+        else
+            ck::profiler::profile_pool3d_fwd_impl<BF16,
+                                                  BF16,
+                                                  BF16,
+                                                  I32,
+                                                  NDHWC,
+                                                  NDHWC,
+                                                  ReduceOpId,
+                                                  false,
+                                                  false>(do_verification,
+                                                         init_method,
+                                                         do_log,
+                                                         time_kernel,
+                                                         in_length,
+                                                         wsize,
+                                                         wstride,
+                                                         wdilation,
+                                                         pad1,
+                                                         pad2);
+    }
+#endif
+#ifdef CK_ENABLE_FP32
    else if(data_type == ck::DataTypeEnum::Float)
    {
        if(return_index)
@@ -178,6 +233,7 @@ int profile_max_pool3d_fwd(int argc, char* argv[])
                    pad1,
                    pad2);
    }
+#endif
    else
    {
        throw std::runtime_error("not implemented yet");

--- a/script/clang-format-overwrite.sh
+++ b/script/clang-format-overwrite.sh
-#find . -name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.inc' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}'
-git status --porcelain | awk '$1 != "D" && (match($2, "\\.cpp|hpp|inc")) {print $2}' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}'
+#find . -name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.inc' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-12 -i -style=file {}'
+git status --porcelain | awk '$1 != "D" && (match($2, "\\.cpp|hpp|inc")) {print $2}' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-12 -i -style=file {}'
--- a/script/cmake-ck-dev.sh
+++ b/script/cmake-ck-dev.sh
@@ -16,4 +16,3 @@ cmake
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                                 \
 -D USE_BITINT_EXTENSION_INT4=OFF                                                                  \
 ${MY_PROJECT_SOURCE}
-
--- a/script/install_precommit.sh
+++ b/script/install_precommit.sh
@@ -11,7 +11,7 @@ run_and_check() {
 }

 echo "I: Installing tools required for pre-commit checks..."
-run_and_check apt install clang-format-10
+run_and_check apt install clang-format-12

 echo "I: Installing pre-commit itself..."
 run_and_check pip3 install pre-commit

--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -57,9 +57,10 @@ add_subdirectory(data_type)
 add_subdirectory(elementwise_normalization)
 add_subdirectory(batchnorm)
 add_subdirectory(contraction)
-add_subdirectory(pool_fwd)
+add_subdirectory(pool)
 add_subdirectory(batched_gemm_multi_d)
 add_subdirectory(grouped_convnd_bwd_data)
+add_subdirectory(image_to_column)
 if(GPU_TARGETS MATCHES "gfx11")
    add_subdirectory(wmma_op)
 endif()
--- a/test/batched_gemm_multi_d/test_batched_gemm_multi_d.cpp
+++ b/test/batched_gemm_multi_d/test_batched_gemm_multi_d.cpp
@@ -71,6 +71,6 @@ TYPED_TEST_SUITE(TestBatchedGemmMultiD, KernelTypes);
 #ifdef __fp16
 TYPED_TEST(TestBatchedGemmMultiD, f16) { this->template Run<F16>(); }
 #endif
-#ifdef __int8__
+#ifdef CK_ENABLE_INT8
 TYPED_TEST(TestBatchedGemmMultiD, int8) { this->template Run<int8_t>(); }
 #endif
--- a/test/contraction/test_contraction_interface.cpp
+++ b/test/contraction/test_contraction_interface.cpp
@@ -38,7 +38,7 @@ class ContractionInstanceWrapper
        //#####################################|        |        |        |  Type|  Type|    Type| DataType|           Type|  Type|  Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|               SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|               SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl|                 ScalarPerVector|
        //#####################################|        |        |        |      |      |        |         |               |      |    Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |                           |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |                           |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|                   _NWaveNPerXdl|
        //#####################################|        |        |        |      |      |        |         |               |      |             |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |                           |               |               |          |                |               |               |                           |               |               |          |            |            |                             |                                |
-        DeviceContractionMultipleD_Xdl_CShuffle<  NumDim,  NumDim,  NumDim,   F32,   F32,     F32,      F32, ck::Tuple<F32>,   F32,         Pass,        Pass,     Bilinear,       GemmSpec,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>, ABlockTransferSrcVectorDim,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>, BBlockTransferSrcVectorDim,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>, CDEBlockTransferScalarPerVector>;
+        DeviceContractionMultipleD_Xdl_CShuffle<  NumDim,  NumDim,  NumDim,   F32,   F32,     F32,      F32, ck::Tuple<F32>,   F32,         Pass,        Pass,     Bilinear,       GemmSpec,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>, ABlockTransferSrcVectorDim,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>, BBlockTransferSrcVectorDim,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>, CDEBlockTransferScalarPerVector>;
    // clang-format on

    bool isSupported(std::vector<ck::index_t>& ADims,

--- a/test/data_type/CMakeLists.txt
+++ b/test/data_type/CMakeLists.txt
@@ -3,5 +3,12 @@ if (USE_BITINT_EXTENSION_INT4)
  target_link_libraries(test_int4 PRIVATE utility)
 endif()

-add_gtest_executable(test_fp8 fp8.cpp)
-target_link_libraries(test_fp8 PRIVATE utility)
+if(DTYPES MATCHES "fp8" OR NOT DEFINED DTYPES)
+  add_gtest_executable(test_f8 f8.cpp)
+  target_link_libraries(test_f8 PRIVATE utility)
+endif()
+
+if(DTYPES MATCHES "bf8" OR NOT DEFINED DTYPES)
+  add_gtest_executable(test_bf8 bf8.cpp)
+  target_link_libraries(test_bf8 PRIVATE utility)
+endif()
--- a/test/data_type/bf8.cpp
+++ b/test/data_type/bf8.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "gtest/gtest.h"
+#include "ck/utility/data_type.hpp"
+#include "ck/utility/type_convert.hpp"
+
+using ck::bf8_t;
+using ck::f8_convert_sr;
+using ck::half_t;
+using ck::type_convert;
+
+TEST(BF8, NumericLimits) // checks the Min/Max/Lowest/QuietNaN limits reported for bf8_t
+{
+    // each limit is compared with type_convert<bf8_t>(c); c given for negative zero nan mode
+    EXPECT_EQ(ck::NumericLimits<bf8_t>::Min(), type_convert<bf8_t>(0x04));
+    EXPECT_EQ(ck::NumericLimits<bf8_t>::Max(), type_convert<bf8_t>(0x7F));
+    EXPECT_EQ(ck::NumericLimits<bf8_t>::Lowest(), type_convert<bf8_t>(0xFF));
+    EXPECT_EQ(ck::NumericLimits<bf8_t>::QuietNaN(), type_convert<bf8_t>(0x80));
+}
+
+TEST(BF8, ConvertFP32Nearest) // float -> bf8_t -> float round trips through type_convert
+{
+    // absolute tolerance passed to every ASSERT_NEAR in this test
+    float abs_tol = 1e-6;
+    // convert 0 float to bf8 and back, check if holds
+    ASSERT_NEAR(0.0f, type_convert<float>(type_convert<bf8_t>(0.0f)), abs_tol);
+    // round-trip the minimal float; NOTE(review): it is far below abs_tol, so 0 also passes
+    ASSERT_NEAR(std::numeric_limits<float>::min(),
+                type_convert<float>(type_convert<bf8_t>(std::numeric_limits<float>::min())),
+                abs_tol);
+    // round-trip 57344.0f (maximal bf8_t value per the original note), check it is preserved
+    ASSERT_NEAR(57344.0f, type_convert<float>(type_convert<bf8_t>(57344.0f)), abs_tol);
+    // convert maximal float to bf8 and back, check if clipped to 57344.0
+    ASSERT_NEAR(57344.0f,
+                type_convert<float>(type_convert<bf8_t>(std::numeric_limits<float>::max())),
+                abs_tol);
+    // inf float -> bf8_t must match type_convert<bf8_t>(0x80) (the qNaN constant)
+    ASSERT_NEAR(type_convert<bf8_t>(0x80),
+                type_convert<bf8_t>(std::numeric_limits<float>::infinity()),
+                abs_tol);
+    // positive norm float value (~1.25 * 2^-14) to bf8 and back, check if holds
+    float pos_float = 0.0000762939f;
+    ASSERT_NEAR(pos_float, type_convert<float>(type_convert<bf8_t>(pos_float)), abs_tol);
+    // negative norm float value (~ -2^-14) to bf8 and back, check if holds
+    float neg_float = -0.0000610351f;
+    ASSERT_NEAR(neg_float, type_convert<float>(type_convert<bf8_t>(neg_float)), abs_tol);
+    // positive subnorm float value (~2^-15) to bf8 and back, check if holds
+    pos_float = 0.0000305175f;
+    ASSERT_NEAR(pos_float, type_convert<float>(type_convert<bf8_t>(pos_float)), abs_tol);
+    // negative subnorm float value (~ -2^-16) to bf8 and back, check if holds
+    neg_float = -0.0000152587f;
+    ASSERT_NEAR(neg_float, type_convert<float>(type_convert<bf8_t>(neg_float)), abs_tol);
+}
+
+TEST(BF8, ConvertFP32Stochastic) // float -> bf8_t via f8_convert_sr, back via type_convert
+{
+    // absolute tolerance passed to every ASSERT_NEAR in this test
+    float abs_tol = 1e-6;
+    // convert 0 float to bf8 and back, check if holds
+    ASSERT_NEAR(0.0f, type_convert<float>(f8_convert_sr<bf8_t>(0.0f)), abs_tol);
+    // round-trip the minimal float; NOTE(review): it is far below abs_tol, so 0 also passes
+    ASSERT_NEAR(std::numeric_limits<float>::min(),
+                type_convert<float>(f8_convert_sr<bf8_t>(std::numeric_limits<float>::min())),
+                abs_tol);
+    // round-trip 57344.0f (maximal bf8_t value per the original note), check it is preserved
+    ASSERT_NEAR(57344.0f, type_convert<float>(f8_convert_sr<bf8_t>(57344.0f)), abs_tol);
+    // convert maximal float to bf8 and back, check if clipped to 57344.0
+    ASSERT_NEAR(57344.0f,
+                type_convert<float>(f8_convert_sr<bf8_t>(std::numeric_limits<float>::max())),
+                abs_tol);
+    // inf float -> bf8_t must match type_convert<bf8_t>(0x80) (the qNaN constant)
+    ASSERT_NEAR(type_convert<bf8_t>(0x80),
+                f8_convert_sr<bf8_t>(std::numeric_limits<float>::infinity()),
+                abs_tol);
+    // positive norm float value (~1.25 * 2^-14) to bf8 and back, check if holds
+    float pos_float = 0.0000762939f;
+    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_sr<bf8_t>(pos_float)), abs_tol);
+    // negative norm float value (~ -2^-14) to bf8 and back, check if holds
+    float neg_float = -0.0000610351f;
+    ASSERT_NEAR(neg_float, type_convert<float>(f8_convert_sr<bf8_t>(neg_float)), abs_tol);
+    // positive subnorm float value (~2^-15) to bf8 and back, check if holds
+    pos_float = 0.0000305175f;
+    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_sr<bf8_t>(pos_float)), abs_tol);
+    // negative subnorm float value (~ -2^-16) to bf8 and back, check if holds
+    neg_float = -0.0000152587f;
+    ASSERT_NEAR(neg_float, type_convert<float>(f8_convert_sr<bf8_t>(neg_float)), abs_tol);
+}
+
+TEST(BF8, ConvertFP16Nearest) // half_t -> bf8_t -> half_t round trips through type_convert
+{
+    // absolute tolerance; NOTE(review): larger than the four small test values at the end
+    float abs_tol = 1e-3;
+    // convert 0 fp16 to bf8 and back, check if holds
+    ASSERT_NEAR(half_t{0.0}, type_convert<half_t>(type_convert<bf8_t>(half_t{0.0})), abs_tol);
+    // convert minimal fp16 (NumericLimits<half_t>::Min()) to bf8 and back, check if holds
+    ASSERT_NEAR(ck::NumericLimits<half_t>::Min(),
+                type_convert<half_t>(type_convert<bf8_t>(ck::NumericLimits<half_t>::Min())),
+                abs_tol);
+    // round-trip half_t{57344.0} (maximal bf8_t value per the original note), check preserved
+    ASSERT_NEAR(
+        half_t{57344.0}, type_convert<half_t>(type_convert<bf8_t>(half_t{57344.0})), abs_tol);
+    // convert maximal fp16 to bf8 and back, check if clipped to 57344.0
+    ASSERT_NEAR(half_t{57344.0},
+                type_convert<half_t>(type_convert<bf8_t>(ck::NumericLimits<half_t>::Max())),
+                abs_tol);
+    // QuietNaN fp16 -> bf8_t must match type_convert<bf8_t>(0x80) (the QuietNaN constant)
+    ASSERT_NEAR(type_convert<bf8_t>(0x80),
+                type_convert<bf8_t>(ck::NumericLimits<half_t>::QuietNaN()),
+                abs_tol);
+    // positive norm fp16 value (~1.25 * 2^-14) to bf8 and back, check if holds
+    half_t pos_half = half_t{0.0000762939};
+    ASSERT_NEAR(pos_half, type_convert<half_t>(type_convert<bf8_t>(pos_half)), abs_tol);
+    // negative norm fp16 value (~ -2^-14) to bf8 and back, check if holds
+    half_t neg_half = half_t{-0.0000610351};
+    ASSERT_NEAR(neg_half, type_convert<half_t>(type_convert<bf8_t>(neg_half)), abs_tol);
+    // positive subnorm fp16 value (~2^-15) to bf8 and back, check if holds
+    pos_half = half_t{0.0000305175};
+    ASSERT_NEAR(pos_half, type_convert<half_t>(type_convert<bf8_t>(pos_half)), abs_tol);
+    // negative subnorm fp16 value (~ -2^-16) to bf8 and back, check if holds
+    neg_half = half_t{-0.0000152587};
+    ASSERT_NEAR(neg_half, type_convert<half_t>(type_convert<bf8_t>(neg_half)), abs_tol);
+}
+
+TEST(BF8, ConvertFP16Stochastic) // half_t -> bf8_t via f8_convert_sr, back via type_convert
+{
+    // absolute tolerance; NOTE(review): larger than the four small test values at the end
+    float abs_tol = 1e-3;
+    // convert 0 fp16 to bf8 and back, check if holds
+    ASSERT_NEAR(half_t{0.0}, type_convert<half_t>(f8_convert_sr<bf8_t>(half_t{0.0})), abs_tol);
+    // convert minimal fp16 (NumericLimits<half_t>::Min()) to bf8 and back, check if holds
+    ASSERT_NEAR(ck::NumericLimits<half_t>::Min(),
+                type_convert<half_t>(f8_convert_sr<bf8_t>(ck::NumericLimits<half_t>::Min())),
+                abs_tol);
+    // round-trip half_t{57344.0} (maximal bf8_t value per the original note), check preserved
+    ASSERT_NEAR(
+        half_t{57344.0}, type_convert<half_t>(f8_convert_sr<bf8_t>(half_t{57344.0})), abs_tol);
+    // convert maximal fp16 to bf8 and back, check if clipped to 57344.0
+    ASSERT_NEAR(half_t{57344.0},
+                type_convert<half_t>(f8_convert_sr<bf8_t>(ck::NumericLimits<half_t>::Max())),
+                abs_tol);
+    // QuietNaN fp16 -> bf8_t must match type_convert<bf8_t>(0x80) (the QuietNaN constant)
+    ASSERT_NEAR(type_convert<bf8_t>(0x80),
+                f8_convert_sr<bf8_t>(ck::NumericLimits<half_t>::QuietNaN()),
+                abs_tol);
+    // positive norm fp16 value (~1.25 * 2^-14) to bf8 and back, check if holds
+    half_t pos_half = half_t{0.0000762939};
+    ASSERT_NEAR(pos_half, type_convert<half_t>(f8_convert_sr<bf8_t>(pos_half)), abs_tol);
+    // negative norm fp16 value (~ -2^-14) to bf8 and back, check if holds
+    half_t neg_half = half_t{-0.0000610351};
+    ASSERT_NEAR(neg_half, type_convert<half_t>(f8_convert_sr<bf8_t>(neg_half)), abs_tol);
+    // positive subnorm fp16 value (~2^-15) to bf8 and back, check if holds
+    pos_half = half_t{0.0000305175};
+    ASSERT_NEAR(pos_half, type_convert<half_t>(f8_convert_sr<bf8_t>(pos_half)), abs_tol);
+    // negative subnorm fp16 value (~ -2^-16) to bf8 and back, check if holds
+    neg_half = half_t{-0.0000152587};
+    ASSERT_NEAR(neg_half, type_convert<half_t>(f8_convert_sr<bf8_t>(neg_half)), abs_tol);
+}
--- a/test/data_type/fp8.cpp
+++ b/test/data_type/fp8.cpp
@@ -12,10 +12,11 @@ using ck::type_convert;

 TEST(FP8, NumericLimits)
 {
-    EXPECT_EQ(ck::NumericLimits<f8_t>::Min(), 0x08);
-    EXPECT_EQ(ck::NumericLimits<f8_t>::Max(), 0x77);
-    EXPECT_EQ(ck::NumericLimits<f8_t>::Lowest(), 0xF7);
-    EXPECT_EQ(ck::NumericLimits<f8_t>::QuietNaN(), 0x80);
+    // each limit is compared with type_convert<f8_t>(c); c given for negative zero nan mode
+    EXPECT_EQ(ck::NumericLimits<f8_t>::Min(), type_convert<f8_t>(0x08));
+    EXPECT_EQ(ck::NumericLimits<f8_t>::Max(), type_convert<f8_t>(0x7F));
+    EXPECT_EQ(ck::NumericLimits<f8_t>::Lowest(), type_convert<f8_t>(0xFF));
+    EXPECT_EQ(ck::NumericLimits<f8_t>::QuietNaN(), type_convert<f8_t>(0x80));
 }

 TEST(FP8, ConvertFP32Nearest)
@@ -35,12 +36,20 @@ TEST(FP8, ConvertFP32Nearest)
                type_convert<float>(type_convert<f8_t>(std::numeric_limits<float>::max())),
                abs_tol);
    // convert inf float to f8_t and check if it is qNan
-    ASSERT_NEAR(0x80, type_convert<f8_t>(std::numeric_limits<float>::infinity()), abs_tol);
-    // positive float value to fp8 and back, check if holds
-    float pos_float = 0.0078125f;
+    ASSERT_NEAR(type_convert<f8_t>(0x80),
+                type_convert<f8_t>(std::numeric_limits<float>::infinity()),
+                abs_tol);
+    // positive norm float value to fp8 and back, check if holds
+    float pos_float = 0.017578125f;
+    ASSERT_NEAR(pos_float, type_convert<float>(type_convert<f8_t>(pos_float)), abs_tol);
+    // negative norm float value to fp8 and back, check if holds
+    float neg_float = -0.015625f;
+    ASSERT_NEAR(neg_float, type_convert<float>(type_convert<f8_t>(neg_float)), abs_tol);
+    // positive subnorm float value to fp8 and back, check if holds
+    pos_float = 0.00390625f;
    ASSERT_NEAR(pos_float, type_convert<float>(type_convert<f8_t>(pos_float)), abs_tol);
-    // negative float value to fp8 and back, check if holds
-    float neg_float = -0.0156250f;
+    // negative subnorm float value to fp8 and back, check if holds
+    neg_float = -0.001953125f;
    ASSERT_NEAR(neg_float, type_convert<float>(type_convert<f8_t>(neg_float)), abs_tol);
 }

@@ -61,12 +70,20 @@ TEST(FP8, ConvertFP32Stochastic)
                type_convert<float>(f8_convert_sr<f8_t>(std::numeric_limits<float>::max())),
                abs_tol);
    // convert inf float to f8_t and check if it is qNan
-    ASSERT_NEAR(0x80, f8_convert_sr<f8_t>(std::numeric_limits<float>::infinity()), abs_tol);
-    // positive float value to fp8 and back, check if holds
-    float pos_float = 0.0078125f;
+    ASSERT_NEAR(type_convert<f8_t>(0x80),
+                f8_convert_sr<f8_t>(std::numeric_limits<float>::infinity()),
+                abs_tol);
+    // positive norm float value to fp8 and back, check if holds
+    float pos_float = 0.017578125f;
+    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_sr<f8_t>(pos_float)), abs_tol);
+    // negative norm float value to fp8 and back, check if holds
+    float neg_float = -0.015625f;
+    ASSERT_NEAR(neg_float, type_convert<float>(f8_convert_sr<f8_t>(neg_float)), abs_tol);
+    // positive subnorm float value to fp8 and back, check if holds
+    pos_float = 0.00390625f;
    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_sr<f8_t>(pos_float)), abs_tol);
-    // negative float value to fp8 and back, check if holds
-    float neg_float = -0.0156250f;
+    // negative subnorm float value to fp8 and back, check if holds
+    neg_float = -0.001953125f;
    ASSERT_NEAR(neg_float, type_convert<float>(f8_convert_sr<f8_t>(neg_float)), abs_tol);
 }

@@ -87,12 +104,20 @@ TEST(FP8, ConvertFP16Nearest)
                type_convert<half_t>(type_convert<f8_t>(ck::NumericLimits<half_t>::Max())),
                abs_tol);
    // convert QuietNaN fp16 to f8_t and check if it is QuietNaN
-    ASSERT_NEAR(0x80, type_convert<f8_t>(ck::NumericLimits<half_t>::QuietNaN()), abs_tol);
-    // positive fp16 value to fp8 and back, check if holds
-    half_t pos_half = half_t{0.0078125};
+    ASSERT_NEAR(type_convert<f8_t>(0x80),
+                type_convert<f8_t>(ck::NumericLimits<half_t>::QuietNaN()),
+                abs_tol);
+    // positive norm fp16 value to fp8 and back, check if holds
+    half_t pos_half = half_t{0.017578125};
+    ASSERT_NEAR(pos_half, type_convert<half_t>(type_convert<f8_t>(pos_half)), abs_tol);
+    // negative norm fp16 value to fp8 and back, check if holds
+    half_t neg_half = half_t{-0.015625};
+    ASSERT_NEAR(neg_half, type_convert<half_t>(type_convert<f8_t>(neg_half)), abs_tol);
+    // positive subnorm fp16 value to fp8 and back, check if holds
+    pos_half = half_t{0.00390625};
    ASSERT_NEAR(pos_half, type_convert<half_t>(type_convert<f8_t>(pos_half)), abs_tol);
-    // negative fp16 value to fp8 and back, check if holds
-    half_t neg_half = half_t{-0.0156250};
+    // negative subnorm fp16 value to fp8 and back, check if holds
+    neg_half = half_t{-0.001953125};
    ASSERT_NEAR(neg_half, type_convert<half_t>(type_convert<f8_t>(neg_half)), abs_tol);
 }

@@ -113,11 +138,19 @@ TEST(FP8, ConvertFP16Stochastic)
                type_convert<half_t>(f8_convert_sr<f8_t>(ck::NumericLimits<half_t>::Max())),
                abs_tol);
    // convert QuietNaN fp16 to f8_t and check if it is QuietNaN
-    ASSERT_NEAR(0x80, f8_convert_sr<f8_t>(ck::NumericLimits<half_t>::QuietNaN()), abs_tol);
-    // positive fp16 value to fp8 and back, check if holds
-    half_t pos_half = half_t{0.0078125};
+    ASSERT_NEAR(type_convert<f8_t>(0x80),
+                f8_convert_sr<f8_t>(ck::NumericLimits<half_t>::QuietNaN()),
+                abs_tol);
+    // positive norm fp16 value to fp8 and back, check if holds
+    half_t pos_half = half_t{0.017578125};
+    ASSERT_NEAR(pos_half, type_convert<half_t>(f8_convert_sr<f8_t>(pos_half)), abs_tol);
+    // negative norm fp16 value to fp8 and back, check if holds
+    half_t neg_half = half_t{-0.015625};
+    ASSERT_NEAR(neg_half, type_convert<half_t>(f8_convert_sr<f8_t>(neg_half)), abs_tol);
+    // positive subnorm fp16 value to fp8 and back, check if holds
+    pos_half = half_t{0.00390625};
    ASSERT_NEAR(pos_half, type_convert<half_t>(f8_convert_sr<f8_t>(pos_half)), abs_tol);
-    // negative fp16 value to fp8 and back, check if holds
-    half_t neg_half = half_t{-0.0156250};
+    // negative subnorm fp16 value to fp8 and back, check if holds
+    neg_half = half_t{-0.001953125};
    ASSERT_NEAR(neg_half, type_convert<half_t>(f8_convert_sr<f8_t>(neg_half)), abs_tol);
 }
--- a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data.cpp
+++ b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data.cpp
@@ -87,6 +87,9 @@ TYPED_TEST(TestGroupedConvndBwdData2d, Test2D)
        {2, 2, 128, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}});
    this->conv_params.push_back(
        {2, 2, 128, 128, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}});
+    this->conv_params.push_back({2, 1, 1, 1, 32, {8, 8}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
+    this->conv_params.push_back({2, 1, 1, 64, 3, {8, 8}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
+    this->conv_params.push_back({2, 1, 1, 1, 1, {8, 8}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
    this->template Run<2>();
 }

@@ -99,5 +102,11 @@ TYPED_TEST(TestGroupedConvndBwdData3d, Test3D)
        {3, 2, 2, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
    this->conv_params.push_back(
        {3, 2, 32, 128, 256, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
+    this->conv_params.push_back(
+        {3, 1, 1, 1, 32, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
+    this->conv_params.push_back(
+        {3, 1, 1, 64, 3, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
+    this->conv_params.push_back(
+        {3, 1, 1, 1, 1, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
    this->template Run<3>();
 }
--- a/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp
+++ b/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp
@@ -14,6 +14,8 @@

 #include "profiler/profile_grouped_conv_bwd_weight_impl.hpp"

+using namespace ck::tensor_layout::convolution;
+
 template <typename Tuple>
 class TestGroupedConvndBwdWeight : public ::testing::Test
 {
@@ -27,28 +29,59 @@ class TestGroupedConvndBwdWeight : public ::testing::Test
    using NDimSpatial = std::tuple_element_t<6, Tuple>;

    std::vector<ck::utils::conv::ConvParam> conv_params;
-    ck::index_t split_k{2};
+    std::vector<ck::index_t> split_ks{1, 2};
+
+    bool skip_case(const ck::utils::conv::ConvParam& params, const ck::index_t split_k)
+    {
+        // Returns true if this (params, split_k) case must be skipped. For fp16: odd K or C is
+        // supported only by the DL kernel, and the DL kernel currently supports only split_k=1.
+        if constexpr(std::is_same_v<InDataType, ck::half_t>)
+        {
+            if(split_k != 1 && (params.K_ % 2 != 0 || params.C_ % 2 != 0))
+            {
+                return true;
+            }
+        }
+
+        // NWGC input with NWGK output (1d) is only supported by the DL kernel,
+        // which is only supported for split_k=1, so every other split_k is skipped.
+        if constexpr(std::is_same_v<InLayout, NWGC> && std::is_same_v<OutLayout, NWGK>)
+        {
+            if(split_k != 1)
+            {
+                return true;
+            }
+        }
+
+        return false; // no restriction matched: the case is run
+    }

    void Run()
    {
        EXPECT_FALSE(conv_params.empty());
        bool pass = true;

-        for(auto& param : conv_params)
+        for(auto split_k : split_ks) // outer loop: every configured split_k value
        {
-            pass = pass && ck::profiler::profile_grouped_conv_bwd_weight_impl<NDimSpatial{},
-                                                                              InLayout,
-                                                                              WeiLayout,
-                                                                              OutLayout,
-                                                                              InDataType,
-                                                                              WeiDataType,
-                                                                              OutDataType>(
-                               true,  // do_verification
-                               1,     // init_method: integer value
-                               false, // do_log
-                               false, // time_kernel
-                               param,
-                               split_k);
+            for(auto& param : conv_params)
+            {
+                if(!skip_case(param, split_k)) // pairs rejected by skip_case are not profiled
+                {
+                    pass = pass && ck::profiler::profile_grouped_conv_bwd_weight_impl<NDimSpatial{},
+                                                                                      InLayout,
+                                                                                      WeiLayout,
+                                                                                      OutLayout,
+                                                                                      InDataType,
+                                                                                      WeiDataType,
+                                                                                      OutDataType>(
+                                       true,  // do_verification
+                                       1,     // init_method: integer value
+                                       false, // do_log
+                                       false, // time_kernel
+                                       param,
+                                       split_k); // `pass &&` short-circuits after a failure
+                }
+            }
        }
        EXPECT_TRUE(pass);
    }
@@ -69,12 +102,13 @@ class TestGroupedConvndBwdWeight3d : public TestGroupedConvndBwdWeight<Tuple>
 {
 };

-using namespace ck::tensor_layout::convolution;
-
 using KernelTypes1d = ::testing::Types<
    std::tuple<float, float, float, GNWC, GKXC, GNWK, ck::Number<1>>,
    std::tuple<ck::half_t, ck::half_t, ck::half_t, GNWC, GKXC, GNWK, ck::Number<1>>,
-    std::tuple<ck::bhalf_t, float, ck::bhalf_t, GNWC, GKXC, GNWK, ck::Number<1>>>;
+    std::tuple<ck::bhalf_t, float, ck::bhalf_t, GNWC, GKXC, GNWK, ck::Number<1>>,
+    std::tuple<float, float, float, NWGC, GKXC, NWGK, ck::Number<1>>, // NWGC/NWGK variants
+    std::tuple<ck::half_t, ck::half_t, ck::half_t, NWGC, GKXC, NWGK, ck::Number<1>>,
+    std::tuple<ck::bhalf_t, float, ck::bhalf_t, NWGC, GKXC, NWGK, ck::Number<1>>>;
 using KernelTypes2d = ::testing::Types<
    std::tuple<float, float, float, GNHWC, GKYXC, GNHWK, ck::Number<2>>,
    std::tuple<ck::half_t, ck::half_t, ck::half_t, GNHWC, GKYXC, GNHWK, ck::Number<2>>,

--- a/test/grouped_gemm/test_grouped_gemm_interface.cpp
+++ b/test/grouped_gemm/test_grouped_gemm_interface.cpp
@@ -108,7 +108,7 @@ TEST_F(TestGGemmSplitKInterface_MKNKMN, KLoops)

    // kloops % 2
    Ks = std::vector<int>{256, 512, 320, 768};
-    EXPECT_FALSE(
+    EXPECT_TRUE(
        DefaultGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, kbatch));

    // Not all gemms have same value for main_k0_block_loop!

--- a/test/grouped_gemm/test_grouped_gemm_util.hpp
+++ b/test/grouped_gemm/test_grouped_gemm_util.hpp
@@ -147,14 +147,14 @@ struct DeviceGroupedGemmSplitkInstanceWrapper
            32,
            4,
            2,
-            S<1, 4, 32, 1>,
+            S<1, 4, 16, 1>,
            ABlockTransferThreadClusterArrageOrder,
            ABlockTransferSrcAccessOrder,
            ABlockTransferSrcVectorDim::value,
            ABlockTransferSrcScalarPerVector,
            ABlockTransferDstScalarPerVector_K1::value,
            ABlockLdsAddExtraM::value,
-            S<1, 4, 32, 1>,
+            S<1, 4, 16, 1>,
            BBlockTransferThreadClusterArrageOrder,
            BBlockTransferSrcAccessOrder,
            BBlockTransferSrcVectorDim::value,