Commit f3af1da6 authored by Andriy Roshchenko

Merge remote-tracking branch 'internal/andriy/lwpck-2788' into andriy/lwpck-2788

parents 2bef5501 60b885ae
@@ -541,7 +541,7 @@ endif()
message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
add_compile_options(-fcolor-diagnostics)
# add_compile_options(-fcolor-diagnostics)
endif()
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.9)
add_compile_options(-fdiagnostics-color=always)
...
{
"version": 3,
"configurePresets": [
{
"name": "linux-debug",
"displayName": "Linux Debug",
"hidden": true,
"generator": "Unix Makefiles",
"binaryDir": "${sourceDir}/build/${presetName}",
"installDir": "${sourceDir}/build/install/${presetName}",
"environment": {
"MY_ENVIRONMENT_VARIABLE": "NONE",
"PATH": "/usr/local/.cargo/bin:$penv{PATH}",
"SCCACHE_IDLE_TIMEOUT": "11000"
},
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Debug",
"CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
"BUILD_DEV": "ON",
"CMAKE_CXX_COMPILER": "/opt/rocm/bin/hipcc",
"CMAKE_PREFIX_PATH": "/opt/rocm",
"CMAKE_CXX_COMPILER_LAUNCHER": "sccache",
"CMAKE_C_COMPILER_LAUNCHER": "sccache"
},
"condition": {
"type": "equals",
"lhs": "${hostSystemName}",
"rhs": "Linux"
}
},
{
"name": "MI355-debug",
"displayName": "MI355 Debug",
"inherits": "linux-debug",
"description": "Development Environment for MI355.",
"cacheVariables": {
"GPU_TARGETS": "gfx950",
"CMAKE_BUILD_TYPE": "Debug",
"CMAKE_CXX_FLAGS": "-O0 -ggdb"
}
},
{
"name": "MI355-release",
"displayName": "MI355 Release",
"inherits": "linux-debug",
"cacheVariables": {
"GPU_TARGETS": "gfx950",
"CMAKE_BUILD_TYPE": "Release",
"CMAKE_CXX_FLAGS": "-O3"
}
},
{
"name": "MI300X-release",
"displayName": "MI300X Release",
"inherits": "linux-debug",
"cacheVariables": {
"GPU_TARGETS": "gfx942",
"CMAKE_BUILD_TYPE": "Release",
"CMAKE_CXX_FLAGS": "-O3"
}
},
{
"name": "MI250-release",
"displayName": "MI250 Release",
"inherits": "linux-debug",
"cacheVariables": {
"GPU_TARGETS": "gfx90a",
"CMAKE_BUILD_TYPE": "Release",
"CMAKE_CXX_FLAGS": "-O3",
"CK_USE_FP8_ON_UNSUPPORTED_ARCH":"ON"
}
},
{
"name": "MI250-debug",
"displayName": "MI250 Debug",
"inherits": "linux-debug",
"cacheVariables": {
"GPU_TARGETS": "gfx90a",
"CMAKE_BUILD_TYPE": "Debug",
"CMAKE_CXX_FLAGS": "-O0 -ggdb",
"CK_USE_FP8_ON_UNSUPPORTED_ARCH":"ON"
}
},
{
"name": "RX7800-release",
"displayName": "RX7800 Release",
"inherits": "linux-debug",
"cacheVariables": {
"GPU_TARGETS": "gfx1101",
"DL_KERNELS": "ON",
"CMAKE_BUILD_TYPE": "Release",
"CMAKE_CXX_FLAGS": "-O3"
}
},
{
"name": "RX7800-debug",
"displayName": "RX7800 Debug",
"inherits": "linux-debug",
"cacheVariables": {
"GPU_TARGETS": "gfx1101",
"DL_KERNELS": "ON",
"CMAKE_BUILD_TYPE": "Debug",
"CMAKE_CXX_FLAGS": "-O0 -ggdb"
}
}
],
"buildPresets": [
{
"name": "Debug",
"hidden": true,
"configuration": "Debug"
},
{
"name": "Release",
"hidden": true,
"configuration": "Release"
},
{
"name": "MI355-debug",
"displayName": "MI355",
"configurePreset": "MI355-debug",
"description": "Build Environment for MI355 Debug.",
"inherits": [
"Debug"
],
"jobs": 128
},
{
"name": "MI355-release",
"displayName": "MI355",
"configurePreset": "MI355-release",
"description": "Build Environment for MI355 Release.",
"inherits": [
"Release"
],
"jobs": 128
},
{
"name": "MI300X-release",
"displayName": "MI300X",
"configurePreset": "MI300X-release",
"description": "Build Environment for MI300X Release.",
"inherits": [
"Release"
],
"jobs": 128
},
{
"name": "MI250-release",
"displayName": "MI250",
"configurePreset": "MI250-release",
"description": "Build Environment for MI250 Release.",
"inherits": [
"Release"
],
"jobs": 128
},
{
"name": "MI250-debug",
"displayName": "MI250",
"configurePreset": "MI250-debug",
"description": "Build Environment for MI250 Debug.",
"inherits": [
"Debug"
],
"jobs": 128
},
{
"name": "RX7800-release",
"displayName": "RX7800",
"configurePreset": "RX7800-release",
"description": "Build Environment for RX7800 Release.",
"inherits": [
"Release"
],
"jobs": 128
},
{
"name": "RX7800-debug",
"displayName": "RX7800",
"configurePreset": "RX7800-debug",
"description": "Build Environment for RX7800 Debug.",
"inherits": [
"Debug"
],
"jobs": 128
}
]
}
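For reference, these are standard CMake presets (schema version 3, which needs CMake 3.21 or newer), so configuring and building, say, the gfx90a debug flavor is expected to work as cmake --preset MI250-debug followed by cmake --build --preset MI250-debug. The presets also assume that sccache is available on PATH (both compiler launchers point at it) and that hipcc lives under /opt/rocm; adjust those paths for a different ROCm install.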
@@ -359,6 +359,21 @@ struct GeneratorTensor_Sequential
}
};
template <ck::index_t Dim>
struct GeneratorTensor_Sequential<ck::e8m0_bexp_t, Dim>
{
int offset = 0;
template <typename... Ts>
ck::e8m0_bexp_t operator()(Ts... Xs) const
{
std::array<ck::index_t, sizeof...(Ts)> dims = {{static_cast<ck::index_t>(Xs)...}};
int tmp = dims[Dim];
return ck::type_convert<ck::e8m0_bexp_t>(powf(2, tmp + offset));
}
};
template <typename T, size_t NumEffectiveDim = 2>
struct GeneratorTensor_Diagonal
{
...
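A note on the new GeneratorTensor_Sequential specialization for ck::e8m0_bexp_t above: e8m0 is the OCP MX shared-scale format that stores only a biased 8-bit exponent (value = 2^(stored - 127)), so the generator emits exact powers of two and type_convert merely recovers the biased exponent from powf(2, tmp + offset). A minimal host-side sketch of that round trip, using a hypothetical stand-in struct rather than the real ck type:

#include <cmath>
#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for ck::e8m0_bexp_t; assumes the OCP MX e8m0 definition
// value = 2^(stored - 127). The real ck type may differ in edge-case handling.
struct e8m0_sketch
{
    uint8_t stored;
    static e8m0_sketch from_float(float x)
    {
        return {static_cast<uint8_t>(std::ilogb(x) + 127)};
    }
    float to_float() const { return std::ldexp(1.0f, static_cast<int>(stored) - 127); }
};

int main()
{
    // Mirrors what the generator does for dims[Dim] + offset in {-2, ..., 2}.
    for(int k = -2; k <= 2; ++k)
    {
        const e8m0_sketch s = e8m0_sketch::from_float(std::pow(2.0f, static_cast<float>(k)));
        std::printf("2^%d -> stored 0x%02X -> %g\n", k, static_cast<unsigned>(s.stored), s.to_float());
    }
    return 0;
}

With offset = 0 this yields a scale of 2^dims[Dim] along the chosen dimension, which is presumably what the MXMFMA test initializers later in this diff rely on.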
@@ -780,7 +780,6 @@ struct mfma_type<MfmaInstr::mfma_f32_16x16x32bf8f8>
}
};
// TODO: fix mfma...f8f6f4 instructions
template <>
struct mfma_type<MfmaInstr::mfma_f32_32x32x64f8f6f4>
{
@@ -847,9 +846,14 @@ struct mfma_type<MfmaInstr::mfma_scale_f32_32x32x64f8f6f4>
// clang-format on
template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
__device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
__device__ void run(const FloatA& a,
const int32_t& scale_a,
const FloatB& b,
const int32_t& scale_b,
FloatC& reg_c) const
{
intrin_mfma_scale_f32_32x32x64f8f6f4<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
intrin_mfma_scale_f32_32x32x64f8f6f4<MPerXdlops, NPerXdlops>::Run(
a, scale_a, b, scale_b, reg_c);
}
};
@@ -871,9 +875,14 @@ struct mfma_type<MfmaInstr::mfma_scale_f32_16x16x128f8f6f4>
// clang-format on
template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
__device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
{
intrin_mfma_scale_f32_16x16x128f8f6f4<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
__device__ void run(const FloatA& a,
const int32_t& scale_a,
const FloatB& b,
const int32_t& scale_b,
FloatC& reg_c) const
{
intrin_mfma_scale_f32_16x16x128f8f6f4<MPerXdlops, NPerXdlops>::Run(
a, scale_a, b, scale_b, reg_c);
}
};
...
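The two hunks above thread a pair of 32-bit operands, scale_a and scale_b, from mfma_type::run() down into intrin_mfma_scale_f32_*f8f6f4::Run. Judging by the intrinsic call later in this diff, each operand carries packed e8m0 biased exponents, and the zero immediates placed before each scale appear to select which byte the hardware consumes. A hedged helper for assembling such an operand (illustrative only, not part of the ck API):

#include <cstdint>

// Packs four e8m0 biased exponents into one 32-bit scale operand.
// Assumption: byte 0 is the byte used when the builtin's byte-select
// immediates are 0, as they are everywhere in this commit.
inline int32_t pack_e8m0_scales(uint8_t s0, uint8_t s1, uint8_t s2, uint8_t s3)
{
    const uint32_t packed = static_cast<uint32_t>(s0) |
                            (static_cast<uint32_t>(s1) << 8) |
                            (static_cast<uint32_t>(s2) << 16) |
                            (static_cast<uint32_t>(s3) << 24);
    return static_cast<int32_t>(packed);
}

For example, pack_e8m0_scales(127, 127, 127, 127) encodes four unit scales (2^0 = 1.0).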
@@ -519,12 +519,36 @@ struct intrin_mfma_scale_f32_32x32x64f8f6f4<32, 32>
{
template <class FloatC>
__device__ static void Run(const f8x32_t& reg_a,
const int32_t scale_a,
const int32_t& scale_a,
const f8x32_t& reg_b,
const int32_t scale_b,
const int32_t& scale_b,
FloatC& reg_c)
{
#if defined(__gfx950__)
if(threadIdx.x == 0 || threadIdx.x == 32)
{
printf("thread: %u -- xA: %x\n", threadIdx.x, static_cast<uint32_t>(scale_a));
printf("thread: %u -- xB: %x\n", threadIdx.x, static_cast<uint32_t>(scale_b));
// printf("intrin_mfma_scale_f32_32x32x64f8f6f4<32, 32> thread: %u -- scale_a: %f\n",
// threadIdx.x,
// static_cast<float>(ck::e8m0_bexp_t(scale_a)));
// printf("intrin_mfma_scale_f32_32x32x64f8f6f4<32, 32> thread: %u -- scale_b: %f\n",
// threadIdx.x,
// static_cast<float>(ck::e8m0_bexp_t(scale_b)));
// for(size_t i = 0; i < 32; i++)
// {
// printf("thread: %u -- reg_a[%zu]: %f\n",
// threadIdx.x,
// i,
// type_convert<float>(f8_t{static_cast<f8x32_t::data_v>(reg_a)[i]}));
// // printf("thread: %u -- reg_a[%zu]: %f\n",
// // threadIdx.x,
// // i,
// // type_convert<float>(f8_t{static_cast<f8x32_t::data_v>(reg_b)[i]}));
// }
}
// https://github.com/ROCm/llvm-project/blob/656552edc693e2bb4abc9258399c39d190fce2b3/llvm/test/Verifier/AMDGPU/mfma-scale.ll#L10
reg_c.template AsType<float16_t>()(Number<0>{}) =
__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(
...
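The hunk above is cut off before the builtin's argument list; the standalone kernel at the end of this commit shows the complete call on gfx950. A hedged restatement of that call shape (operand roles inferred from the kernel's comments and the LLVM test linked above; this only compiles when targeting gfx950):

#include <cstdint>
#include <hip/hip_runtime.h>

using f8x32_v  = uint8_t __attribute__((ext_vector_type(32)));
using f32x16_v = float __attribute__((ext_vector_type(16)));

__device__ f32x16_v scaled_mfma_32x32x64(f8x32_v a, f8x32_v b, f32x16_v c,
                                         int32_t scale_a, int32_t scale_b)
{
    return __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(
        a, b, c,
        0,           // cbsz: source format selector for A (fp8 in this commit)
        0,           // blgp: source format selector for B (fp8 in this commit)
        0, scale_a,  // byte select + packed e8m0 scales for A
        0, scale_b); // byte select + packed e8m0 scales for B
}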
@@ -30,11 +30,11 @@ bool run_mfma_test(ck::index_t init)
constexpr auto BLOCK_N = mfma_instr.n_per_blk;
constexpr auto BLOCK_K = mfma_instr.num_input_blks * mfma_instr.k_per_blk;
const auto mx_mfma_kernel = ck::matmul<AType, BType, CType, AccType, BLOCK_M, BLOCK_N, BLOCK_K>;
const auto mfma_kernel = ck::matmul<AType, BType, CType, AccType, BLOCK_M, BLOCK_N, BLOCK_K>;
bool pass = true;
pass = ck::mfma_test::TestMFMA<decltype(mx_mfma_kernel),
pass = ck::mfma_test::TestMFMA<decltype(mfma_kernel),
AType,
BType,
CType,
@@ -45,7 +45,7 @@ CLayout,
CLayout,
BLOCK_M,
BLOCK_N,
BLOCK_K>{}(mx_mfma_kernel, init);
BLOCK_K>{}(mfma_kernel, init);
return pass;
}
@@ -63,3 +63,98 @@ TEST(MFMA, FP8MFMA32x32x64)
auto pass = run_mfma_test<f8_t, f8_t, float, ck::MFMA_F8F6F4::F32_32x32x64>(AB_init);
EXPECT_TRUE(pass);
}
/**
* @brief Run the test for the given MX MFMA instruction
*
* @param init - selects initialization algorithm for A and B tensors
*/
template <typename AType, typename BType, typename CType, ck::MFMA_F8F6F4 mfma>
bool run_mxmfma_test(ck::index_t init)
{
static_assert(mfma == ck::MFMA_F8F6F4::SCALE_F32_16x16x128 ||
mfma == ck::MFMA_F8F6F4::SCALE_F32_32x32x64,
"Only SCALE_F32_16x16x128 and SCALE_F32_32x32x64 are supported");
using ALayout = ck::tensor_layout::gemm::RowMajor;
using BLayout = ck::tensor_layout::gemm::ColumnMajor;
using CLayout = ck::tensor_layout::gemm::RowMajor;
using AccType = float; // only MFMA_F32 instructions supported
// using CPUAccType = AccType;
using ScaleType = ck::e8m0_bexp_t; // biased exponent type
ck::mfma_type<static_cast<ck::MfmaInstr>(mfma)> mfma_instr;
constexpr auto BLOCK_M = mfma_instr.m_per_blk;
constexpr auto BLOCK_N = mfma_instr.n_per_blk;
constexpr auto BLOCK_K = mfma_instr.num_input_blks * mfma_instr.k_per_blk;
constexpr auto BLOCK_X = 32; // scaling vector size
const auto mx_mfma_kernel =
ck::matmul<AType, BType, ScaleType, CType, AccType, BLOCK_M, BLOCK_N, BLOCK_K, BLOCK_X>;
bool pass = true;
pass = ck::mxmfma_test::TestMXMFMA<decltype(mx_mfma_kernel),
AType,
BType,
ScaleType,
CType,
ALayout,
BLayout,
CLayout,
BLOCK_M,
BLOCK_N,
BLOCK_K,
BLOCK_X>{}(mx_mfma_kernel, init);
return pass;
}
TEST(MXMFMA, MXFP8MFMA16x16x128i2)
{
auto AB_init = 2;
auto pass = run_mxmfma_test<f8_t, f8_t, float, ck::MFMA_F8F6F4::SCALE_F32_16x16x128>(AB_init);
EXPECT_TRUE(pass);
}
TEST(MXMFMA, MXFP8MFMA32x32x64i2)
{
auto AB_init = 2;
auto pass = run_mxmfma_test<f8_t, f8_t, float, ck::MFMA_F8F6F4::SCALE_F32_32x32x64>(AB_init);
EXPECT_TRUE(pass);
}
TEST(MXMFMA, MXFP8MFMA16x16x128i3)
{
auto AB_init = 3;
auto pass = run_mxmfma_test<f8_t, f8_t, float, ck::MFMA_F8F6F4::SCALE_F32_16x16x128>(AB_init);
EXPECT_TRUE(pass);
}
TEST(MXMFMA, MXFP8MFMA32x32x64i3)
{
auto AB_init = 3;
auto pass = run_mxmfma_test<f8_t, f8_t, float, ck::MFMA_F8F6F4::SCALE_F32_32x32x64>(AB_init);
EXPECT_TRUE(pass);
}
TEST(MXMFMA, MXFP8MFMA16x16x128i4)
{
auto AB_init = 4;
auto pass = run_mxmfma_test<f8_t, f8_t, float, ck::MFMA_F8F6F4::SCALE_F32_16x16x128>(AB_init);
EXPECT_TRUE(pass);
}
TEST(MXMFMA, MXFP8MFMA32x32x64i4)
{
auto AB_init = 4;
auto pass = run_mxmfma_test<f8_t, f8_t, float, ck::MFMA_F8F6F4::SCALE_F32_32x32x64>(AB_init);
EXPECT_TRUE(pass);
}
TEST(MXMFMA, MXFP8MFMA32x32x64i5)
{
auto AB_init = 5;
auto pass = run_mxmfma_test<f8_t, f8_t, float, ck::MFMA_F8F6F4::SCALE_F32_32x32x64>(AB_init);
EXPECT_TRUE(pass);
}
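For context on BLOCK_X = 32 in run_mxmfma_test above: OCP microscaling (MX) formats attach one shared e8m0 scale to every 32 consecutive elements along K, so the reference value of a scaled dot product is the per-block partial sum weighted by 2^(Ea - 127) * 2^(Eb - 127). A hedged host-side reference (hypothetical helper, not the actual ck::mxmfma_test utilities):

#include <cmath>
#include <cstdint>
#include <vector>

// Scalar reference for an MX-scaled dot product along K.
// a and b hold already-decoded FP8 values as float; scale_a and scale_b hold
// one e8m0 biased exponent per 32-element block, so a.size() == b.size() ==
// 32 * scale_a.size() is assumed.
float mx_dot_reference(const std::vector<float>& a,
                       const std::vector<float>& b,
                       const std::vector<uint8_t>& scale_a,
                       const std::vector<uint8_t>& scale_b)
{
    constexpr int block_x = 32; // matches BLOCK_X in the tests above
    float acc = 0.0f;
    for(size_t blk = 0; blk < scale_a.size(); ++blk)
    {
        float partial = 0.0f;
        for(int i = 0; i < block_x; ++i)
        {
            partial += a[blk * block_x + i] * b[blk * block_x + i];
        }
        acc += std::ldexp(partial, (scale_a[blk] - 127) + (scale_b[blk] - 127));
    }
    return acc;
}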
#include <hip/hip_ext.h>
#include <hip/hip_runtime.h>
__global__ void kernel()
{
using dataAB = uint8_t __attribute__((ext_vector_type(32)));
using dataC = float __attribute__((ext_vector_type(16)));
using dataX = int32_t __attribute__((ext_vector_type(2)));
dataAB regA(0x38);
dataAB regB(0x38);
dataC regC(1.0f);
// dataC regCin(1.0f);
#if 1
// dataX xa{127, 127}; // 1.0
dataX xa(127 & 0xFF); // 1.0
dataX xb(127 & 0xFF); // 1.0
#else
dataX xa(0);
dataX xb(0);
#endif
#if 0
if(threadIdx.x == 0)
{
// xa = 127; // 1.0
for(size_t i = 0; i < 32; i++)
{
regA[i] = 0x38; // 1.0
}
for(size_t i = 0; i < 32; i++)
{
regB[i] = 0x38; // 1.0
}
printf("thread: %u -- xA: %x\n", threadIdx.x, xa[threadIdx.x / 32]);
printf("thread: %u -- xB: %x\n", threadIdx.x, xb[threadIdx.x / 32]);
}
if(threadIdx.x == 32)
{
// xa = 126; // 0.5
for(size_t i = 0; i < 32; i++)
{
regA[i] = 0xC0; // -2.0
}
for(size_t i = 0; i < 32; i++)
{
regB[i] = 0x38; // 1.0
}
printf("thread: %u -- xA: %x\n", threadIdx.x, xa[threadIdx.x / 32]);
printf("thread: %u -- xB: %x\n", threadIdx.x, xb[threadIdx.x / 32]);
}
#endif
__syncthreads();
printf("thread: %u -- regA: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x "
"%x %x %x %x %x %x %x %x %x %x\n",
threadIdx.x,
regA[0],
regA[1],
regA[2],
regA[3],
regA[4],
regA[5],
regA[6],
regA[7],
regA[8],
regA[9],
regA[10],
regA[11],
regA[12],
regA[13],
regA[14],
regA[15],
regA[16],
regA[17],
regA[18],
regA[19],
regA[20],
regA[21],
regA[22],
regA[23],
regA[24],
regA[25],
regA[26],
regA[27],
regA[28],
regA[29],
regA[30],
regA[31]);
printf("thread: %u -- regB: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x "
"%x %x %x %x %x %x %x %x %x %x\n",
threadIdx.x,
regB[0],
regB[1],
regB[2],
regB[3],
regB[4],
regB[5],
regB[6],
regB[7],
regB[8],
regB[9],
regB[10],
regB[11],
regB[12],
regB[13],
regB[14],
regB[15],
regB[16],
regB[17],
regB[18],
regB[19],
regB[20],
regB[21],
regB[22],
regB[23],
regB[24],
regB[25],
regB[26],
regB[27],
regB[28],
regB[29],
regB[30],
regB[31]);
//__builtin_amdgcn_mfma_ld_scale_b32(xb[threadIdx.x / 32], 0, 0);
regC = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(regA,
regB,
regC,
0, // cbsz
0, // blgp
0,
xa[threadIdx.x / 32],
0,
xb[threadIdx.x / 32]);
__syncthreads();
printf("thread: %u -- regC: %f %f %f %f %f %f %f %f %f %f %f %f %f %f %f %f\n",
threadIdx.x,
regC[0],
regC[1],
regC[2],
regC[3],
regC[4],
regC[5],
regC[6],
regC[7],
regC[8],
regC[9],
regC[10],
regC[11],
regC[12],
regC[13],
regC[14],
regC[15]);
// printf("thread: %u -- regCin: %f %f %f %f %f %f %f %f %f %f %f %f %f %f %f %f\n",
// threadIdx.x,
// regCin[0],
// regCin[1],
// regCin[2],
// regCin[3],
// regCin[4],
// regCin[5],
// regCin[6],
// regCin[7],
// regCin[8],
// regCin[9],
// regCin[10],
// regCin[11],
// regCin[12],
// regCin[13],
// regCin[14],
// regCin[15]);
}
int main()
{
kernel<<<1, 64>>>();
return 0;
}
\ No newline at end of file
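A quick sanity check for the scratch kernel above: 0x38 is +1.0 in FP8 E4M3 (sign 0, exponent 0111 with bias 7, mantissa 000), and an e8m0 byte of 127 encodes 2^0 = 1.0, so with every A and B element equal to 1.0 and unit scales, each of the 16 per-lane accumulators of the 32x32x64 instruction should reduce 64 products of 1.0 and land at 64 * (1.0 * 1.0) + 1.0 (the initial regC) = 65.0, assuming the operand layout is correct. The disabled #if 0 branch instead overwrites lane 32's A fragment with 0xC0 (-2.0 in E4M3), which would flip the sign and double the magnitude of the contributions fed by that lane's slice of K.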