Merge pull request #295 from ROCm/gfx950

Merge gfx950 changes into develop branch.

Merge pull request #295 from ROCm/gfx950
Merge gfx950 changes into develop branch.
fd7600ce · Illia Silin · GitHub · 5063a39f · 2ad07a88 · fd7600ce
Unverified Commit fd7600ce authored Feb 03, 2025 by Illia Silin Committed by GitHub Feb 03, 2025
20 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -189,17 +189,20 @@ if (SUPPORTED_GPU_TARGETS MATCHES "gfx9")
    add_definitions(-DCK_USE_XDL)
    set(CK_USE_XDL "ON")
 endif()
-if (SUPPORTED_GPU_TARGETS MATCHES "gfx94")
+if (SUPPORTED_GPU_TARGETS MATCHES "gfx94" OR SUPPORTED_GPU_TARGETS MATCHES "gfx95")
    message("Enabling FP8 gemms on native architectures")
    add_definitions(-DCK_USE_GFX94)
    set(CK_USE_GFX94 "ON")
 endif()
+if (SUPPORTED_GPU_TARGETS MATCHES "gfx95")
+	add_definitions(-DCK_USE_AMD_MFMA_GFX950)
+endif()
 if (SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12")
    message("Enabling WMMA instances")
    add_definitions(-DCK_USE_WMMA)
    set(CK_USE_WMMA "ON")
 endif()
-if (SUPPORTED_GPU_TARGETS MATCHES "gfx12")
+if (SUPPORTED_GPU_TARGETS MATCHES "gfx12" OR SUPPORTED_GPU_TARGETS MATCHES "gfx950")
    add_definitions(-DCK_USE_OCP_FP8)
    set(CK_USE_OCP_FP8 "ON")
 endif()
@@ -207,6 +210,10 @@ if (SUPPORTED_GPU_TARGETS MATCHES "gfx90a" OR SUPPORTED_GPU_TARGETS MATCHES "gfx
    add_definitions(-DCK_USE_FNUZ_FP8)
    set(CK_USE_FNUZ_FP8 "ON")
 endif()
+if (SUPPORTED_GPU_TARGETS MATCHES "gfx950")
+    add_definitions(-DCK_USE_NATIVE_MX_SUPPORT)
+    set(CK_USE_NATIVE_MX_SUPPORT "ON")
+endif()
 option(CK_USE_FP8_ON_UNSUPPORTED_ARCH "Enable FP8 GEMM instances on older architectures" OFF)
 if(CK_USE_FP8_ON_UNSUPPORTED_ARCH AND (SUPPORTED_GPU_TARGETS MATCHES "gfx90a" OR SUPPORTED_GPU_TARGETS MATCHES "gfx908"))

--- a/client_example/CMakeLists.txt
+++ b/client_example/CMakeLists.txt
@@ -56,7 +56,7 @@ if (GPU_TARGETS)
        add_definitions(-DCK_USE_WMMA)
        set(CK_USE_WMMA "ON")
    endif()
-    if (GPU_TARGETS MATCHES "gfx12")
+    if (GPU_TARGETS MATCHES "gfx12" OR GPU_TARGETS MATCHES "gfx950")
        add_definitions(-DCK_USE_OCP_FP8)
        set(CK_USE_OCP_FP8 "ON")
    endif()

--- a/example/01_gemm/CMakeLists.txt
+++ b/example/01_gemm/CMakeLists.txt
@@ -61,7 +61,7 @@ add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp64)
 add_example_executable(example_gemm_xdl_streamk gemm_xdl_streamk.cpp)
-list(APPEND gpu_list gfx90a gfx940 gfx941 gfx942)
+list(APPEND gpu_list gfx90a gfx940 gfx941 gfx942 gfx950)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
    if(gpu IN_LIST gpu_list AND target EQUAL 0)

--- a/example/01_gemm/gemm_xdl_fp16.cpp
+++ b/example/01_gemm/gemm_xdl_fp16.cpp
@@ -31,9 +31,7 @@ using DeviceGemmInstance0 = ck::tensor_operation::device::DeviceGemmXdl
 // ######|          |          |          |            |        |        |        |   Operation|   Operation|   Operation|              |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|
 // ######|          |          |          |            |        |        |        |            |            |            |              |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |
         < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout,  AElementOp,  BElementOp,  CElementOp,   GemmDefault,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,      true,               7,               1>;
-// // clang-format on
-// clang-format off
 using DeviceGemmInstance1 = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
 // ######| ALayout| BLayout| CLayout|     AData|     BData|     CData|     AccData|         CShuffle|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
 // ######|        |        |        |      Type|      Type|      Type|        Type|         DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|

--- a/example/04_gemm_add_add_fastgelu/CMakeLists.txt
+++ b/example/04_gemm_add_add_fastgelu/CMakeLists.txt
@@ -16,7 +16,7 @@ if(USE_BITINT_EXTENSION_INT4)
    add_example_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_int4)
 endif(USE_BITINT_EXTENSION_INT4)
-list(APPEND gpu_list gfx90a gfx940 gfx941 gfx942)
+list(APPEND gpu_list gfx90a gfx940 gfx941 gfx942 gfx950)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
    if(gpu IN_LIST gpu_list AND target EQUAL 0)

--- a/example/18_batched_gemm_reduce/CMakeLists.txt
+++ b/example/18_batched_gemm_reduce/CMakeLists.txt
-list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)

--- a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_example.inc
+++ b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_example.inc
@@ -32,6 +32,56 @@ using BiasLayout = typename LayoutSettingSelector<NDimSpatial>::BiasLayout;
 template <ck::index_t NDimSpatial>
 using ResidualLayout = typename LayoutSettingSelector<NDimSpatial>::ResidualLayout;
+#if defined(CK_USE_AMD_MFMA_GFX950)
+template <ck::index_t NDimSpatial>
+using DeviceConvFwdInstance =
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
+        NDimSpatial,
+        InputLayout<NDimSpatial>,
+        WeightLayout<NDimSpatial>,
+        ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>,
+        OutputLayout<NDimSpatial>,
+        InKernelDataType,
+        WeiKernelDataType,
+        AccDataType,
+        CShuffleDataType,
+        ck::Tuple<BiasKernelDataType, ResidualKernelDataType>,
+        OutKernelDataType,
+        InElementOp,
+        WeiElementOp,
+        OutElementOp,
+        ConvSpec,    // ConvForwardSpecialization
+        GemmSpec,    // GemmSpecialization
+        1,           //
+        256,         // BlockSize
+        128,         // MPerBlock
+        256,         // NPerBlock
+        64,          // KPerBlock
+        16,          // AK1
+        16,          // BK1
+        32,          // MPerXdl
+        32,          // NPerXdl
+        2,           // MXdlPerWave
+        4,           // NXdlPerWave
+        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
+        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
+        2,           // ABlockTransferSrcVectorDim
+        4,           // ABlockTransferSrcScalarPerVector
+        4,           // ABlockTransferDstScalarPerVector_AK1
+        1,           // ABlockLdsExtraM
+        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
+        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
+        2,           // BBlockTransferSrcVectorDim
+        4,           // BBlockTransferSrcScalarPerVector
+        4,           // BBlockTransferDstScalarPerVector_BK1
+        1,           // BBlockLdsExtraN
+        1,
+        1,
+        S<1, 16, 1, 16>,
+        4>;
+#else  // defined(CK_USE_AMD_MFMA_GFX950)
 template <ck::index_t NDimSpatial>
 using DeviceConvFwdInstance =
    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
@@ -80,6 +130,7 @@ using DeviceConvFwdInstance =
        1,
        S<1, 16, 1, 16>,
        4>;
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
 template <ck::index_t NDimSpatial>
 using HostConvFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,

--- a/example/31_batched_gemm_gemm/CMakeLists.txt
+++ b/example/31_batched_gemm_gemm/CMakeLists.txt
@@ -5,6 +5,6 @@ if(USE_BITINT_EXTENSION_INT4)
   add_example_executable(example_batched_gemm_gemm_xdl_int4 batched_gemm_gemm_xdl_int4.cpp)
 endif(USE_BITINT_EXTENSION_INT4)
-if(NOT GPU_TARGETS MATCHES "gfx94" AND NOT GPU_TARGETS MATCHES "gfx1")
+if(NOT GPU_TARGETS MATCHES "gfx94" AND NOT GPU_TARGETS MATCHES "gfx95" AND NOT GPU_TARGETS MATCHES "gfx1")
   add_example_executable(example_batched_gemm_gemm_xdl_int8 batched_gemm_gemm_xdl_int8.cpp)
 endif()
--- a/example/41_grouped_conv_conv_fwd/CMakeLists.txt
+++ b/example/41_grouped_conv_conv_fwd/CMakeLists.txt
@@ -5,6 +5,6 @@ if(USE_BITINT_EXTENSION_INT4)
   add_example_executable(example_grouped_conv_conv_fwd_xdl_int4 grouped_conv_conv_fwd_xdl_int4.cpp)
 endif(USE_BITINT_EXTENSION_INT4)
-if(NOT GPU_TARGETS MATCHES "gfx94" AND NOT GPU_TARGETS MATCHES "gfx1")
+if(NOT GPU_TARGETS MATCHES "gfx94" AND NOT GPU_TARGETS MATCHES "gfx95" AND NOT GPU_TARGETS MATCHES "gfx1")
   add_example_executable(example_grouped_conv_conv_fwd_xdl_int8 grouped_conv_conv_fwd_xdl_int8.cpp)
 endif()
--- a/example/62_convnd_activ/binary/CMakeLists.txt
+++ b/example/62_convnd_activ/binary/CMakeLists.txt
-list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)

--- a/example/62_convnd_activ/convinvscale/CMakeLists.txt
+++ b/example/62_convnd_activ/convinvscale/CMakeLists.txt
-list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)

--- a/example/62_convnd_activ/convscale/CMakeLists.txt
+++ b/example/62_convnd_activ/convscale/CMakeLists.txt
-list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)

--- a/example/62_convnd_activ/convscale_add/CMakeLists.txt
+++ b/example/62_convnd_activ/convscale_add/CMakeLists.txt
-list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)

--- a/example/62_convnd_activ/convscale_reduce/CMakeLists.txt
+++ b/example/62_convnd_activ/convscale_reduce/CMakeLists.txt
-list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)

--- a/example/62_convnd_activ/convscale_relu/CMakeLists.txt
+++ b/example/62_convnd_activ/convscale_relu/CMakeLists.txt
-list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)

--- a/example/62_convnd_activ/dynamic_unary/CMakeLists.txt
+++ b/example/62_convnd_activ/dynamic_unary/CMakeLists.txt
-list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)

--- a/example/62_convnd_activ/multi_AB/CMakeLists.txt
+++ b/example/62_convnd_activ/multi_AB/CMakeLists.txt
-list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)

--- a/example/62_convnd_activ/unary/CMakeLists.txt
+++ b/example/62_convnd_activ/unary/CMakeLists.txt
-list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)

--- a/example/67_gemm_microscaling/CMakeLists.txt
+++ b/example/67_gemm_microscaling/CMakeLists.txt
+add_custom_target(example_gemm_mx)
+add_example_executable(example_gemm_mx_fp8 gemm_mx_fp8.cpp)
+add_example_dependencies(example_gemm_mx example_gemm_mx_fp8)
--- a/example/67_gemm_microscaling/README.md
+++ b/example/67_gemm_microscaling/README.md
+# GEMM Examples for Microscaling Formats
+## example_gemm_mx_fp8
+```bash
+# arg1: verification (0=no, 1=CPU)
+# arg2: initialization (0=no init, 1=integer value, 2=decimal value)
+# arg3: time kernel (0=no, 1=yes)
+# arg4: verbosity (0=no info, 1=verbose info)
+# arg5 to 10: M (16x), N(16x), K(16x), StrideA, StrideB, StrideC
+./bin/example_gemm_mx_fp8 1 1 0 1
+```
+```bash
+# Implies: ./bin/example_gemm_mx_fp8 1 2 0 0
+./bin/example_gemm_mx_fp8
+```
\ No newline at end of file