Add new mfma instructions and examples

24673871 · root · 1f127242 · 24673871 · 24673871 · 24673871
Commit 24673871 authored Nov 13, 2024 by root
7 changed files
--- a/example/01_gemm/gemm_xdl_bf16.cpp
+++ b/example/01_gemm/gemm_xdl_bf16.cpp
@@ -27,7 +27,17 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
 // ######|        |        |        |      Type|      Type|      Type|        Type|         DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 // ######|        |        |        |          |          |          |            |                 |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 // ######|        |        |        |          |          |          |            |                 |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+#if !defined(CK_USE_AMD_MFMA_GFX950)
+    //     < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
+         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
+    //     < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   128,   128,    64,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
+    //     < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   64,   64,    64,   16,   16,   16,   16,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               4>;
+    //     < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   64,   64,    128,   16,   16,   16,   16,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               4>;
+#else
         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
+    //     < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   128,   128,    64,   16,   16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
+    //     < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   64,   64,    128,   32,   32,   16,   16,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               4>;
+#endif
 // clang-format on
 using ReferenceGemmInstance = ck::tensor_operation::host::

--- a/example/01_gemm/gemm_xdl_fp16.cpp
+++ b/example/01_gemm/gemm_xdl_fp16.cpp
@@ -15,7 +15,8 @@ using CDataType        = ck::half_t;
 using F16 = ck::half_t;
 using ALayout = Row;
-using BLayout = Row;
+using BLayout = Row; // DeviceGemmXdl or DeviceGemm_Xdl_CShuffle with ck::LoopScheduler::Interwave
+// using BLayout = Col;    // DeviceGemm_Xdl_CShuffle with ck::LoopScheduler::Default
 using CLayout = Row;
 using AElementOp = PassThrough;
@@ -30,19 +31,39 @@ using DeviceGemmInstance0 = ck::tensor_operation::device::DeviceGemmXdl
 // ######|      Type|      Type|      Type|        Type|        |        |        | Elementwise| Elementwise| Elementwise|Spacialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar|
 // ######|          |          |          |            |        |        |        |   Operation|   Operation|   Operation|              |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|
 // ######|          |          |          |            |        |        |        |            |            |            |              |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |
-         < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout,  AElementOp,  BElementOp,  CElementOp,   GemmDefault,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,      true,               7,               1>;
+#if !defined(CK_USE_AMD_MFMA_GFX950)
-// // clang-format on
+    //     < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout,  AElementOp,  BElementOp,  CElementOp,   GemmDefault,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,      true,               7,               1>;
+    //     < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout,  AElementOp,  BElementOp,  CElementOp,   GemmDefault,   256,   128,   128,     4,  8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,      true,               7,               1>;
+//         < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout,  AElementOp,  BElementOp,  CElementOp,   GemmDefault,   256,   128,   128,     4,  8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1>;
+    //     < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout,  AElementOp,  BElementOp,  CElementOp,   GemmDefault,   256,   64,   64,     4,  16,   16,   16,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1>;
+    //     < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout,  AElementOp,  BElementOp,  CElementOp,   GemmDefault,   256,   64,   64,     4,  16,   16,   16,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,               7,               1>;
+#else
+         < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout,  AElementOp,  BElementOp,  CElementOp,   GemmDefault,   256,   256,   128,     4,  16,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,      true,               7,               1>;
+    //     < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout,  AElementOp,  BElementOp,  CElementOp,   GemmDefault,   256,   64,   64,     4,  32,   16,   16,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1>;
+#endif
+    // clang-format on
-// clang-format off
+    // clang-format off
 using DeviceGemmInstance1 = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
 // ######| ALayout| BLayout| CLayout|     AData|     BData|     CData|     AccData|         CShuffle|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
 // ######|        |        |        |      Type|      Type|      Type|        Type|         DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 // ######|        |        |        |          |          |          |            |                 |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 // ######|        |        |        |          |          |          |            |                 |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   2,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           2,              S<1, 16, 1, 16>,               8, ck::LoopScheduler::Interwave, ck::PipelineVersion::v1>;
+#if !defined(CK_USE_AMD_MFMA_GFX950)
+    //     < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   2,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           2,              S<1, 16, 1, 16>,               8, ck::LoopScheduler::Interwave, ck::PipelineVersion::v1>;
+    //     < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
+    //     < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
+    //     < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   128,   128,    64,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
+    //     < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   64,   64,    64,   16,   16,   16,   16,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               4>;
+         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   64,   64,    128,   16,   16,   16,   16,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               4>;
+#else
+    //     < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   128,   128,    64,   16,   16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
+         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   64,   64,    128,   32,   32,   16,   16,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               4>;
+#endif
 // clang-format on
-using DeviceGemmInstance = DeviceGemmInstance1;
+// using DeviceGemmInstance = DeviceGemmInstance1;
+using DeviceGemmInstance = DeviceGemmInstance0;
 using ReferenceGemmInstance = ck::tensor_operation::host::
    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;

--- a/example/01_gemm/gemm_xdl_int8.cpp
+++ b/example/01_gemm/gemm_xdl_int8.cpp
@@ -27,7 +27,17 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
 // ######|        |        |        |      Type|      Type|      Type|        Type|         DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 // ######|        |        |        |          |          |          |            |                 |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 // ######|        |        |        |          |          |          |            |                 |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   256,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 64, 1, 4>,              16>;
+#if !defined(CK_USE_AMD_MFMA_GFX950)
+    //     < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   256,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 64, 1, 4>,              16>;
+    //     < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 64, 1, 4>,              16>;
+    //     < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 64, 1, 4>,              16>;
+    //     < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   128,   128,    128,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 64, 1, 4>,              16>;
+    //     < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   64,   64,    128,  32,  32,   16,   16,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 32, 1, 8>,              4>;
+         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   64,   64,    256,  32,  32,   16,   16,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 32, 1, 8>,              4>;
+#else
+    //     < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   128,   128,    128,  32,  32,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 64, 1, 4>,              16>;
+         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   64,   64,    256,  64,  64,   16,   16,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 32, 1, 8>,              4>;
+#endif
 // clang-format on
 using ReferenceGemmInstance = ck::tensor_operation::host::

--- a/example/66_complex_contraction_bilinear/run_complex_contraction_bilinear_example.inc
+++ b/example/66_complex_contraction_bilinear/run_complex_contraction_bilinear_example.inc
@@ -127,44 +127,47 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
    switch(init_method)
    {
-        case 0: break;
+    case 0: break;
-        case 1:
+    case 1:
-            a_ms_ks_re.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        a_ms_ks_re.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
-            b_ns_ks_re.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        b_ns_ks_re.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
-            d_ms_ns_re.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        d_ms_ns_re.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
-            a_ms_ks_img.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        a_ms_ks_img.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
-            b_ns_ks_img.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        b_ns_ks_img.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
-            d_ms_ns_img.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        d_ms_ns_img.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
-            break;
+        break;
-        default:
+    default:
-            a_ms_ks_re.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        a_ms_ks_re.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
-            b_ns_ks_re.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        b_ns_ks_re.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
-            d_ms_ns_re.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        d_ms_ns_re.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
-            a_ms_ks_img.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        a_ms_ks_img.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
-            b_ns_ks_img.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        b_ns_ks_img.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
-            d_ms_ns_img.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        d_ms_ns_img.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
-            break;
+        break;
    }
    DeviceMem a_device_buf_re(sizeof(ADataType) * a_ms_ks_re.mDesc.GetElementSpaceSize());
    DeviceMem b_device_buf_re(sizeof(BDataType) * b_ns_ks_re.mDesc.GetElementSpaceSize());
    DeviceMem d_device_buf_re(sizeof(DDataType) * d_ms_ns_re.mDesc.GetElementSpaceSize());
-    DeviceMem e_device_buf_re(sizeof(EDataType) * e_ms_ns_device_result_re.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf_re(sizeof(EDataType) *
+                              e_ms_ns_device_result_re.mDesc.GetElementSpaceSize());
    DeviceMem a_device_buf_img(sizeof(ADataType) * a_ms_ks_img.mDesc.GetElementSpaceSize());
    DeviceMem b_device_buf_img(sizeof(BDataType) * b_ns_ks_img.mDesc.GetElementSpaceSize());
    DeviceMem d_device_buf_img(sizeof(DDataType) * d_ms_ns_img.mDesc.GetElementSpaceSize());
-    DeviceMem e_device_buf_img(sizeof(EDataType) * e_ms_ns_device_result_img.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf_img(sizeof(EDataType) *
+                               e_ms_ns_device_result_img.mDesc.GetElementSpaceSize());
    // Intermediate Value For E Real and Img
-    DeviceMem e_device_buf_re1(sizeof(EDataType) * e_ms_ns_device_result_re.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf_re1(sizeof(EDataType) *
-    DeviceMem e_device_buf_img1(sizeof(EDataType) * e_ms_ns_device_result_img.mDesc.GetElementSpaceSize());
+                               e_ms_ns_device_result_re.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf_img1(sizeof(EDataType) *
+                                e_ms_ns_device_result_img.mDesc.GetElementSpaceSize());
    a_device_buf_re.ToDevice(a_ms_ks_re.mData.data());
    b_device_buf_re.ToDevice(b_ns_ks_re.mData.data());
@@ -181,7 +184,7 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
    // set zero for intermediate values
    e_device_buf_re1.SetZero();
    e_device_buf_img1.SetZero();
    auto a_element_op   = AElementOp{};
    auto b_element_op   = BElementOp{};
    auto cde_element_op = CDEElementOp{alpha, beta};
@@ -189,23 +192,24 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
    // device operation
    // For real Intermediate Value re_1
-    auto op       = DeviceOpInstance{};
+    auto op      = DeviceOpInstance{};
-    auto invoker  = op.MakeInvoker();
+    auto invoker = op.MakeInvoker();
-    auto argument_re1 = op.MakeArgument(a_device_buf_re.GetDeviceBuffer(),
+    auto argument_re1 =
-                                    b_device_buf_re.GetDeviceBuffer(),
+        op.MakeArgument(a_device_buf_re.GetDeviceBuffer(),
-                                    std::array<const void*, 1>{d_device_buf_re.GetDeviceBuffer()},
+                        b_device_buf_re.GetDeviceBuffer(),
-                                    e_device_buf_re1.GetDeviceBuffer(),
+                        std::array<const void*, 1>{d_device_buf_re.GetDeviceBuffer()},
-                                    a_ms_ks_lengths,
+                        e_device_buf_re1.GetDeviceBuffer(),
-                                    a_ms_ks_strides,
+                        a_ms_ks_lengths,
-                                    b_ns_ks_lengths,
+                        a_ms_ks_strides,
-                                    b_ns_ks_strides,
+                        b_ns_ks_lengths,
-                                    std::array<std::vector<ck::index_t>, 1>{d_ms_ns_lengths},
+                        b_ns_ks_strides,
-                                    std::array<std::vector<ck::index_t>, 1>{d_ms_ns_strides},
+                        std::array<std::vector<ck::index_t>, 1>{d_ms_ns_lengths},
-                                    e_ms_ns_lengths,
+                        std::array<std::vector<ck::index_t>, 1>{d_ms_ns_strides},
-                                    e_ms_ns_strides,
+                        e_ms_ns_lengths,
-                                    a_element_op,
+                        e_ms_ns_strides,
-                                    b_element_op,
+                        a_element_op,
-                                    cde_element_op);
+                        b_element_op,
+                        cde_element_op);
    if(!op.IsSupportedArgument(argument_re1))
    {
@@ -216,7 +220,6 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
    float ave_time_re1 = invoker.Run(argument_re1, StreamConfig{nullptr, time_kernel});
    alpha = -1.f;
    beta  = 1.f;
@@ -228,21 +231,22 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
    // For real Intermediate Value re_2
    // auto op       = DeviceOpInstance{};
    // auto invoker  = op.MakeInvoker();
-    auto argument_re2 = op.MakeArgument(a_device_buf_img.GetDeviceBuffer(),
+    auto argument_re2 =
-                                    b_device_buf_img.GetDeviceBuffer(),
+        op.MakeArgument(a_device_buf_img.GetDeviceBuffer(),
-                                    std::array<const void*, 1>{e_device_buf_re1.GetDeviceBuffer()},
+                        b_device_buf_img.GetDeviceBuffer(),
-                                    e_device_buf_re.GetDeviceBuffer(),
+                        std::array<const void*, 1>{e_device_buf_re1.GetDeviceBuffer()},
-                                    a_ms_ks_lengths,
+                        e_device_buf_re.GetDeviceBuffer(),
-                                    a_ms_ks_strides,
+                        a_ms_ks_lengths,
-                                    b_ns_ks_lengths,
+                        a_ms_ks_strides,
-                                    b_ns_ks_strides,
+                        b_ns_ks_lengths,
-                                    std::array<std::vector<ck::index_t>, 1>{d_ms_ns_lengths},
+                        b_ns_ks_strides,
-                                    std::array<std::vector<ck::index_t>, 1>{d_ms_ns_strides},
+                        std::array<std::vector<ck::index_t>, 1>{d_ms_ns_lengths},
-                                    e_ms_ns_lengths,
+                        std::array<std::vector<ck::index_t>, 1>{d_ms_ns_strides},
-                                    e_ms_ns_strides,
+                        e_ms_ns_lengths,
-                                    a_element_op,
+                        e_ms_ns_strides,
-                                    b_element_op,
+                        a_element_op,
-                                    cde_element_op);
+                        b_element_op,
+                        cde_element_op);
    if(!op.IsSupportedArgument(argument_re2))
    {
@@ -253,7 +257,6 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
    float ave_time_re2 = invoker.Run(argument_re2, StreamConfig{nullptr, time_kernel});
    alpha = 1.f;
    beta  = 1.f;
@@ -261,22 +264,22 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
    b_element_op   = BElementOp{};
    cde_element_op = CDEElementOp{alpha, beta};
-    auto argument_img1 = op.MakeArgument(a_device_buf_re.GetDeviceBuffer(),
+    auto argument_img1 =
-                                b_device_buf_img.GetDeviceBuffer(),
+        op.MakeArgument(a_device_buf_re.GetDeviceBuffer(),
-                                std::array<const void*, 1>{d_device_buf_img.GetDeviceBuffer()},
+                        b_device_buf_img.GetDeviceBuffer(),
-                                e_device_buf_img1.GetDeviceBuffer(),
+                        std::array<const void*, 1>{d_device_buf_img.GetDeviceBuffer()},
-                                a_ms_ks_lengths,
+                        e_device_buf_img1.GetDeviceBuffer(),
-                                a_ms_ks_strides,
+                        a_ms_ks_lengths,
-                                b_ns_ks_lengths,
+                        a_ms_ks_strides,
-                                b_ns_ks_strides,
+                        b_ns_ks_lengths,
-                                std::array<std::vector<ck::index_t>, 1>{d_ms_ns_lengths},
+                        b_ns_ks_strides,
-                                std::array<std::vector<ck::index_t>, 1>{d_ms_ns_strides},
+                        std::array<std::vector<ck::index_t>, 1>{d_ms_ns_lengths},
-                                e_ms_ns_lengths,
+                        std::array<std::vector<ck::index_t>, 1>{d_ms_ns_strides},
-                                e_ms_ns_strides,
+                        e_ms_ns_lengths,
-                                a_element_op,
+                        e_ms_ns_strides,
-                                b_element_op,
+                        a_element_op,
-                                cde_element_op);
+                        b_element_op,
+                        cde_element_op);
    if(!op.IsSupportedArgument(argument_img1))
    {
@@ -290,23 +293,22 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
    alpha = 1.f;
    beta  = 1.f;
-    auto argument_img2 = op.MakeArgument(a_device_buf_img.GetDeviceBuffer(),
+    auto argument_img2 =
-                                b_device_buf_re.GetDeviceBuffer(),
+        op.MakeArgument(a_device_buf_img.GetDeviceBuffer(),
-                                std::array<const void*, 1>{e_device_buf_img1.GetDeviceBuffer()},
+                        b_device_buf_re.GetDeviceBuffer(),
-                                e_device_buf_img.GetDeviceBuffer(),
+                        std::array<const void*, 1>{e_device_buf_img1.GetDeviceBuffer()},
-                                a_ms_ks_lengths,
+                        e_device_buf_img.GetDeviceBuffer(),
-                                a_ms_ks_strides,
+                        a_ms_ks_lengths,
-                                b_ns_ks_lengths,
+                        a_ms_ks_strides,
-                                b_ns_ks_strides,
+                        b_ns_ks_lengths,
-                                std::array<std::vector<ck::index_t>, 1>{d_ms_ns_lengths},
+                        b_ns_ks_strides,
-                                std::array<std::vector<ck::index_t>, 1>{d_ms_ns_strides},
+                        std::array<std::vector<ck::index_t>, 1>{d_ms_ns_lengths},
-                                e_ms_ns_lengths,
+                        std::array<std::vector<ck::index_t>, 1>{d_ms_ns_strides},
-                                e_ms_ns_strides,
+                        e_ms_ns_lengths,
-                                a_element_op,
+                        e_ms_ns_strides,
-                                b_element_op,
+                        a_element_op,
-                                cde_element_op);
+                        b_element_op,
+                        cde_element_op);
    if(!op.IsSupportedArgument(argument_img2))
    {
@@ -317,7 +319,6 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
    float ave_time_img2 = invoker.Run(argument_img2, StreamConfig{nullptr, time_kernel});
    ck::index_t M =
        ck::accumulate_n<ck::index_t>(e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{});
@@ -331,9 +332,9 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
    std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
                            sizeof(DDataType) * M * N + sizeof(EDataType) * M * N * 2;
-    float ave_time = ave_time_img2 + ave_time_img1 + ave_time_re2 + ave_time_re1 ; 
+    float ave_time = ave_time_img2 + ave_time_img1 + ave_time_re2 + ave_time_re1;
-    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
    float gb_per_sec = num_btype / 1.E6 / ave_time;
    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
@@ -343,7 +344,7 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
    e_device_buf_img.FromDevice(e_ms_ns_device_result_img.mData.data());
    auto isRealOk = 0;
-    auto isImgOk = 0;
+    auto isImgOk  = 0;
    if(do_verification)
    {
@@ -366,17 +367,16 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
        auto ref_op      = ReferenceOpInstance{};
        auto ref_invoker = ref_op.MakeInvoker();
-        auto ref_argument_re =
+        auto ref_argument_re = ref_op.MakeArgument(
-            ref_op.MakeArgument(a_ms_ks_re, b_ns_ks_re, c_ms_ns_host_result_re, a_element_op, b_element_op);
+            a_ms_ks_re, b_ns_ks_re, c_ms_ns_host_result_re, a_element_op, b_element_op);
        ref_invoker.Run(ref_argument_re);
        alpha = 1.f;
        beta  = 1.f;
        cde_element_op = CDEElementOp{alpha, beta};
        for(size_t m0 = 0; m0 < e_ms_ns_host_result_re.mDesc.GetLengths()[0]; ++m0)
        {
            for(size_t m1 = 0; m1 < e_ms_ns_host_result_re.mDesc.GetLengths()[1]; ++m1)
@@ -395,11 +395,11 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
        alpha = 1.f;
        beta  = -1.f;
        cde_element_op = CDEElementOp{alpha, beta};
-        auto ref_argument_re1 =
+        auto ref_argument_re1 = ref_op.MakeArgument(
-            ref_op.MakeArgument(a_ms_ks_img, b_ns_ks_img, c_ms_ns_host_result_re1, a_element_op, b_element_op);
+            a_ms_ks_img, b_ns_ks_img, c_ms_ns_host_result_re1, a_element_op, b_element_op);
        ref_invoker.Run(ref_argument_re1);
@@ -419,23 +419,20 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
            }
        }
-        isRealOk =  ck::utils::check_err(e_ms_ns_device_result_re, e_ms_ns_host_result_re) ? 0 : 1;
+        isRealOk = ck::utils::check_err(e_ms_ns_device_result_re, e_ms_ns_host_result_re) ? 0 : 1;
        // Img Part Verification
        Tensor<CShuffleDataType> c_ms_ns_host_result_img(e_ms_ns_lengths, e_ms_ns_strides);
        Tensor<CShuffleDataType> c_ms_ns_host_result_img1(e_ms_ns_lengths, e_ms_ns_strides);
-        auto ref_argument_img =
+        auto ref_argument_img = ref_op.MakeArgument(
-            ref_op.MakeArgument(a_ms_ks_re, b_ns_ks_img, c_ms_ns_host_result_img, a_element_op, b_element_op);
+            a_ms_ks_re, b_ns_ks_img, c_ms_ns_host_result_img, a_element_op, b_element_op);
        ref_invoker.Run(ref_argument_img);
        alpha = 1.f;
        beta  = 1.f;
        cde_element_op = CDEElementOp{alpha, beta};
        for(size_t m0 = 0; m0 < e_ms_ns_host_result_img.mDesc.GetLengths()[0]; ++m0)
@@ -454,9 +451,9 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
            }
        }
-        auto ref_argument_img1 =
+        auto ref_argument_img1 = ref_op.MakeArgument(
-            ref_op.MakeArgument(a_ms_ks_img, b_ns_ks_re, c_ms_ns_host_result_img1, a_element_op, b_element_op);
+            a_ms_ks_img, b_ns_ks_re, c_ms_ns_host_result_img1, a_element_op, b_element_op);
        ref_invoker.Run(ref_argument_img1);
        for(size_t m0 = 0; m0 < e_ms_ns_host_result_img.mDesc.GetLengths()[0]; ++m0)
@@ -475,7 +472,7 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
            }
        }
-        isImgOk =  ck::utils::check_err(e_ms_ns_device_result_re, e_ms_ns_host_result_re) ? 0 : 1;
+        isImgOk = ck::utils::check_err(e_ms_ns_device_result_re, e_ms_ns_host_result_re) ? 0 : 1;
        return (isRealOk && isImgOk);
    }

--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -113,6 +113,10 @@ CK_DECLARE_ENV_VAR_BOOL(CK_LOGGING)
 #define CK_USE_AMD_MFMA_GFX940
 #endif
+#if defined(__gfx950__)
+#define CK_USE_AMD_MFMA_GFX950
+#endif
 // buffer load
 #define CK_USE_AMD_BUFFER_LOAD 1

--- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
@@ -37,7 +37,18 @@ enum struct MfmaInstr
    mfma_f32_32x32x16f8bf8,
    mfma_f32_16x16x32f8bf8,
    mfma_f32_32x32x16bf8f8,
-    mfma_f32_16x16x32bf8f8
+    mfma_f32_16x16x32bf8f8,
+    // mi355 new mfma instructions
+    mfma_f32_32x32x16f16,
+    mfma_f32_16x16x32f16,
+    mfma_f32_32x32x16bf16,
+    mfma_f32_16x16x32bf16,
+    mfma_i32_32x32x32i8,
+    mfma_i32_16x16x64i8,
+    mfma_f32_32x32x64f8f6f4,
+    mfma_f32_16x16x128f8f6f4,
+    mfma_scale_f32_32x32x64f8f6f4,
+    mfma_scale_f32_16x16x128f8f6f4
 };
 template <MfmaInstr instr>
@@ -198,6 +209,50 @@ struct mfma_type<MfmaInstr::mfma_f32_32x32x8f16>
    }
 };
+template <>
+struct mfma_type<MfmaInstr::mfma_f32_32x32x16f16>
+{
+    static constexpr index_t group_size          = 4;
+    static constexpr index_t num_groups_per_blk  = 4;
+    static constexpr index_t num_regs_per_blk    = 16;
+    static constexpr index_t num_threads_per_blk = 32;
+    static constexpr index_t wave_size           = 64;
+    static constexpr index_t num_input_blks      = 2;
+    static constexpr index_t num_output_blks     = 1;
+    static constexpr index_t m_per_blk           = 32;
+    static constexpr index_t n_per_blk           = 32;
+    static constexpr index_t k_per_blk           = 8;
+    static constexpr bool is_k_reduction         = true;
+    template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    {
+        intrin_mfma_f32_32x32x16f16<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
+    }
+};
+template <>
+struct mfma_type<MfmaInstr::mfma_f32_16x16x32f16>
+{
+    static constexpr index_t group_size          = 4;
+    static constexpr index_t num_groups_per_blk  = 1;
+    static constexpr index_t num_regs_per_blk    = 4;
+    static constexpr index_t num_threads_per_blk = 16;
+    static constexpr index_t wave_size           = 64;
+    static constexpr index_t num_input_blks      = 4;
+    static constexpr index_t num_output_blks     = 1;
+    static constexpr index_t m_per_blk           = 16;
+    static constexpr index_t n_per_blk           = 16;
+    static constexpr index_t k_per_blk           = 8;
+    static constexpr bool is_k_reduction         = true;
+    template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    {
+        intrin_mfma_f32_16x16x32f16<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
+    }
+};
 template <>
 struct mfma_type<MfmaInstr::mfma_f32_16x16x16f16>
 {
@@ -264,6 +319,28 @@ struct mfma_type<MfmaInstr::mfma_f32_4x4x4f16>
    }
 };
+template <>
+struct mfma_type<MfmaInstr::mfma_f32_32x32x16bf16>
+{
+    static constexpr index_t group_size          = 4;
+    static constexpr index_t num_groups_per_blk  = 4;
+    static constexpr index_t num_regs_per_blk    = 16;
+    static constexpr index_t num_threads_per_blk = 32;
+    static constexpr index_t wave_size           = 64;
+    static constexpr index_t num_input_blks      = 2;
+    static constexpr index_t num_output_blks     = 1;
+    static constexpr index_t m_per_blk           = 32;
+    static constexpr index_t n_per_blk           = 32;
+    static constexpr index_t k_per_blk           = 8;
+    static constexpr bool is_k_reduction         = true;
+    template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    {
+        intrin_mfma_f32_32x32x16bf16<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
+    }
+};
 template <>
 struct mfma_type<MfmaInstr::mfma_f32_32x32x8bf16_1k>
 {
@@ -286,6 +363,28 @@ struct mfma_type<MfmaInstr::mfma_f32_32x32x8bf16_1k>
    }
 };
+template <>
+struct mfma_type<MfmaInstr::mfma_f32_16x16x32bf16>
+{
+    static constexpr index_t group_size          = 4;
+    static constexpr index_t num_groups_per_blk  = 1;
+    static constexpr index_t num_regs_per_blk    = 4;
+    static constexpr index_t num_threads_per_blk = 16;
+    static constexpr index_t wave_size           = 64;
+    static constexpr index_t num_input_blks      = 4;
+    static constexpr index_t num_output_blks     = 1;
+    static constexpr index_t m_per_blk           = 16;
+    static constexpr index_t n_per_blk           = 16;
+    static constexpr index_t k_per_blk           = 8;
+    static constexpr bool is_k_reduction         = true;
+    template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    {
+        intrin_mfma_f32_16x16x32bf16<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
+    }
+};
 template <>
 struct mfma_type<MfmaInstr::mfma_f32_16x16x16bf16_1k>
 {
@@ -440,6 +539,50 @@ struct mfma_type<MfmaInstr::mfma_i32_16x16x32i8>
    }
 };
+template <>
+struct mfma_type<MfmaInstr::mfma_i32_32x32x32i8>
+{
+    static constexpr index_t group_size          = 4;
+    static constexpr index_t num_groups_per_blk  = 4;
+    static constexpr index_t num_regs_per_blk    = 16;
+    static constexpr index_t num_threads_per_blk = 32;
+    static constexpr index_t wave_size           = 64;
+    static constexpr index_t num_input_blks      = 2;
+    static constexpr index_t num_output_blks     = 1;
+    static constexpr index_t m_per_blk           = 32;
+    static constexpr index_t n_per_blk           = 32;
+    static constexpr index_t k_per_blk           = 16;
+    static constexpr bool is_k_reduction         = true;
+    template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    {
+        intrin_mfma_i32_32x32x32i8<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
+    }
+};
+template <>
+struct mfma_type<MfmaInstr::mfma_i32_16x16x64i8>
+{
+    static constexpr index_t group_size          = 4;
+    static constexpr index_t num_groups_per_blk  = 1;
+    static constexpr index_t num_regs_per_blk    = 4;
+    static constexpr index_t num_threads_per_blk = 16;
+    static constexpr index_t wave_size           = 64;
+    static constexpr index_t num_input_blks      = 4;
+    static constexpr index_t num_output_blks     = 1;
+    static constexpr index_t m_per_blk           = 16;
+    static constexpr index_t n_per_blk           = 16;
+    static constexpr index_t k_per_blk           = 16;
+    static constexpr bool is_k_reduction         = true;
+    template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    {
+        intrin_mfma_i32_16x16x64i8<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
+    }
+};
 template <>
 struct mfma_type<MfmaInstr::mfma_f64_16x16x4f64>
 {
@@ -638,6 +781,95 @@ struct mfma_type<MfmaInstr::mfma_f32_16x16x32bf8f8>
    }
 };
+// TODO: fix mfma...f8f6f4 instructions
+template <>
+struct mfma_type<MfmaInstr::mfma_f32_32x32x64f8f6f4>
+{
+    static constexpr index_t group_size          = 4;
+    static constexpr index_t num_groups_per_blk  = 4;
+    static constexpr index_t num_regs_per_blk    = 16;
+    static constexpr index_t num_threads_per_blk = 32;
+    static constexpr index_t wave_size           = 64;
+    static constexpr index_t num_input_blks      = 2;
+    static constexpr index_t num_output_blks     = 1;
+    static constexpr index_t m_per_blk           = 32;
+    static constexpr index_t n_per_blk           = 32;
+    static constexpr index_t k_per_blk           = 8;
+    static constexpr bool is_k_reduction         = true;
+    template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    {
+        intrin_mfma_f32_32x32x64f8f6f4<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
+    }
+};
+template <>
+struct mfma_type<MfmaInstr::mfma_f32_16x16x128f8f6f4>
+{
+    static constexpr index_t group_size          = 4;
+    static constexpr index_t num_groups_per_blk  = 1;
+    static constexpr index_t num_regs_per_blk    = 4;
+    static constexpr index_t num_threads_per_blk = 16;
+    static constexpr index_t wave_size           = 64;
+    static constexpr index_t num_input_blks      = 4;
+    static constexpr index_t num_output_blks     = 1;
+    static constexpr index_t m_per_blk           = 16;
+    static constexpr index_t n_per_blk           = 16;
+    static constexpr index_t k_per_blk           = 8;
+    static constexpr bool is_k_reduction         = true;
+    template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    {
+        intrin_mfma_f32_16x16x128f8f6f4<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
+    }
+};
+template <>
+struct mfma_type<MfmaInstr::mfma_scale_f32_32x32x64f8f6f4>
+{
+    static constexpr index_t group_size          = 4;
+    static constexpr index_t num_groups_per_blk  = 4;
+    static constexpr index_t num_regs_per_blk    = 16;
+    static constexpr index_t num_threads_per_blk = 32;
+    static constexpr index_t wave_size           = 64;
+    static constexpr index_t num_input_blks      = 2;
+    static constexpr index_t num_output_blks     = 1;
+    static constexpr index_t m_per_blk           = 32;
+    static constexpr index_t n_per_blk           = 32;
+    static constexpr index_t k_per_blk           = 8;
+    static constexpr bool is_k_reduction         = true;
+    template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    {
+        intrin_mfma_scale_f32_32x32x64f8f6f4<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
+    }
+};
+template <>
+struct mfma_type<MfmaInstr::mfma_scale_f32_16x16x128f8f6f4>
+{
+    static constexpr index_t group_size          = 4;
+    static constexpr index_t num_groups_per_blk  = 1;
+    static constexpr index_t num_regs_per_blk    = 4;
+    static constexpr index_t num_threads_per_blk = 16;
+    static constexpr index_t wave_size           = 64;
+    static constexpr index_t num_input_blks      = 4;
+    static constexpr index_t num_output_blks     = 1;
+    static constexpr index_t m_per_blk           = 16;
+    static constexpr index_t n_per_blk           = 16;
+    static constexpr index_t k_per_blk           = 8;
+    static constexpr bool is_k_reduction         = true;
+    template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    {
+        intrin_mfma_scale_f32_16x16x128f8f6f4<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
+    }
+};
 template <typename base_type,
          index_t MPerXdlops,
          index_t NPerXdlops,
@@ -713,13 +945,21 @@ struct MfmaSelector
    template <>
    constexpr auto GetMfma<half_t, 32, 32>()
    {
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        return MfmaInstr::mfma_f32_32x32x16f16;
+#else
        return MfmaInstr::mfma_f32_32x32x8f16;
+#endif
    }
    template <>
    constexpr auto GetMfma<half_t, 16, 16>()
    {
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        return MfmaInstr::mfma_f32_16x16x32f16;
+#else
        return MfmaInstr::mfma_f32_16x16x16f16;
+#endif
    }
    template <>
@@ -743,7 +983,9 @@ struct MfmaSelector
    template <>
    constexpr auto GetMfma<bhalf_t, 32, 32>()
    {
-#if defined(CK_USE_AMD_MFMA_BF16_1K_OP)
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        return MfmaInstr::mfma_f32_32x32x16bf16;
+#elif defined(CK_USE_AMD_MFMA_BF16_1K_OP)
        return MfmaInstr::mfma_f32_32x32x8bf16_1k;
 #else
        return MfmaInstr::mfma_f32_32x32x4bf16;
@@ -753,14 +995,27 @@ struct MfmaSelector
    template <>
    constexpr auto GetMfma<bhalf_t, 16, 16>()
    {
-#if defined(CK_USE_AMD_MFMA_BF16_1K_OP)
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        return MfmaInstr::mfma_f32_16x16x32bf16;
+#elif defined(CK_USE_AMD_MFMA_BF16_1K_OP)
        return MfmaInstr::mfma_f32_16x16x16bf16_1k;
 #else
        return MfmaInstr::mfma_f32_16x16x8bf16;
 #endif
    }
-#if defined(CK_USE_AMD_MFMA_GFX940)
+#if defined(CK_USE_AMD_MFMA_GFX950)
+    template <>
+    constexpr auto GetMfma<int8_t, 32, 32>()
+    {
+        return MfmaInstr::mfma_i32_32x32x32i8;
+    }
+    template <>
+    constexpr auto GetMfma<int8_t, 16, 16>()
+    {
+        return MfmaInstr::mfma_i32_16x16x64i8;
+    }
+#elif defined(CK_USE_AMD_MFMA_GFX940)
    template <>
    constexpr auto GetMfma<int8_t, 32, 32>()
    {

--- a/include/ck/utility/amd_xdlops.hpp
+++ b/include/ck/utility/amd_xdlops.hpp
@@ -134,6 +134,34 @@ struct intrin_mfma_f32_32x32x4f16<32, 64>
    }
 };
+template <index_t MPerWave, index_t NPerWave>
+struct intrin_mfma_f32_32x32x16f16;
+template <>
+struct intrin_mfma_f32_32x32x16f16<32, 32>
+{
+    template <class FloatC>
+    __device__ static void Run(const half8_t& reg_a, const half8_t& reg_b, FloatC& reg_c)
+    {
+        reg_c.template AsType<float16_t>()(Number<0>{}) = __builtin_amdgcn_mfma_f32_32x32x16_f16(
+            reg_a, reg_b, reg_c.template AsType<float16_t>()[Number<0>{}], 0, 0, 0);
+    }
+};
+template <index_t MPerWave, index_t NPerWave>
+struct intrin_mfma_f32_16x16x32f16;
+template <>
+struct intrin_mfma_f32_16x16x32f16<16, 16>
+{
+    template <class FloatC>
+    __device__ static void Run(const half8_t& reg_a, const half8_t& reg_b, FloatC& reg_c)
+    {
+        reg_c.template AsType<float4_t>()(Number<0>{}) = __builtin_amdgcn_mfma_f32_16x16x32_f16(
+            reg_a, reg_b, reg_c.template AsType<float4_t>()[Number<0>{}], 0, 0, 0);
+    }
+};
 template <index_t MPerWave, index_t NPerWave>
 struct intrin_mfma_f32_32x32x8f16;
@@ -204,6 +232,34 @@ struct intrin_mfma_f32_4x4x4f16<8, 64>
 };
 // bfp16
+template <index_t MPerWave, index_t NPerWave>
+struct intrin_mfma_f32_32x32x16bf16;
+template <>
+struct intrin_mfma_f32_32x32x16bf16<32, 32>
+{
+    template <class FloatC>
+    __device__ static void Run(const bhalf8_t& reg_a, const bhalf8_t& reg_b, FloatC& reg_c)
+    {
+        reg_c.template AsType<float16_t>()(Number<0>{}) = __builtin_amdgcn_mfma_f32_32x32x16_bf16(
+            reg_a, reg_b, reg_c.template AsType<float16_t>()[Number<0>{}], 0, 0, 0);
+    }
+};
+template <index_t MPerWave, index_t NPerWave>
+struct intrin_mfma_f32_16x16x32bf16;
+template <>
+struct intrin_mfma_f32_16x16x32bf16<16, 16>
+{
+    template <class FloatC>
+    __device__ static void Run(const bhalf8_t& reg_a, const bhalf8_t& reg_b, FloatC& reg_c)
+    {
+        reg_c.template AsType<float4_t>()(Number<0>{}) = __builtin_amdgcn_mfma_f32_16x16x32_bf16(
+            reg_a, reg_b, reg_c.template AsType<float4_t>()[Number<0>{}], 0, 0, 0);
+    }
+};
 template <index_t MPerWave, index_t NPerWave>
 struct intrin_mfma_f32_32x32x8bf16_1k;
@@ -298,6 +354,34 @@ struct intrin_mfma_i32_16x16x16i8<16, 16>
    }
 };
+template <index_t MPerWave, index_t NPerWave>
+struct intrin_mfma_i32_32x32x32i8;
+template <>
+struct intrin_mfma_i32_32x32x32i8<32, 32>
+{
+    template <class FloatC>
+    __device__ static void Run(const int8x16_t& reg_a, const int8x16_t& reg_b, FloatC& reg_c)
+    {
+        reg_c.template AsType<int32x16_t>()(Number<0>{}) = __builtin_amdgcn_mfma_i32_32x32x32_i8(
+            reg_a, reg_b, reg_c.template AsType<int32x16_t>()[Number<0>{}], 0, 0, 0);
+    }
+};
+template <index_t MPerWave, index_t NPerWave>
+struct intrin_mfma_i32_16x16x64i8;
+template <>
+struct intrin_mfma_i32_16x16x64i8<16, 16>
+{
+    template <class FloatC>
+    __device__ static void Run(const int8x16_t& reg_a, const int8x16_t& reg_b, FloatC& reg_c)
+    {
+        reg_c.template AsType<int32x4_t>()(Number<0>{}) = __builtin_amdgcn_mfma_i32_16x16x64_i8(
+            reg_a, reg_b, reg_c.template AsType<int32x4_t>()[Number<0>{}], 0, 0, 0);
+    }
+};
 template <index_t MPerWave, index_t NPerWave>
 struct intrin_mfma_i32_32x32x16i8;
@@ -327,12 +411,12 @@ struct intrin_mfma_i32_16x16x32i8<16, 16>
    __device__ static void Run(const int8x8_t& reg_a, const int8x8_t& reg_b, FloatC& reg_c)
    {
        reg_c.template AsType<int32x4_t>()(Number<0>{}) =
-            __builtin_amdgcn_mfma_i32_16x16x32i8(bit_cast<int64_t>(reg_a),
+            __builtin_amdgcn_mfma_i32_16x16x32_i8(bit_cast<int64_t>(reg_a),
-                                                 bit_cast<int64_t>(reg_b),
+                                                  bit_cast<int64_t>(reg_b),
-                                                 reg_c.template AsType<int32x4_t>()[Number<0>{}],
+                                                  reg_c.template AsType<int32x4_t>()[Number<0>{}],
-                                                 0,
+                                                  0,
-                                                 0,
+                                                  0,
-                                                 0);
+                                                  0);
    }
 };
@@ -356,6 +440,92 @@ struct intrin_mfma_f64_16x16x4f64<16, 16>
    }
 };
+template <index_t MPerWave, index_t NPerWave>
+struct intrin_mfma_f32_32x32x64f8f6f4;
+template <>
+struct intrin_mfma_f32_32x32x64f8f6f4<32, 32>
+{
+    template <class FloatC>
+    __device__ static void Run(const f8x8_t& reg_a, const f8x8_t& reg_b, FloatC& reg_c)
+    {
+        //#if defined(__gfx950__)
+        reg_c.template AsType<float16_t>()(Number<0>{}) = __builtin_amdgcn_mfma_f32_32x32x64_f8f6f4(
+            bit_cast<long>(reg_a),
+            bit_cast<long>(reg_b),
+            reg_c.template AsType<float16_t>()[Number<0>{}],
+            0,
+            0,
+            0);
+        //#endif
+    }
+};
+template <index_t MPerWave, index_t NPerWave>
+struct intrin_mfma_scale_f32_32x32x64f8f6f4;
+template <>
+struct intrin_mfma_scale_f32_32x32x64f8f6f4<32, 32>
+{
+    template <class FloatC>
+    __device__ static void Run(const f8x8_t& reg_a, const f8x8_t& reg_b, FloatC& reg_c)
+    {
+        //#if defined(__gfx950__)
+        reg_c.template AsType<float16_t>()(Number<0>{}) =
+            __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(
+                bit_cast<long>(reg_a),
+                bit_cast<long>(reg_b),
+                reg_c.template AsType<float16_t>()[Number<0>{}],
+                0,
+                0,
+                0);
+        //#endif
+    }
+};
+template <index_t MPerWave, index_t NPerWave>
+struct intrin_mfma_scale_f32_16x16x128f8f6f4;
+template <>
+struct intrin_mfma_scale_f32_16x16x128f8f6f4<16, 16>
+{
+    template <class FloatC>
+    __device__ static void Run(const f8x8_t& reg_a, const f8x8_t& reg_b, FloatC& reg_c)
+    {
+        //#if defined(__gfx950__)
+        reg_c.template AsType<float4_t>()(Number<0>{}) =
+            __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(
+                bit_cast<long>(reg_a),
+                bit_cast<long>(reg_b),
+                reg_c.template AsType<float4_t>()[Number<0>{}],
+                0,
+                0,
+                0);
+        //#endif
+    }
+};
+template <index_t MPerWave, index_t NPerWave>
+struct intrin_mfma_f32_16x16x128f8f6f4;
+template <>
+struct intrin_mfma_f32_16x16x128f8f6f4<16, 16>
+{
+    template <class FloatC>
+    __device__ static void Run(const f8x8_t& reg_a, const f8x8_t& reg_b, FloatC& reg_c)
+    {
+        //#if defined(__gfx950__)
+        reg_c.template AsType<float4_t>()(Number<0>{}) = __builtin_amdgcn_mfma_f32_16x16x128_f8f6f4(
+            bit_cast<long>(reg_a),
+            bit_cast<long>(reg_b),
+            reg_c.template AsType<float4_t>()[Number<0>{}],
+            0,
+            0,
+            0);
+        //#endif
+    }
+};
 template <index_t MPerWave, index_t NPerWave>
 struct intrin_mfma_f32_32x32x16f8f8;